public inbox for libc-alpha@sourceware.org
* [PATCH v1 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json
@ 2022-11-03  8:53 Noah Goldstein
  2022-11-03  8:53 ` [PATCH v1 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
                   ` (5 more replies)
  0 siblings, 6 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-03  8:53 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

JSON output is easier to parse, and most of the other benchmarks
already emit JSON.
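
A condensed sketch of the json-lib call pattern the converted
benchmarks now follow (json-lib.h is the benchtests JSON helper the
other benchmarks already use; the attribute names match the diffs
below, but the literal values here are only placeholders):

  #include <stdio.h>
  #include "json-lib.h"

  static void
  emit_skeleton (void)
  {
    json_ctx_t json_ctx;

    json_init (&json_ctx, 0, stdout);
    json_document_begin (&json_ctx);
    json_attr_string (&json_ctx, "timing_type", "hp_timing");

    json_attr_object_begin (&json_ctx, "functions");
    json_attr_object_begin (&json_ctx, "strcat");
    json_attr_string (&json_ctx, "bench-variant", "");

    json_array_begin (&json_ctx, "ifuncs");
    json_element_string (&json_ctx, "__strcat_evex");
    json_array_end (&json_ctx);

    json_array_begin (&json_ctx, "results");
    json_element_object_begin (&json_ctx);   /* One object per do_test call.  */
    json_attr_uint (&json_ctx, "len1", 16);
    json_attr_uint (&json_ctx, "len2", 16);
    json_array_begin (&json_ctx, "timings");
    json_element_double (&json_ctx, 42.0);   /* One mean time per ifunc.  */
    json_array_end (&json_ctx);
    json_element_object_end (&json_ctx);
    json_array_end (&json_ctx);

    json_attr_object_end (&json_ctx);
    json_attr_object_end (&json_ctx);
    json_document_end (&json_ctx);
  }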
---
 benchtests/bench-strcat.c  | 106 ++++++++++++++++++++++++++-------
 benchtests/bench-strcpy.c  | 118 ++++++++++++++++++++++++-------------
 benchtests/bench-strncat.c |  94 +++++++++++++++++++++--------
 benchtests/bench-strncpy.c |  94 +++++++++++++++++++++--------
 4 files changed, 297 insertions(+), 115 deletions(-)

diff --git a/benchtests/bench-strcat.c b/benchtests/bench-strcat.c
index 749318e37e..890b34b4c1 100644
--- a/benchtests/bench-strcat.c
+++ b/benchtests/bench-strcat.c
@@ -35,6 +35,7 @@
 # define SMALL_CHAR 1273
 #endif /* WIDE */
 
+#include "json-lib.h"
 
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
 
@@ -49,7 +50,7 @@ IMPL (STRCAT, 1)
 IMPL (generic_strcat, 0)
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src)
 {
   size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS_LARGE;
   timing_t start, stop, cur;
@@ -64,7 +65,8 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
 
   if (STRCMP (dst + k, src) != 0)
     {
-      error (0, 0, "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
+      error (0, 0,
+	     "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
 	     impl->name, dst, src);
       ret = 1;
       return;
@@ -80,11 +82,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
+	 size_t len2, int max_char)
 {
   size_t i;
   CHAR *s1, *s2;
@@ -107,53 +110,112 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
   for (i = 0; i < len2; i++)
     s2[i] = 32 + 23 * i % (max_char - 32);
 
-  printf ("Length %4zd/%4zd, alignment %2zd/%2zd:", len1, len2, align1, align2);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "len1", len1);
+  json_attr_uint (json_ctx, "len2", len2);
+  json_attr_uint (json_ctx, "max_char", max_char);
+
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
     {
       s2[len2] = '\0';
-      do_one_test (impl, s2, s1);
+      do_one_test (json_ctx, impl, s2, s1);
     }
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
+  json_ctx_t json_ctx;
   size_t i;
 
   test_init ();
 
-  printf ("%28s", "");
+  test_init ();
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
   for (i = 0; i < 16; ++i)
     {
-      do_test (0, 0, i, i, SMALL_CHAR);
-      do_test (0, 0, i, i, BIG_CHAR);
-      do_test (0, i, i, i, SMALL_CHAR);
-      do_test (i, 0, i, i, BIG_CHAR);
+      do_test (&json_ctx, 0, 0, i, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, i, BIG_CHAR);
+      do_test (&json_ctx, 0, i, i, i, SMALL_CHAR);
+      do_test (&json_ctx, i, 0, i, i, BIG_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 0, 8 << i, 8 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
-      do_test (0, 0, 8 << i, 2 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 8 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 8 << i, 2 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (i, 2 * i, 8 << i, 1, SMALL_CHAR);
-      do_test (2 * i, i, 8 << i, 1, BIG_CHAR);
-      do_test (i, i, 8 << i, 10, SMALL_CHAR);
-      do_test (i, i, 8 << i, 10, BIG_CHAR);
+      do_test (&json_ctx, i, 2 * i, 8 << i, 1, SMALL_CHAR);
+      do_test (&json_ctx, 2 * i, i, 8 << i, 1, BIG_CHAR);
+      do_test (&json_ctx, i, i, 8 << i, 10, SMALL_CHAR);
+      do_test (&json_ctx, i, i, 8 << i, 10, BIG_CHAR);
+    }
+
+  for (i = 32; i < 256; i += 32)
+    {
+      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
     }
 
+  for (; i < 512; i += 64)
+    {
+      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
+    }
+
+  for (; i < 1024; i += 128)
+    {
+      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
+    }
+
+  for (; i < 2048; i += 256)
+    {
+      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
 
diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
index 29deb8a46a..af8673e137 100644
--- a/benchtests/bench-strcpy.c
+++ b/benchtests/bench-strcpy.c
@@ -26,16 +26,18 @@
 # define SMALL_CHAR 127
 #endif
 
+#include "json-lib.h"
+
 #ifndef STRCPY_RESULT
 # define STRCPY_RESULT(dst, len) dst
 # define TEST_MAIN
 # ifndef WIDE
-#  define TEST_NAME "strcpy"
+#   define TEST_NAME "strcpy"
 # else
-#  define TEST_NAME "wcscpy"
-#  define generic_strcpy generic_wcscpy
+#   define TEST_NAME "wcscpy"
+#   define generic_strcpy generic_wcscpy
 # endif
-#include "bench-string.h"
+# include "bench-string.h"
 
 CHAR *
 generic_strcpy (CHAR *dst, const CHAR *src)
@@ -51,8 +53,8 @@ IMPL (generic_strcpy, 0)
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
-	     size_t len __attribute__((unused)))
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
+	     size_t len __attribute__ ((unused)))
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
@@ -77,23 +79,24 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
-	  CALL (impl, dst, src);
+      CALL (impl, dst, src);
     }
   TIMING_NOW (stop);
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t len, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
+	 int max_char)
 {
   size_t i;
   CHAR *s1, *s2;
-/* For wcscpy: align1 and align2 here mean alignment not in bytes,
-   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
-   len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
+  /* For wcscpy: align1 and align2 here mean alignment not in bytes,
+     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
+     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
   align1 &= 7;
   if ((align1 + len) * sizeof (CHAR) >= page_size)
     return;
@@ -109,65 +112,96 @@ do_test (size_t align1, size_t align2, size_t len, int max_char)
     s1[i] = 32 + 23 * i % (max_char - 32);
   s1[len] = 0;
 
-  printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len,
-	  align1 * sizeof (CHAR), align2 * sizeof (CHAR));
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "len", len);
+  json_attr_uint (json_ctx, "max_char", max_char);
+
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s2, s1, len);
+    do_one_test (json_ctx, impl, s2, s1, len);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
+  json_ctx_t json_ctx;
   size_t i;
 
   test_init ();
 
-  printf ("%23s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
   for (i = 0; i < 16; ++i)
     {
-      do_test (0, 0, i, SMALL_CHAR);
-      do_test (0, 0, i, BIG_CHAR);
-      do_test (0, i, i, SMALL_CHAR);
-      do_test (i, 0, i, BIG_CHAR);
+      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, BIG_CHAR);
+      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
+      do_test (&json_ctx, i, 0, i, BIG_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 0, 8 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, SMALL_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (i, 2 * i, 8 << i, SMALL_CHAR);
-      do_test (2 * i, i, 8 << i, BIG_CHAR);
-      do_test (i, i, 8 << i, SMALL_CHAR);
-      do_test (i, i, 8 << i, BIG_CHAR);
+      do_test (&json_ctx, i, 2 * i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 2 * i, i, 8 << i, BIG_CHAR);
+      do_test (&json_ctx, i, i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, i, i, 8 << i, BIG_CHAR);
     }
 
-  for (i = 16; i <= 512; i+=4)
+  for (i = 16; i <= 512; i += 4)
     {
-      do_test (0, 4, i, SMALL_CHAR);
-      do_test (4, 0, i, BIG_CHAR);
-      do_test (4, 4, i, SMALL_CHAR);
-      do_test (2, 2, i, BIG_CHAR);
-      do_test (2, 6, i, SMALL_CHAR);
-      do_test (6, 2, i, BIG_CHAR);
-      do_test (1, 7, i, SMALL_CHAR);
-      do_test (7, 1, i, BIG_CHAR);
-      do_test (3, 4, i, SMALL_CHAR);
-      do_test (4, 3, i, BIG_CHAR);
-      do_test (5, 7, i, SMALL_CHAR);
-      do_test (7, 5, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 4, i, SMALL_CHAR);
+      do_test (&json_ctx, 4, 0, i, BIG_CHAR);
+      do_test (&json_ctx, 4, 4, i, SMALL_CHAR);
+      do_test (&json_ctx, 2, 2, i, BIG_CHAR);
+      do_test (&json_ctx, 2, 6, i, SMALL_CHAR);
+      do_test (&json_ctx, 6, 2, i, BIG_CHAR);
+      do_test (&json_ctx, 1, 7, i, SMALL_CHAR);
+      do_test (&json_ctx, 7, 1, i, BIG_CHAR);
+      do_test (&json_ctx, 3, 4, i, SMALL_CHAR);
+      do_test (&json_ctx, 4, 3, i, BIG_CHAR);
+      do_test (&json_ctx, 5, 7, i, SMALL_CHAR);
+      do_test (&json_ctx, 7, 5, i, SMALL_CHAR);
     }
 
+  for (i = 1; i < 2048; i += i)
+    {
+      do_test (&json_ctx, 1, 0, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, SMALL_CHAR);
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
 
diff --git a/benchtests/bench-strncat.c b/benchtests/bench-strncat.c
index b148c55279..5ccc09a4f8 100644
--- a/benchtests/bench-strncat.c
+++ b/benchtests/bench-strncat.c
@@ -33,6 +33,8 @@
 # define SMALL_CHAR 1273
 #endif /* WIDE */
 
+#include "json-lib.h"
+
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
 
 CHAR *
@@ -49,7 +51,8 @@ IMPL (STRNCAT, 2)
 IMPL (generic_strncat, 0)
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
+	     size_t n)
 {
   size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS8;
   timing_t start, stop, cur;
@@ -65,8 +68,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
   size_t len = STRLEN (src);
   if (MEMCMP (dst + k, src, len + 1 > n ? n : len + 1) != 0)
     {
-      error (0, 0, "Incorrect concatenation in function %s",
-	     impl->name);
+      error (0, 0, "Incorrect concatenation in function %s", impl->name);
       ret = 1;
       return;
     }
@@ -88,12 +90,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t len1, size_t len2,
-	 size_t n, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
+	 size_t len2, size_t n, int max_char)
 {
   size_t i;
   CHAR *s1, *s2;
@@ -118,53 +120,93 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
   for (i = 0; i < len2; i++)
     s2[i] = 32 + 23 * i % (max_char - 32);
 
-  printf ("Length %4zd/%4zd, alignment %2zd/%2zd, N %4zd:",
-	  len1, len2, align1, align2, n);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "len1", len1);
+  json_attr_uint (json_ctx, "len2", len2);
+  json_attr_uint (json_ctx, "n", n);
+  json_attr_uint (json_ctx, "max_char", max_char);
+
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
     {
       s2[len2] = '\0';
-      do_one_test (impl, s2, s1, n);
+      do_one_test (json_ctx, impl, s2, s1, n);
     }
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 main (void)
 {
+  json_ctx_t json_ctx;
   size_t i, n;
 
   test_init ();
 
-  printf ("%28s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
-  for (n = 2; n <= 2048; n*=4)
+  for (n = 2; n <= 2048; n *= 4)
     {
-      do_test (0, 2, 2, 2, n, SMALL_CHAR);
-      do_test (0, 0, 4, 4, n, SMALL_CHAR);
-      do_test (4, 0, 4, 4, n, BIG_CHAR);
-      do_test (0, 0, 8, 8, n, SMALL_CHAR);
-      do_test (0, 8, 8, 8, n, SMALL_CHAR);
+      do_test (&json_ctx, 0, 2, 2, 2, n, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 4, 4, n, SMALL_CHAR);
+      do_test (&json_ctx, 4, 0, 4, 4, n, BIG_CHAR);
+      do_test (&json_ctx, 0, 0, 8, 8, n, SMALL_CHAR);
+      do_test (&json_ctx, 0, 8, 8, 8, n, SMALL_CHAR);
 
       for (i = 1; i < 8; ++i)
 	{
-	  do_test (0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
-	  do_test (8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
-	  do_test (0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
-	  do_test (8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
 	}
 
       for (i = 1; i < 8; ++i)
 	{
-	  do_test (i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
-	  do_test (2 * i, i, 8 << i, 1, n, BIG_CHAR);
-	  do_test (i, i, 8 << i, 10, n, SMALL_CHAR);
+	  do_test (&json_ctx, i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
+	  do_test (&json_ctx, 2 * i, i, 8 << i, 1, n, BIG_CHAR);
+	  do_test (&json_ctx, i, i, 8 << i, 10, n, SMALL_CHAR);
 	}
     }
 
+  for (i = 128; i < 2048; i += i)
+    {
+      for (n = i - 64; n <= i + 64; n += 32)
+	{
+	  do_test (&json_ctx, 1, 0, i, i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 0, i, i, i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, i, i, n, SMALL_CHAR);
+	  do_test (&json_ctx, i, i, i, i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 1, 0, i, n, i, SMALL_CHAR);
+	  do_test (&json_ctx, 0, i, i, n, i, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, i, n, i, SMALL_CHAR);
+	  do_test (&json_ctx, i, i, i, n, i, SMALL_CHAR);
+	}
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
diff --git a/benchtests/bench-strncpy.c b/benchtests/bench-strncpy.c
index 8207d99f4d..f621cbfe09 100644
--- a/benchtests/bench-strncpy.c
+++ b/benchtests/bench-strncpy.c
@@ -24,6 +24,8 @@
 # define SMALL_CHAR 127
 #endif /* !WIDE */
 
+#include "json-lib.h"
+
 #ifndef STRNCPY_RESULT
 # define STRNCPY_RESULT(dst, len, n) dst
 # define TEST_MAIN
@@ -52,7 +54,8 @@ IMPL (generic_strncpy, 0)
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
+	     size_t len, size_t n)
 {
   size_t i, iters = INNER_LOOP_ITERS_LARGE * (4 / CHARBYTES);
   timing_t start, stop, cur;
@@ -77,7 +80,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
       size_t i;
 
       for (i = len; i < n; ++i)
-	if (dst [i] != '\0')
+	if (dst[i] != '\0')
 	  {
 	    error (0, 0, "Wrong result in function %s", impl->name);
 	    ret = 1;
@@ -94,17 +97,18 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
+	 size_t n, int max_char)
 {
   size_t i;
   CHAR *s1, *s2;
 
-/* For wcsncpy: align1 and align2 here mean alignment not in bytes,
-   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
+  /* For wcsncpy: align1 and align2 here mean alignment not in bytes,
+     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
   align1 &= 7;
   if ((align1 + len) * sizeof (CHAR) >= page_size)
     return;
@@ -123,46 +127,86 @@ do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
        ++i)
     s1[i] = 32 + 32 * i % (max_char - 32);
 
-  printf ("Length %4zd, n %4zd, alignment %2zd/%2zd:", len, n, align1, align2);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "len", len);
+  json_attr_uint (json_ctx, "n", n);
+  json_attr_uint (json_ctx, "max_char", max_char);
+
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s2, s1, len, n);
+    do_one_test (json_ctx, impl, s2, s1, len, n);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 static int
 test_main (void)
 {
-  size_t i;
+  json_ctx_t json_ctx;
+  size_t i, j;
 
   test_init ();
 
-  printf ("%28s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (i, i, 16, 16, SMALL_CHAR);
-      do_test (i, i, 16, 16, BIG_CHAR);
-      do_test (i, 2 * i, 16, 16, SMALL_CHAR);
-      do_test (2 * i, i, 16, 16, BIG_CHAR);
-      do_test (8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
-      do_test (2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
-      do_test (2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
+      do_test (&json_ctx, i, i, 16, 16, SMALL_CHAR);
+      do_test (&json_ctx, i, i, 16, 16, BIG_CHAR);
+      do_test (&json_ctx, i, 2 * i, 16, 16, SMALL_CHAR);
+      do_test (&json_ctx, 2 * i, i, 16, 16, BIG_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
+      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
+      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 0, 4 << i, 8 << i, SMALL_CHAR);
-      do_test (0, 0, 16 << i, 8 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 4 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 16 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
     }
 
+  for (i = 128; i < 2048; i += i)
+    {
+      for (j = i - 64; j <= i + 64; j += 32)
+	{
+	  do_test (&json_ctx, 1, 0, i, j, SMALL_CHAR);
+	  do_test (&json_ctx, 0, i, i, j, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, i, j, SMALL_CHAR);
+	  do_test (&json_ctx, i, i, i, j, SMALL_CHAR);
+	  do_test (&json_ctx, 1, 0, j, i, SMALL_CHAR);
+	  do_test (&json_ctx, 0, i, j, i, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, j, i, SMALL_CHAR);
+	  do_test (&json_ctx, i, i, j, i, SMALL_CHAR);
+	}
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
 
-- 
2.34.1



* [PATCH v1 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-03  8:53 [PATCH v1 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json Noah Goldstein
@ 2022-11-03  8:53 ` Noah Goldstein
  2022-11-03  8:55   ` Noah Goldstein
                     ` (2 more replies)
  2022-11-03  8:53 ` [PATCH v1 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
                   ` (4 subsequent siblings)
  5 siblings, 3 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-03  8:53 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
    1. Use more overlapping stores to avoid branches.
    2. Reduce how unrolled the aligning copies are (this is more of a
       code-size save; it's a negative for some sizes in terms of
       perf).
    3. Improve the loop a bit (similar to what we do in strlen with
       2x vpminu + kortest instead of 3x vpminu + kmov + test).
    4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
       number that are taken.

Performance Changes:

    Times are from N = 10 runs of the benchmark suite and are
    reported as the geometric mean of all ratios of
    New Implementation / Old Implementation (a minimal sketch of the
    aggregation follows the numbers below).

    stpcpy-evex      -> 0.922
    strcat-evex      -> 0.985
    strcpy-evex      -> 0.880

    strncpy-evex     -> 0.831
    stpncpy-evex     -> 0.780

    strncat-evex     -> 0.978
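
    A minimal sketch of that aggregation (not the actual comparison
    script; it simply computes exp (mean (log (new / old))) over the
    per-benchmark timings):

        #include <math.h>
        #include <stddef.h>

        static double
        geomean_ratio (const double *new_times, const double *old_times,
                       size_t n)
        {
          double log_sum = 0.0;
          for (size_t i = 0; i < n; i++)
            log_sum += log (new_times[i] / old_times[i]);
          return exp (log_sum / n);
        }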

Code Size Changes:
    function         -> Bytes New / Bytes Old -> Ratio

    strcat-evex      -> 819  / 1874 -> 0.437
    strcpy-evex      -> 700  / 1074 -> 0.652
    stpcpy-evex      -> 735  / 1094 -> 0.672

    strncpy-evex     -> 1397 / 2611 -> 0.535
    stpncpy-evex     -> 1489 / 2691 -> 0.553

    strncat-evex     -> 1166 / 2832 -> 0.412

Notes:
    Because of the significant differences between the
    implementations, they are split into three files.

    strcpy-evex.S    -> strcpy, stpcpy, strcat
    strncpy-evex.S   -> strncpy
    strncat-evex.S   -> strncat

    I couldn't find a way to merge them without making the ifdefs
    incredibly difficult to follow.

    All implementations can be made evex512 by including
    "x86-evex512-vecs.h" at the top.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---

Results attached.
 sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
 sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
 sysdeps/x86_64/multiarch/strcat-strlen-evex.S |   88 ++
 sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
 sysdeps/x86_64/multiarch/strncat-evex.S       |  517 ++++++-
 sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
 .../multiarch/strncpy-or-cat-overflow-def.h   |   65 +
 7 files changed, 2070 insertions(+), 1173 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h

diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
index 99ea76a372..3693491baa 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
@@ -3,6 +3,5 @@
 #endif
 
 #define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY	STPNCPY
-#include "strcpy-evex.S"
+#define STRNCPY	STPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
index 0e2df947e9..b4207b7889 100644
--- a/sysdeps/x86_64/multiarch/strcat-evex.S
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -1,286 +1,7 @@
-/* strcat with 256-bit EVEX instructions.
-   Copyright (C) 2021-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-#  define STRCAT  __strcat_evex
-# endif
-
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
-/* zero register */
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMM0		ymm17
-# define YMM1		ymm18
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE	32
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRCAT)
-	mov	%rdi, %r9
-# ifdef USE_AS_STRNCAT
-	mov	%rdx, %r8
-# endif
-
-	xor	%eax, %eax
-	mov	%edi, %ecx
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-	cmp	$(VEC_SIZE * 3), %ecx
-	ja	L(fourth_vector_boundary)
-	vpcmpb	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_first_vector)
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	jmp	L(align_vec_size_start)
-L(fourth_vector_boundary):
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	vpcmpb	$0, (%rax), %YMMZERO, %k0
-	mov	$-1, %r10d
-	sub	%rax, %rcx
-	shl	%cl, %r10d
-	kmovd	%k0, %edx
-	and	%r10d, %edx
-	jnz	L(exit)
-
-L(align_vec_size_start):
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 4), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	kmovd	%k4, %edx
-	add	$(VEC_SIZE * 4), %rax
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 4), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 5), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	add	$VEC_SIZE, %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	add	$VEC_SIZE, %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
-	add	$VEC_SIZE, %rax
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	add	$VEC_SIZE, %rax
-
-	.p2align 4
-L(align_four_vec_loop):
-	VMOVA	(%rax), %YMM0
-	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
-	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
-	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
-	vpminub	%YMM0, %YMM1, %YMM0
-	/* If K0 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM0, %YMMZERO, %k0
-	add	$(VEC_SIZE * 4), %rax
-	ktestd	%k0, %k0
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
-	sub	$(VEC_SIZE * 5), %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit):
-	sub	%rdi, %rax
-L(exit_null_on_first_vector):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_second_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$VEC_SIZE, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_third_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 2), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fourth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 3), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fifth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-
-	.p2align 4
-L(StartStrcpyPart):
-	lea	(%r9, %rax), %rdi
-	mov	%rsi, %rcx
-	mov	%r9, %rax      /* save result */
-
-# ifdef USE_AS_STRNCAT
-	test	%r8, %r8
-	jz	L(ExitZero)
-#  define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-evex.S"
+#ifndef STRCAT
+# define STRCAT	__strcat_evex
 #endif
+
+#define USE_AS_STRCAT
+#define STRCPY	STRCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
new file mode 100644
index 0000000000..9813d38613
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
@@ -0,0 +1,88 @@
+    /* Simple strlen implementation that ends at L(strcat_strlen_done).  */
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
+	/* Pared down strlen implementation.  We never commit to 4x
+	   loop as we are expecting a relatively short string and want
+	   to minimize code size.  */
+	movq	%rdi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+#ifdef USE_AS_WCSCPY
+	subl	%r8d, %edi
+	shrl	$2, %edi
+#endif
+	shrx	%VRDI, %VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+	movq	%rax, %rdi
+#endif
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v0)
+
+
+	VPCMPEQ	VEC_SIZE(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	leaq	(VEC_SIZE)(%r8), %rdi
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v1)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v2)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v3)
+
+	andq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4,, 8
+L(strlen_loop_4x_vec):
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(0)
+	VPMIN	(VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(2)
+	VPMIN	(VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	VPTESTN	%VMM(3), %VMM(3), %k3
+	subq	$(VEC_SIZE * -4), %rdi
+	KORTEST	%k1, %k3
+	jz	L(strlen_loop_4x_vec)
+
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v0)
+
+	KMOV	%k1, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v1)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v2)
+
+	KMOV	%k3, %VRCX
+L(bsf_and_done_v3):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v2):
+	bsf	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
+	jmp	L(strcat_strlen_done)
+
+	.p2align 4,, 4
+L(bsf_and_done_v1):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v0):
+	bsf	%VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#else
+	addq	%rcx, %rdi
+#endif
+L(strcat_strlen_done):
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
index 82e45ac675..1ba0195ed2 100644
--- a/sysdeps/x86_64/multiarch/strcpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -1,4 +1,4 @@
-/* strcpy with 256-bit EVEX instructions.
+/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
    Copyright (C) 2021-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,990 +17,526 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <isa-level.h>
-
 #if ISA_SHOULD_BUILD (4)
 
 
-# ifndef USE_AS_STRCAT
-#  include <sysdep.h>
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+	/* Use movsb in page cross case to save code size.  */
+# define USE_MOVSB_IN_PAGE_CROSS	1
 
-#  ifndef STRCPY
-#   define STRCPY  __strcpy_evex
-#  endif
+# include <sysdep.h>
 
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
-/* Number of bytes in a vector register */
-# ifndef VEC_SIZE
-#  define VEC_SIZE	32
+# ifndef STRCPY
+#  define STRCPY	__strcpy_evex
 # endif
 
-# define XMM2		xmm18
-# define XMM3		xmm19
 
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-# define YMM7		ymm23
+# ifdef USE_AS_WCSCPY
+#  define VMOVU_MASK	vmovdqu32
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
 
-# ifndef USE_AS_STRCAT
+#  define REP_MOVS	rep movsd
 
-/* zero register */
-#  define XMMZERO	xmm16
-#  define YMMZERO	ymm16
-#  define YMM1		ymm17
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRCPY)
-#  ifdef USE_AS_STRNCPY
-	mov	%RDX_LP, %R8_LP
-	test	%R8_LP, %R8_LP
-	jz	L(ExitZero)
-#  endif
-	mov	%rsi, %rcx
-#  ifndef USE_AS_STPCPY
-	mov	%rdi, %rax      /* save result */
-#  endif
+#  define USE_WIDE_CHAR
+# else
+#  define VMOVU_MASK	vmovdqu8
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
 
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+#  define REP_MOVS	rep movsb
 # endif
 
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	cmp	$(VEC_SIZE * 2), %ecx
-	jbe	L(SourceStringAlignmentLessTwoVecSize)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-
-	vpcmpb	$0, (%rsi), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	shr	%cl, %rdx
+# include "reg-macros.h"
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	mov	$VEC_SIZE, %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  else
-	mov	$(VEC_SIZE + 1), %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  endif
-	jbe	L(CopyVecSizeTailCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail)
-
-	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
-	kmovd	%k1, %edx
 
-# ifdef USE_AS_STRNCPY
-	add	$VEC_SIZE, %r10
-	cmp	%r10, %r8
-	jbe	L(CopyTwoVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize)
-
-	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
-	VMOVU	%YMM2, (%rdi)
-
-/* If source address alignment != destination address alignment */
-	.p2align 4
-L(UnalignVecSizeBoth):
-	sub	%rcx, %rdi
-# ifdef USE_AS_STRNCPY
-	add	%rcx, %r8
-	sbb	%rcx, %rcx
-	or	%rcx, %r8
-# endif
-	mov	$VEC_SIZE, %rcx
-	VMOVA	(%rsi, %rcx), %YMM2
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 3), %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STPCPY
+#  define END_REG	rax
 # else
-	jnz	L(CopyVecSize)
+#  define END_REG	rdi, %rdx, CHAR_SIZE
 # endif
 
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
+# ifdef USE_AS_STRCAT
+#  define PAGE_ALIGN_REG	edx
+#  define PAGE_ALIGN_REG_64	rdx
 # else
-	jnz	L(CopyVecSize)
+#  define PAGE_ALIGN_REG	eax
+#  define PAGE_ALIGN_REG_64	rax
 # endif
 
-	VMOVU	%YMM3, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
-	vpcmpb	$0, %YMM4, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
 
-	VMOVU	%YMM4, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
 
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
-	VMOVU	%YMM2, (%rdi, %rcx)
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
 
-	VMOVU	%YMM3, (%rdi, %rcx)
-	mov	%rsi, %rdx
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	and	$-(VEC_SIZE * 4), %rsi
-	sub	%rsi, %rdx
-	sub	%rdx, %rdi
-# ifdef USE_AS_STRNCPY
-	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
-	VMOVA	(%rsi), %YMM4
-	VMOVA	VEC_SIZE(%rsi), %YMM5
-	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
-	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
-	vpminub	%YMM5, %YMM4, %YMM2
-	vpminub	%YMM7, %YMM6, %YMM3
-	vpminub	%YMM2, %YMM3, %YMM2
-	/* If K7 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM2, %YMMZERO, %k7
-	kmovd	%k7, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+# ifdef USE_AS_STRCAT
+	movq	%rdi, %rax
+#  include "strcat-strlen-evex.S"
 # endif
-	test	%edx, %edx
-	jnz	L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
-	add	$(VEC_SIZE * 4), %rdi
-	add	$(VEC_SIZE * 4), %rsi
-	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
-	VMOVA	(%rsi), %YMM4
-	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
-	VMOVA	VEC_SIZE(%rsi), %YMM5
-	vpminub	%YMM5, %YMM4, %YMM2
-	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
-	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
-	VMOVU	%YMM7, -VEC_SIZE(%rdi)
-	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
-	vpminub	%YMM7, %YMM6, %YMM3
-	vpminub	%YMM2, %YMM3, %YMM2
-	/* If K7 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM2, %YMMZERO, %k7
-	kmovd	%k7, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
+
+	movl	%esi, %PAGE_ALIGN_REG
+	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 # endif
-	test	%edx, %edx
-	jz	L(UnalignedFourVecSizeLoop_start)
 
-L(UnalignedFourVecSizeLeave):
-	vpcmpb	$0, %YMM4, %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_0)
 
-	vpcmpb	$0, %YMM5, %YMMZERO, %k2
-	kmovd	%k2, %ecx
-	test	%ecx, %ecx
-	jnz	L(CopyVecSizeUnaligned_16)
+	/* Two short string implementations. One with traditional
+	   branching approach and one with masked instructions (which
+	   have potential for dramatically bad perf if dst splits a
+	   page and is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	VPTEST	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+#  ifdef USE_AS_WCSCPY
+	subl	$((1 << CHAR_PER_VEC)- 1), %VRCX
+#  else
+	inc	%VRCX
+#  endif
+	jz	L(more_1x_vec)
+	KMOV	%VRCX, %k1
+	KXOR	%k0, %k1, %k1
 
-	vpcmpb	$0, %YMM6, %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_32)
-
-	vpcmpb	$0, %YMM7, %YMMZERO, %k4
-	kmovd	%k4, %ecx
-	bsf	%ecx, %edx
-	VMOVU	%YMM4, (%rdi)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 3), %rsi
-	add	$(VEC_SIZE * 3), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
 
-/* If source address alignment == destination address alignment */
+#  ifdef USE_AS_STPCPY
+	bsf	%VRCX, %VRCX
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
+#  endif
+	ret
 
-L(SourceStringAlignmentLessTwoVecSize):
-	VMOVU	(%rsi), %YMM3
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
+# else
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$VEC_SIZE, %r8
+	xorl	%edx, %edx
+	bsf	%VRCX, %VRDX
+#  ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  endif
+
+	/* Use mask bits in rcx to detect which copy we need. If the low
+	   mask is zero then there must be a bit set in the upper half.
+	   I.e if rcx != 0 and ecx == 0, then match must be upper 32
+	   bits so we use L(copy_32_63).  */
+#  if VEC_SIZE == 64
+#   ifdef USE_AS_WCSCPY
+	testb	%cl, %cl
+#   else
+	testl	%ecx, %ecx
+#   endif
+	jz	L(copy_32_63)
+#  endif
+
+#  ifdef USE_AS_WCSCPY
+	testb	$0xf, %cl
 #  else
-	cmp	$(VEC_SIZE + 1), %r8
+	testw	%cx, %cx
 #  endif
-	jbe	L(CopyVecSizeTail1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail1)
+	jz	L(copy_16_31)
 
-	VMOVU	%YMM3, (%rdi)
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$(VEC_SIZE * 2), %r8
+#  ifdef USE_AS_WCSCPY
+	testb	$0x3, %cl
 #  else
-	cmp	$((VEC_SIZE * 2) + 1), %r8
+	testb	%cl, %cl
 #  endif
-	jbe	L(CopyTwoVecSize1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize1)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-	jmp	L(UnalignVecSizeBoth)
+	jz	L(copy_8_15)
 
-/*------End of main part with loops---------------------*/
 
-/* Case1 */
+#  ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	/* No need to copy, we know it's zero.  */
+	movl	$0, (%END_REG)
 
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
-	.p2align 4
-L(CopyVecSize):
-	add	%rcx, %rdi
-# endif
-L(CopyVecSizeTail):
-	add	%rcx, %rsi
-L(CopyVecSizeTail1):
-	bsf	%edx, %edx
-L(CopyVecSizeExit):
-	cmp	$32, %edx
-	jae	L(Exit32_63)
-	cmp	$16, %edx
-	jae	L(Exit16_31)
-	cmp	$8, %edx
-	jae	L(Exit8_15)
-	cmp	$4, %edx
-	jae	L(Exit4_7)
-	cmp	$3, %edx
-	je	L(Exit3)
-	cmp	$1, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	movb	$0, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$1, %r8
-	lea	1(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
 	ret
+#  else
 
-	.p2align 4
-L(CopyTwoVecSize1):
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$VEC_SIZE, %r8
-# endif
-	jmp	L(CopyVecSizeTail1)
-
-	.p2align 4
-L(CopyTwoVecSize):
-	bsf	%edx, %edx
-	add	%rcx, %rsi
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	jmp	L(CopyVecSizeExit)
-
-	.p2align 4
-L(CopyVecSizeUnaligned_0):
-	bsf	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM4, (%rdi)
-	add	$((VEC_SIZE * 4) - 1), %r8
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	jmp	L(CopyVecSizeExit)
-# endif
+	testb	$0x7, %cl
+	jz	L(copy_4_7)
 
-	.p2align 4
-L(CopyVecSizeUnaligned_16):
-	bsf	%ecx, %edx
-	VMOVU	%YMM4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	VEC_SIZE(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	add	$((VEC_SIZE * 3) - 1), %r8
-	sub	%rdx, %r8
-	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
 
-	.p2align 4
-L(CopyVecSizeUnaligned_32):
-	bsf	%edx, %edx
-	VMOVU	%YMM4, (%rdi)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	add	$((VEC_SIZE * 2) - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 2), %rsi
-	add	$(VEC_SIZE * 2), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
+	test	%edx, %edx
+	jz	L(set_null_term)
 
-# ifdef USE_AS_STRNCPY
-#  ifndef USE_AS_STRCAT
-	.p2align 4
-L(CopyVecSizeUnalignedVec6):
-	VMOVU	%YMM6, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec5):
-	VMOVU	%YMM5, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec4):
-	VMOVU	%YMM4, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec3):
-	VMOVU	%YMM3, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
+	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
+	 */
+	vmovd	%VMM_128(0), %esi
+	movw	%si, (%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	/* No need to copy, we know it's zero.  */
+	movb	$0, (%END_REG)
+	ret
 #  endif
 
-/* Case2 */
-
-	.p2align 4
-L(CopyVecSizeCase2):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTailCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTail1Case2):
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-/* Case2 or Case3,  Case3 */
-
-	.p2align 4
-L(CopyVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeCase2)
-L(CopyVecSizeCase3):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyTwoVecSizeCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyVecSizeTailCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTailCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSize1Case2OrCase3):
-	add	$VEC_SIZE, %rdi
-	add	$VEC_SIZE, %rsi
-	sub	$VEC_SIZE, %r8
-L(CopyVecSizeTail1Case2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTail1Case2)
-	jmp	L(StrncpyExit)
+#  if VEC_SIZE == 64
+	.p2align 4,, 6
+L(copy_32_63):
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
+	ret
+#  endif
+
+
+	.p2align 4,, 6
+L(copy_16_31):
+	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
+	   and will save code size.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
+	ret
+
+	.p2align 4,, 8
+L(copy_8_15):
+#  ifdef USE_AS_WCSCPY
+	movl	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+#  else
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
+#  endif
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
+	ret
 # endif
 
-/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
 
-	.p2align 4
-L(Exit1):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
+# ifndef USE_AS_WCSCPY
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%ecx, -(4 - CHAR_SIZE)(%END_REG)
+	ret
 # endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$2, %r8
-	lea	2(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rdi)
 # endif
-	ret
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+	addq	%rsi, %rdi
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
 
-	.p2align 4
-L(Exit2):
-	movzwl	(%rsi), %ecx
-	mov	%cx, (%rdi)
-	movb	$0, 2(%rdi)
+	/* Ideally we store after moves to minimize impact of potential
+	   false-dependencies.  */
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rax)
+# endif
+
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+
+
+	/* Align for 4x loop.  */
+	subq	%rsi, %rdi
+
+	/* + VEC_SIZE * 5 because we never added the original VEC_SIZE
+	   we covered before aligning.  */
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$-(VEC_SIZE * 4), %rsi
+
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Restore rdi (%rdi).  */
+	addq	%rsi, %rdi
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x0_end)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	/* Place L(ret_vec_x4) here to save code size.  We get a
+	   meaningful benefit doing this for stpcpy.  */
+	KMOV	%k4, %VRDX
+L(ret_vec_x3):
+	bsf	%VRDX, %VRDX
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$3, %r8
-	lea	3(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
 # endif
+L(return_end):
 	ret
 
-	.p2align 4
-L(Exit3):
-	mov	(%rsi), %edx
-	mov	%edx, (%rdi)
+	.p2align 4,, 6
+L(ret_vec_x0_end):
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_STPCPY
-	lea	3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$4, %r8
-	lea	4(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
+	inc	%VRCX
+	VMOVU	(-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
 	ret
 
-	.p2align 4
-L(Exit4_7):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	-3(%rsi, %rdx), %ecx
-	mov	%ecx, -3(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX
+	VMOVU	(VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit8_15):
-	mov	(%rsi), %rcx
-	mov	-7(%rsi, %rdx), %r9
-	mov	%rcx, (%rdi)
-	mov	%r9, -7(%rdi, %rdx)
+	.p2align 4,, 4
+L(ret_vec_x2):
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit16_31):
-	VMOVU	(%rsi), %XMM2
-	VMOVU	-15(%rsi, %rdx), %XMM3
-	VMOVU	%XMM2, (%rdi)
-	VMOVU	%XMM3, -15(%rdi, %rdx)
+	/* ret_vec_x3 reuses return code after the loop.  */
+	.p2align 4,, 6
+L(ret_vec_x4):
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub %rdx, %r8
-	sub $1, %r8
-	lea 1(%rdi, %rdx), %rdi
-	jnz L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit32_63):
-	VMOVU	(%rsi), %YMM2
-	VMOVU	-31(%rsi, %rdx), %YMM3
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, -31(%rdi, %rdx)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
+
+	.p2align 4,, 4
+L(page_cross):
+# ifndef USE_AS_STRCAT
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
 # endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	movq	%rsi, %rcx
+	andq	$(VEC_SIZE * -1), %rcx
+
+	VPCMPEQ	(%rcx), %VZERO, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSCPY
+	andl	$(VEC_SIZE - 1), %PAGE_ALIGN_REG
+	shrl	$2, %PAGE_ALIGN_REG
 # endif
-	ret
+	shrx	%VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
 
-# ifdef USE_AS_STRNCPY
+# if USE_MOVSB_IN_PAGE_CROSS
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
 
-	.p2align 4
-L(StrncpyExit1):
-	movzbl	(%rsi), %edx
-	mov	%dl, (%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 1(%rdi)
+	/* This adds one to the later result, which gives the correct
+	   copy bounds. NB: this can never zero-out a non-zero RCX
+	   since, to be in the page-cross case, rsi cannot be aligned
+	   and we already right-shift rcx by the misalignment.  */
+	shl	%VRCX
+	jz	L(page_cross_continue)
+#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-	ret
+	bsf	%VRCX, %VRCX
+	REP_MOVS
 
-	.p2align 4
-L(StrncpyExit2):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
 #  ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 2(%rdi)
+	leaq	-CHAR_SIZE(%rdi), %rax
 #  endif
 	ret
 
-	.p2align 4
-L(StrncpyExit3_4):
-	movzwl	(%rsi), %ecx
-	movzwl	-2(%rsi, %r8), %edx
-	mov	%cx, (%rdi)
-	mov	%dx, -2(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
 
-	.p2align 4
-L(StrncpyExit5_8):
-	mov	(%rsi), %ecx
-	mov	-4(%rsi, %r8), %edx
-	mov	%ecx, (%rdi)
-	mov	%edx, -4(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
+# else
+	/* Check if we found zero-char before end of page.  */
+	test	%VRCX, %VRCX
+	jz	L(page_cross_continue)
 
-	.p2align 4
-L(StrncpyExit9_16):
-	mov	(%rsi), %rcx
-	mov	-8(%rsi, %r8), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
+	/* Traditional copy case, essentially the same as in the non-
+	   page-cross case, but since we can't reuse VMM(0) we need
+	   twice as many loads from rsi.  */
 
-	.p2align 4
-L(StrncpyExit17_32):
-	VMOVU	(%rsi), %XMM2
-	VMOVU	-16(%rsi, %r8), %XMM3
-	VMOVU	%XMM2, (%rdi)
-	VMOVU	%XMM3, -16(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
+#  ifndef USE_AS_STRCAT
+	xorl	%edx, %edx
 #  endif
-	ret
-
-	.p2align 4
-L(StrncpyExit33_64):
-	/*  0/32, 31/16 */
-	VMOVU	(%rsi), %YMM2
-	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
+	/* Dependency on rdi must already have been satisfied.  */
+	bsf	%VRCX, %VRDX
 #  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  elif !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
 
-	.p2align 4
-L(StrncpyExit65):
-	/* 0/32, 32/32, 64/1 */
-	VMOVU	(%rsi), %YMM2
-	VMOVU	32(%rsi), %YMM3
-	mov	64(%rsi), %cl
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, 32(%rdi)
-	mov	%cl, 64(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	65(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 65(%rdi)
+#  if VEC_SIZE == 64
+#   ifdef USE_AS_WCSCPY
+	testb	%cl, %cl
+#   else
+	test	%ecx, %ecx
+#   endif
+	jz	L(page_cross_copy_32_63)
 #  endif
-	ret
-
-#  ifndef USE_AS_STRCAT
 
-	.p2align 4
-L(Fill1):
-	mov	%dl, (%rdi)
-	ret
+#  ifdef USE_AS_WCSCPY
+	testb	$0xf, %cl
+#  else
+	testw	%cx, %cx
+#  endif
+	jz	L(page_cross_copy_16_31)
 
-	.p2align 4
-L(Fill2):
-	mov	%dx, (%rdi)
-	ret
+#  ifdef USE_AS_WCSCPY
+	testb	$0x3, %cl
+#  else
+	testb	%cl, %cl
+#  endif
+	jz	L(page_cross_copy_8_15)
 
-	.p2align 4
-L(Fill3_4):
-	mov	%dx, (%rdi)
-	mov     %dx, -2(%rdi, %r8)
+#  ifdef USE_AS_WCSCPY
+	movl	(%rsi), %esi
+	movl	%esi, (%rdi)
+	movl	$0, (%END_REG)
 	ret
+#  else
 
-	.p2align 4
-L(Fill5_8):
-	mov	%edx, (%rdi)
-	mov     %edx, -4(%rdi, %r8)
-	ret
+	testb	$0x7, %cl
+	jz	L(page_cross_copy_4_7)
 
-	.p2align 4
-L(Fill9_16):
-	mov	%rdx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
+	test	%edx, %edx
+	jz	L(page_cross_set_null_term)
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+L(page_cross_set_null_term):
+	movb	$0, (%END_REG)
 	ret
 
-	.p2align 4
-L(Fill17_32):
-	VMOVU	%XMMZERO, (%rdi)
-	VMOVU	%XMMZERO, -16(%rdi, %r8)
-	ret
 
-	.p2align 4
-L(CopyVecSizeUnalignedVec2):
-	VMOVU	%YMM2, (%rdi, %rcx)
-
-	.p2align 4
-L(CopyVecSizeVecExit):
-	bsf	%edx, %edx
-	add	$(VEC_SIZE - 1), %r8
-	add	%rcx, %rdi
-#   ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-#   endif
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-
-	.p2align 4
-L(StrncpyFillTailWithZero):
-	xor	%edx, %edx
-	sub	$VEC_SIZE, %r8
-	jbe	L(StrncpyFillExit)
-
-	VMOVU	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-
-	mov	%rdi, %rsi
-	and	$(VEC_SIZE - 1), %esi
-	sub	%rsi, %rdi
-	add	%rsi, %r8
-	sub	$(VEC_SIZE * 4), %r8
-	jb	L(StrncpyFillLessFourVecSize)
-
-L(StrncpyFillLoopVmovdqa):
-	VMOVA	%YMMZERO, (%rdi)
-	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
-	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
-	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE * 4), %rdi
-	sub	$(VEC_SIZE * 4), %r8
-	jae	L(StrncpyFillLoopVmovdqa)
-
-L(StrncpyFillLessFourVecSize):
-	add	$(VEC_SIZE * 2), %r8
-	jl	L(StrncpyFillLessTwoVecSize)
-	VMOVA	%YMMZERO, (%rdi)
-	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
-	add	$(VEC_SIZE * 2), %rdi
-	sub	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	VMOVA	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillLessTwoVecSize):
-	add	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	VMOVA	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillExit):
-	add	$VEC_SIZE, %r8
-L(Fill):
-	cmp	$17, %r8d
-	jae	L(Fill17_32)
-	cmp	$9, %r8d
-	jae	L(Fill9_16)
-	cmp	$5, %r8d
-	jae	L(Fill5_8)
-	cmp	$3, %r8d
-	jae	L(Fill3_4)
-	cmp	$1, %r8d
-	ja	L(Fill2)
-	je	L(Fill1)
+	.p2align 4,, 4
+L(page_cross_copy_4_7):
+	movl	(%rsi), %ecx
+	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, -(4 - CHAR_SIZE)(%END_REG)
 	ret
-
-/* end of ifndef USE_AS_STRCAT */
 #  endif
 
-	.p2align 4
-L(UnalignedLeaveCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(UnalignedFourVecSizeLeaveCase2)
-L(UnalignedFourVecSizeLeaveCase3):
-	lea	(VEC_SIZE * 4)(%r8), %rcx
-	and	$-VEC_SIZE, %rcx
-	add	$(VEC_SIZE * 3), %r8
-	jl	L(CopyVecSizeCase3)
-	VMOVU	%YMM4, (%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 4)(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (VEC_SIZE * 4)(%rdi)
-#  endif
+#  if VEC_SIZE == 64
+	.p2align 4,, 4
+L(page_cross_copy_32_63):
+	VMOVU	(%rsi), %VMM_256(0)
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
 	ret
-
-	.p2align 4
-L(UnalignedFourVecSizeLeaveCase2):
-	xor	%ecx, %ecx
-	vpcmpb	$0, %YMM4, %YMMZERO, %k1
-	kmovd	%k1, %edx
-	add	$(VEC_SIZE * 3), %r8
-	jle	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-	vpcmpb	$0, %YMM5, %YMMZERO, %k2
-	kmovd	%k2, %edx
-	VMOVU	%YMM4, (%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec5)
-#  else
-	jnz	L(CopyVecSize)
 #  endif
 
-	vpcmpb	$0, %YMM6, %YMMZERO, %k3
-	kmovd	%k3, %edx
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec6)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-
-	vpcmpb	$0, %YMM7, %YMMZERO, %k4
-	kmovd	%k4, %edx
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	lea	VEC_SIZE(%rdi, %rcx), %rdi
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-L(StrncpyExit):
-	cmp	$65, %r8d
-	je	L(StrncpyExit65)
-	cmp	$33, %r8d
-	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8d
-	jae	L(StrncpyExit17_32)
-	cmp	$9, %r8d
-	jae	L(StrncpyExit9_16)
-	cmp	$5, %r8d
-	jae	L(StrncpyExit5_8)
-	cmp	$3, %r8d
-	jae	L(StrncpyExit3_4)
-	cmp	$1, %r8d
-	ja	L(StrncpyExit2)
-	je	L(StrncpyExit1)
-#  ifdef USE_AS_STPCPY
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi)
-#  endif
+	.p2align 4,, 4
+L(page_cross_copy_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
 	ret
 
-	.p2align 4
-L(ExitZero):
-#  ifndef USE_AS_STRCAT
-	mov	%rdi, %rax
-#  endif
+	.p2align 4,, 4
+L(page_cross_copy_8_15):
+	movq	(%rsi), %rcx
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+	movq	%rcx, (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%END_REG)
 	ret
-
-# endif
-
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
 # endif
+END(STRCPY)
 #endif
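
As an aside for reviewers: the strcpy/stpcpy body above keeps the usual
block-at-a-time shape -- each VEC-sized block of the source is scanned
for the terminating NUL (VPTESTN/KMOV) before it is stored, and only the
block containing the NUL is finished with a partial copy.  A rough
scalar sketch of that structure (purely illustrative, not part of the
patch; strcpy_blockwise/BLOCK are made-up stand-ins for the vector loop
and VEC_SIZE):

#include <string.h>

#define BLOCK 32	/* stands in for VEC_SIZE */

static char *
strcpy_blockwise (char *dst, const char *src)
{
  for (;;)
    {
      /* Scan the next block for the terminating NUL before storing.  */
      const char *nul = memchr (src, '\0', BLOCK);
      if (nul != NULL)
	{
	  size_t len = (size_t) (nul - src) + 1;	/* include the NUL */
	  memcpy (dst, src, len);
	  return dst + len - 1;	/* stpcpy-style end pointer */
	}
      memcpy (dst, src, BLOCK);
      src += BLOCK;
      dst += BLOCK;
    }
}

The vector version above finishes with a single overlapping VEC-sized
store instead of an exact-length copy, and has the separate
L(page_cross) entry for when the first unaligned load would touch the
next page.
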
diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
index 203a19bf21..38dcbfa0ec 100644
--- a/sysdeps/x86_64/multiarch/strncat-evex.S
+++ b/sysdeps/x86_64/multiarch/strncat-evex.S
@@ -1,7 +1,512 @@
-#ifndef STRNCAT
-# define STRNCAT	__strncat_evex
-#endif
+/* {wcs|str}ncat  with 256/512-bit EVEX.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+# ifndef STRNCAT
+#  define STRNCAT	__strncat_evex
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+#  define VMOVU_MASK	vmovdqu32
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+
+#  define REP_MOVS	rep movsd
+
+#  define VMASK_REG	VR10
+#  define FIND_FIRST_ONE(src, dst)	movl $CHAR_PER_VEC, %dst; bsf %src, %dst
+
+#  define USE_WIDE_CHAR
+# else
+#  define VMOVU_MASK	vmovdqu8
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+
+#  define REP_MOVS	rep movsb
+
+#  define VMASK_REG	VRCX
+#  define FIND_FIRST_ONE(src, dst)	tzcnt %src, %dst
+
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# include "reg-macros.h"
+
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+	movq	%rdi, %rax
+# ifdef USE_AS_WCSCPY
+	decq	%rdx
+	movq	%rdx, %rcx
+	shrq	$56, %rcx
+	jnz	L(zero_len)
+# else
+	decq	%rdx
+	jl	L(zero_len)
+# endif
+
+# include "strcat-strlen-evex.S"
+
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPTESTN	%VMM(0), %VMM(0), %k0
+
+	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
+	   length <= CHAR_PER_VEC with masked instructions (which have
+	   potential for dramatically bad perf if dst splits a page and
+	   is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	KMOV	%k0, %VRCX
+	cmpq	$CHAR_PER_VEC, %rdx
+	jae	L(more_1x_vec)
+	bts	%VRDX, %VRCX
+L(less_1x_vec_masked):
+	blsmsk	%VRCX, %VRCX
+
+	KMOV	%VRCX, %k1
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
+	ret
+# else
+	KMOV	%k0, %VMASK_REG
+	/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
+	   %VMASK_REG, %VRCX` for wcsncat.  */
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpq	%rcx, %rdx
+	jb	L(less_1x_vec)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	je	L(more_1x_vec)
+
+	movl	%ecx, %edx
+
+L(less_1x_vec):
+#  if VEC_SIZE == 64
+	cmpl	$(32 / CHAR_SIZE - 1), %edx
+	jae	L(copy_32_63)
+#  endif
+
+	cmpl	$(16 / CHAR_SIZE - 1), %edx
+	jae	L(copy_16_31)
+
+
+	cmpl	$(8 / CHAR_SIZE - 1), %edx
+	jae	L(copy_8_15)
+
+#  ifdef USE_AS_WCSCPY
+	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%ecx, -(4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	ret
+#  else
+
+	cmpl	$3, %edx
+	jae	L(copy_4_7)
+
+	movzbl	(%rsi, %rdx), %ecx
+	test	%edx, %edx
+	je	L(set_null_term)
+
+	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
+	 */
+	movzwl	(%rsi), %esi
+	movw	%si, (%rdi)
+
+
+	.p2align 4,, 1
+L(set_null_term):
+	movb	%cl, (%rdi, %rdx)
+	ret
+#  endif
+
+#  if VEC_SIZE == 64
+	.p2align 4,, 6
+L(copy_32_63):
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	ret
+#  endif
+	.p2align 4,, 6
+L(copy_16_31):
+	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
+	   and will save code size.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 2
+L(copy_8_15):
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	ret
+
+#  ifndef USE_AS_WCSCPY
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%ecx, -(4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	ret
+#  endif
+
+# endif
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# if USE_EVEX_MASKED_STORE
+	test	%VRCX, %VRCX
+	jnz	L(less_1x_vec_masked)
+# endif
+
+
+	VMOVU	%VMM(0), (%rdi)
 
-#define USE_AS_STRNCAT
-#define STRCAT	STRNCAT
-#include "strcat-evex.S"
+	/* We are going to align rsi here so will need to be able to
+	   re-adjust rdi/rdx afterwards. NB: We filtered out huge
+	   lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
+
+	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+L(loop_last_4x_vec):
+	addq	%rsi, %rdi
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+
+	/* Will need this regardless.  */
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VMASK_REG
+
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpl	%ecx, %edx
+	jb	L(ret_vec_x1_len)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	jne	L(ret_vec_x1)
+
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+
+	/* wcsncat needs to mask edx (length) before `bts`. strncat
+	   does not, as `bts` naturally masks the bit-position to the
+	   operand size.  */
+# ifdef USE_AS_WCSCPY
+	andl	$(CHAR_PER_VEC - 1), %edx
+# endif
+	bts	%VRDX, %VRCX
+L(ret_vec_x2):
+	bsf	%VRCX, %VRCX
+	VMOVU	(VEC_SIZE * 2 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), (VEC_SIZE * 2 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 8
+L(ret_vec_x1_len):
+	movl	%edx, %ecx
+L(ret_vec_x1):
+	VMOVU	(VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), (VEC_SIZE-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 4
+L(zero_len):
+	incq	%rdx
+	jne	OVERFLOW_STRCAT
+	ret
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	/* Separate logic for CHAR_PER_VEC == 64 because we can do
+	   `andl $(CHAR_PER_VEC * 4 - 1), %edx` with less code size
+	   just using `movzbl`.  */
+# if CHAR_PER_VEC == 64
+	movzbl	%dl, %edx
+# else
+	andl	$(CHAR_PER_VEC * 4 - 1), %edx
+# endif
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VMASK_REG
+	subq	$-(VEC_SIZE * 4), %rsi
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2 - 1), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+	/* L(ret_vec_x1) expects position already to be in rcx so use
+	   `bsf` to test zero.  */
+	bsf	%VMASK_REG, %VRCX
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VMASK_REG
+
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rdx
+	ja	L(more_4x_vec)
+
+	/* Adjust length before going to L(ret_vec_x3_len) or
+	   L(ret_vec_x3).  */
+	addl	$(CHAR_PER_VEC * -2), %edx
+
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpl	%ecx, %edx
+	jb	L(ret_vec_x3_len)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	jne	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSCPY
+	andl	$(CHAR_PER_VEC - 1), %edx
+# endif
+	bts	%VRDX, %VRCX
+	.p2align 4,, 6
+L(ret_vec_x4):
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+
+	.p2align 4,, 8
+L(ret_vec_x3_len):
+	movl	%edx, %ecx
+L(ret_vec_x3):
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	bsf	%VMASK_REG, %VRCX
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+
+	/* Check if we are near the end before aligning.  */
+	cmpq	$(CHAR_PER_VEC * 8 - 1), %rdx
+	jbe	L(last_4x_vec)
+
+
+	/* Add rsi to rdx (length) before aligning rsi. NB: Since we
+	   filtered out huge lengths this cannot overflow.  */
+# ifdef USE_AS_WCSCPY
+	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rsi, %rdx
+# endif
+
+	/* Subtract rsi from rdi before aligning (adding it back will
+	   give the correct rdi for the aligned rsi).  */
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+
+	/* Offset rsi by VEC_SIZE so that we can jump to
+	   L(loop_last_4x_vec).  */
+	addq	$-(VEC_SIZE), %rsi
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	/* Store loop end in r9.  */
+	leaq	-(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+
+	/* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
+	   test with bsf.  */
+	bsf	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
+
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	bsf	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+
+	KMOV	%k4, %VRDX
+	bsf	%VRDX, %VRDX
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(return_end):
+	ret
+
+	.p2align 4,, 4
+	.p2align 6,, 8
+L(page_cross):
+	movq	%rsi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %k0
+
+# ifdef USE_AS_WCSCPY
+	KMOV	%k0, %VR9
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+	shrx	%VRCX, %VR9, %VRCX
+# else
+	KMOV	%k0, %VRCX
+	shrx	%VRSI, %VRCX, %VRCX
+# endif
+
+	subl	%esi, %r8d
+	andl	$(VEC_SIZE - 1), %r8d
+# ifdef USE_AS_WCSCPY
+	shrl	$2, %r8d
+# endif
+	cmpq	%r8, %rdx
+	jb	L(page_cross_small)
+	/* Optimizing more for space as this is very cold code. This
+	   saves 2x cache lines.  */
+
+	/* This adds one to the later result, which gives the correct
+	   copy bounds. NB: this can never zero-out a non-zero RCX
+	   since, to be in the page-cross case, rsi cannot be aligned
+	   and we already right-shift rcx by the misalignment.  */
+	shl	%VRCX
+	jz	L(page_cross_continue)
+	bsf	%VRCX, %VRCX
+	REP_MOVS
+	ret
+
+L(page_cross_small):
+	tzcnt	%VRCX, %VRCX
+	cmpq	%rdx, %rcx
+	cmova	%edx, %ecx
+	incl	%ecx
+# ifdef USE_AS_WCSCPY
+	rep	movsd
+# else
+	rep	movsb
+# endif
+	ret
+END(STRNCAT)
+#endif
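
For review purposes, the contract the code above implements is plain
strncat: find the end of dst, append at most n characters of src, and
always write a terminating NUL (the checks at entry handle the
zero-length and overflow edge cases).  A byte-wise C equivalent, shown
only as a reference point (strncat_ref is just an illustrative name):

#include <string.h>

char *
strncat_ref (char *dst, const char *src, size_t n)
{
  char *end = dst + strlen (dst);	/* the strcat-strlen-evex.S step */
  size_t i;

  for (i = 0; i < n && src[i] != '\0'; i++)
    end[i] = src[i];
  end[i] = '\0';			/* always NUL-terminate */
  return dst;
}
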
diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
index 1b3426d511..49eaf4cbd9 100644
--- a/sysdeps/x86_64/multiarch/strncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
@@ -1,7 +1,990 @@
-#ifndef STRNCPY
-# define STRNCPY	__strncpy_evex
-#endif
+/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+
+
+# include <sysdep.h>
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNCPY
+#  define STRNCPY	__strncpy_evex
+# endif
+
+# ifdef USE_AS_WCSCPY
+#  define VMOVU_MASK	vmovdqu32
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define CHAR_SIZE	4
+
+#  define REP_MOVS	rep movsd
+#  define REP_STOS	rep stosl
+
+#  define USE_WIDE_CHAR
+
+# else
+#  define VMOVU_MASK	vmovdqu8
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define CHAR_SIZE	1
+
+#  define REP_MOVS	rep movsb
+#  define REP_STOS	rep stosb
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+
+# define VZERO	VMM(7)
+# define VZERO_256	VMM_256(7)
+# define VZERO_128	VMM_128(7)
+
+# if VEC_SIZE == 64
+#  define VZERO_HALF	VZERO_256
+# else
+#  define VZERO_HALF	VZERO_128
+# endif
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+	/* Filter zero length strings and very long strings.  Zero
+	   length strings just return; very long strings are handled by
+	   running rep stos{b|l} to zero the buffer (which will almost
+	   certainly segfault), and if that succeeds then by calling
+	   OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
+# ifdef USE_AS_WCSCPY
+	decq	%rdx
+	movq	%rdx, %rax
+	/* 56 is end of max supported address space.  */
+	shr	$56, %rax
+	jnz	L(zero_len)
+# else
+	decq	%rdx
+	/* If the flag needs to become `jb` replace `dec` with `sub`.
+	 */
+	jl	L(zero_len)
+# endif
+
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
+	movl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+
+	/* If no STPCPY just save end ahead of time.  */
+# ifndef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+
+
+	cmpq	$(CHAR_PER_VEC), %rdx
+
+	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
+	   length <= CHAR_PER_VEC with masked instructions (which have
+	   potential for dramatically bad perf if dst splits a page and
+	   is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	/* `jae` because length rdx is now length - 1.  */
+	jae	L(more_1x_vec)
+
+	/* If there were multiple zero-CHAR matches in the first VEC,
+	   VRCX will be overset but that's fine since any oversets were
+	   at zero-positions anyway.  */
+
+#  ifdef USE_AS_STPCPY
+	tzcnt	%VRCX, %VRAX
+	cmpl	%eax, %edx
+	cmovb	%edx, %eax
+#   ifdef USE_AS_WCSCPY
+	adcl	$0, %eax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#   else
+	adcq	%rdi, %rax
+#   endif
+#  endif
+	dec	%VRCX
+
+	/* Zero out all non-zero CHAR's after the first zero match.  */
+	KMOV	%VRCX, %k1
+
+	/* Use VZERO as the destination so this can be reused for
+	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
+	   will have zeroed out VZERO).  */
+	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
+L(zfill_less_vec):
+	/* Get mask for what we need to set.  */
+	incl	%edx
+	mov	$-1, %VRCX
+	bzhi	%VRDX, %VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	VMOVU_MASK %VZERO, (%rdi){%k1}
+	ret
+
+	.p2align 4,, 4
+L(zero_len):
+	cmpq	$-1, %rdx
+	jne	L(best_effort_strncpy)
+	movq	%rdi, %rax
+	ret
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# else
+	/* `jb` because length rdx is now length - 1.  */
+	jb	L(less_1x_vec)
+# endif
+
+
+	/* This may overset but that's fine because we still need to
+	   zero fill.  */
+	VMOVU	%VMM(0), (%rdi)
+
+
+	/* Length must be >= CHAR_PER_VEC so match here means we must
+	   zero-fill.  */
+	test	%VRCX, %VRCX
+	jnz	L(zfill)
+
+
+	/* We are going to align rsi here so will need to be able to
+	   re-adjust rdi/rdx afterwards. NB: We filtered out huge
+	   lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
+	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+
+L(loop_last_4x_vec):
+	addq	%rsi, %rdi
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+
+	/* -1 because of the `dec %rdx` earlier.  */
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	/* This will need to be computed no matter what. We do it
+	   ahead of time for CHAR_PER_VEC == 64 because we can't adjust
+	   the value of `tzcnt` with a shift.  */
+# if CHAR_PER_VEC == 64
+	tzcntq	%rcx, %rcx
+# endif
+
+	cmpl	$(CHAR_PER_VEC), %edx
+	jb	L(ret_vec_x1_len)
+
+	/* Separate logic for CHAR_PER_VEC == 64 because we already did
+	   `tzcnt` on VRCX.  */
+# if CHAR_PER_VEC == 64
+	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
+	cmpb	$CHAR_PER_VEC, %cl
+	jnz	L(ret_vec_x1_no_bsf)
+# else
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+# endif
+
+
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %VZERO, %k0
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	KMOV	%k0, %VRCX
+
+# if CHAR_PER_VEC < 64
+	/* This essentially adds CHAR_PER_VEC to the computed result.  */
+	shlq	$CHAR_PER_VEC, %rcx
+# else
+	tzcntq	%rcx, %rcx
+	addl	$CHAR_PER_VEC, %ecx
+# endif
+
+	.p2align 4,, 4
+L(ret_vec_x1_len):
+	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
+	   already been done.  */
+# if CHAR_PER_VEC < 64
+	tzcntq	%rcx, %rcx
+# endif
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x1_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+	VMOVU	((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+	.p2align 4,, 10
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX
+L(ret_vec_x1_no_bsf):
+	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	subl	%ecx, %edx
+	cmpl	$CHAR_PER_VEC, %edx
+	jb	L(ret_vec_x1_len_no_zfill_mov)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	/* Separate logic for CHAR_PER_VEC == 64 because we can do
+	   `andl $(CHAR_PER_VEC * 4 - 1), %edx` with less code size
+	   just using `movzbl`.  */
+# if CHAR_PER_VEC == 64
+	movzbl	%dl, %edx
+# else
+	andl	$(CHAR_PER_VEC * 4 - 1), %edx
+# endif
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+	subq	$-(VEC_SIZE * 4), %rsi
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2 - 1), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	test	%VRCX, %VRCX
+	/* Must fill at least 2x VEC.  */
+	jnz	L(zfill_vec1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	/* Must fill at least 1x VEC.  */
+	jnz	L(zfill_vec2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRCX
+
+	/* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rdx
+	ja	L(more_4x_vec)
+
+	subl	$(CHAR_PER_VEC * 3), %edx
+	jb	L(ret_vec_x3_len)
+
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rsi), %VZERO, %k0
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	KMOV	%k0, %VRCX
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x4_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	movl	%ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 4 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+L(ret_vec_x3_len):
+	addl	$(CHAR_PER_VEC * 1), %edx
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x3_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+	.p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 3 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsf	%VRCX, %VRCX
+	VMOVU	%VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
+	subl	%ecx, %edx
+	jl	L(ret_vec_x3_len_no_zfill_mov)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec4)
 
-#define USE_AS_STRNCPY
-#define STRCPY	STRNCPY
-#include "strcpy-evex.S"
+	/* Recheck length before aligning.  */
+	cmpq	$(CHAR_PER_VEC * 8 - 1), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
+# ifdef USE_AS_WCSCPY
+	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rsi, %rdx
+# endif
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+
+
+	/* Offset rsi by VEC_SIZE so that we can jump to
+	   L(loop_last_4x_vec).  */
+	addq	$-(VEC_SIZE), %rsi
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	/* Store loop end in r9.  */
+	leaq	-(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	/* Restore rdx (length).  */
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec1)
+
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec2)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec3)
+
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
+	KMOV	%k4, %VRCX
+	/* Fall through to zero-fill the remaining length.  */
+
+	.p2align 4,, 4
+L(zfill_vec4):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -2), %rdx
+L(zfill_vec2):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -1), %rdx
+L(zfill):
+	/* VRCX must be non-zero.  */
+	bsf	%VRCX, %VRCX
+
+	/* Adjust length / dst for zfill.  */
+	subq	%rcx, %rdx
+# ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+	addq	%rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_from_page_cross):
+
+	/* From here on out its just memset(rdi, 0, rdx).  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jb	L(zfill_less_vec)
+
+L(zfill_more_1x_vec):
+	VMOVU	%VZERO, (%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
+	ja	L(zfill_more_2x_vec)
+L(zfill_done0):
+	ret
+
+	/* Coming from vec1/vec2 we must be able to zfill at least 2x
+	   VEC.  */
+	.p2align 4,, 8
+L(zfill_vec3):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -2), %rdx
+	.p2align 4,, 2
+L(zfill_vec1):
+	bsfq	%rcx, %rcx
+	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
+	 */
+	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+
+
+	VMOVU	%VZERO, (%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jb	L(zfill_done0)
+L(zfill_more_2x_vec):
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VZERO, (VEC_SIZE)(%rdi)
+	subq	$(CHAR_PER_VEC * 4 - 1), %rdx
+	jbe	L(zfill_done)
+
+# ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rdi, %rdx
+# endif
+
+	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi)
+
+
+	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	jbe	L(zfill_done)
+
+	/* Align rdi and zfill loop.  */
+	andq	$-(VEC_SIZE), %rdi
+	.p2align 4,, 12
+L(zfill_loop_4x_vec):
+	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	ja	L(zfill_loop_4x_vec)
+L(zfill_done):
+	ret
+
+
+	/* Less 1x VEC case if we are not using evex masked store.  */
+# if !USE_EVEX_MASKED_STORE
+	.p2align 4,, 8
+L(copy_1x):
+	/* Special case for copy 1x. It can be handled quickly and many
+	   buffer sizes have convenient alignment.  */
+	VMOVU	%VMM(0), (%rdi)
+	/* If no zeros then we are done.  */
+	testl	%ecx, %ecx
+	jz	L(ret_1x_1x)
+
+	/* Need to zfill; note we know that length <= CHAR_PER_VEC so
+	   we only handle the small case here.  */
+	bsf	%VRCX, %VRCX
+L(zfill_less_vec_no_bsf):
+	/* Adjust length / dst then just zfill less_vec.  */
+	subq	%rcx, %rdx
+#  ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+
+L(zfill_less_vec):
+	cmpl	$((VEC_SIZE / 2) / CHAR_SIZE), %edx
+	jb	L(zfill_less_half)
+
+	VMOVU	%VZERO_HALF, (%rdi)
+	VMOVU	%VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	ret
+#  ifdef USE_AS_STPCPY
+L(ret_1x_1x):
+	leaq	CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
+	ret
+#  endif
+
+
+#  if VEC_SIZE == 64
+	.p2align 4,, 4
+L(copy_32_63):
+	/* Overfill to avoid branches.  */
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+
+	/* We are taking advantage of the fact that to be here we must
+	   be writing the null-term at (%rdi, %rcx), so we have a byte
+	   of leeway for overwriting.  */
+	cmpl	%ecx, %edx
+	ja	L(zfill_less_vec_no_bsf)
+#   ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+#   else
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  endif
+
+	.p2align 4,, 4
+L(copy_16_31):
+	/* Overfill to avoid branches.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpl	%ecx, %edx
+
+	/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
+	   we have a larger copy block for 32-63 so this just falls
+	   through to zfill 16-31. If VEC_SIZE == 32 then we check for
+	   full zfill of less than 1x VEC.  */
+#  if VEC_SIZE == 64
+	jbe	L(ret_16_31)
+	subl	%ecx, %edx
+#   ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#   else
+	addq	%rcx, %rdi
+#   endif
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+L(zfill_less_half):
+L(zfill_less_32):
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jb	L(zfill_less_16)
+	VMOVU	%VZERO_128, (%rdi)
+	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+#   ifdef USE_AS_STPCPY
+	ret
+#   endif
+L(ret_16_31):
+#   ifdef USE_AS_STPCPY
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  else
+	/* VEC_SIZE == 32 begins.  */
+	ja	L(zfill_less_vec_no_bsf)
+#   ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+#   else
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  endif
+
+
+	.p2align 4,, 4
+L(copy_8_15):
+	/* Overfill to avoid branches.  */
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpl	%ecx, %edx
+	jbe	L(ret_8_15)
+	subl	%ecx, %edx
+#  ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+	.p2align 4,, 8
+#  if VEC_SIZE == 32
+L(zfill_less_half):
+#  endif
+L(zfill_less_16):
+	xorl	%ecx, %ecx
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jb	L(zfill_less_8)
+	movq	%rcx, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+#  ifndef USE_AS_STPCPY
+L(ret_8_15):
+#  endif
+	ret
+
+	.p2align 4,, 8
+L(less_1x_vec):
+	je	L(copy_1x)
+
+	/* We will need `tzcnt` result for all other copy sizes.  */
+	tzcnt	%VRCX, %VRCX
+#  if VEC_SIZE == 64
+	cmpl	$(32 / CHAR_SIZE), %edx
+	jae	L(copy_32_63)
+#  endif
+
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(copy_16_31)
+
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jae	L(copy_8_15)
+#  ifdef USE_AS_WCSCPY
+	testl	%ecx, %ecx
+	jz	L(zfill_less_8_set_ret)
+
+	movl	(%rsi, %rdx, CHAR_SIZE), %esi
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%esi, (%rdi, %rdx, CHAR_SIZE)
+#   ifdef USE_AS_STPCPY
+	cmpl	%ecx, %edx
+L(ret_8_15):
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#   endif
+	ret
+L(zfill_less_8_set_ret):
+	xorl	%ecx, %ecx
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+L(zfill_less_8):
+	movl	%ecx, (%rdi)
+	movl	%ecx, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  else
+	cmpl	$3, %edx
+	jb	L(copy_0_3)
+	/* Overfill to avoid branches.  */
+	movl	-3(%rsi, %rdx), %esi
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%esi, -3(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_4_7)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+	xorl	%ecx, %ecx
+	.p2align 4,, 8
+L(zfill_less_8):
+	cmpl	$3, %edx
+	jb	L(zfill_less_3)
+	movl	%ecx, (%rdi)
+	movl	%ecx, -3(%rdi, %rdx)
+#   ifdef USE_AS_STPCPY
+	ret
+#   endif
+
+L(ret_4_7):
+#   ifdef USE_AS_STPCPY
+L(ret_8_15):
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#   endif
+	ret
+
+	.p2align 4,, 4
+L(zfill_less_3):
+	testl	%edx, %edx
+	jz	L(zfill_1)
+	movw	%cx, (%rdi)
+L(zfill_1):
+	movb	%cl, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_0_3):
+	vmovd	%VMM_128(0), %r8d
+	testl	%edx, %edx
+	jz	L(copy_1)
+	movw	%r8w, (%rdi)
+	cmpl	%ecx, %edx
+	ja	L(zfill_from_1)
+	movzbl	(%rsi, %rdx), %r8d
+#   ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+	movb	%r8b, (%rdi, %rdx)
+	ret
+#   endif
+
+L(copy_1):
+#   ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	cmpl	%ecx, %edx
+	adcq	%rdi, %rax
+#   endif
+#   ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+#   else
+	movb	%r8b, (%rdi, %rdx)
+#   endif
+	ret
+#  endif
+
+
+#  ifndef USE_AS_WCSCPY
+	.p2align 4,, 8
+L(zfill_from_1):
+#   ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+#   endif
+	movw	$0, -1(%rdi, %rdx)
+	ret
+#  endif
+
+	.p2align 4,, 4
+L(zero_len):
+	incq	%rdx
+	jne	L(best_effort_strncpy)
+	movq	%rdi, %rax
+	ret
+# endif
+
+
+	.p2align 4,, 4
+	.p2align 6,, 8
+L(page_cross):
+	movq	%rsi, %rax
+	andq	$(VEC_SIZE * -1), %rax
+	VPCMPEQ	(%rax), %VZERO, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSCPY
+	movl	%esi, %r8d
+	shrl	$2, %r8d
+	andl	$(CHAR_PER_VEC - 1), %r8d
+	shrx	%VR8, %VRCX, %VRCX
+# else
+	shrx	%VRSI, %VRCX, %VRCX
+# endif
+
+	/* Compute amount of bytes we checked.  */
+	subl	%esi, %eax
+	andl	$(VEC_SIZE - 1), %eax
+# ifdef USE_AS_WCSCPY
+	shrl	$2, %eax
+# endif
+
+	/* If rax > rdx then we are finishing the copy at the end of the
+	   page.  */
+	cmpq	%rax, %rdx
+	jb	L(page_cross_small)
+
+
+	/* If rcx is non-zero then continue.  */
+	test	%VRCX, %VRCX
+	jz	L(page_cross_continue)
+
+	/* We found zero-CHAR so need to copy then zfill (we know we
+	   didn't cover all of length here).  */
+	bsf	%VRCX, %VRCX
+L(movsb_and_zfill):
+	incl	%ecx
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	leaq	-CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
+# else
+	movq	%rdi, %rax
+# endif
+
+	REP_MOVS
+# ifdef USE_AS_WCSCPY
+	movl	$0, (%rdi)
+# else
+	movb	$0, (%rdi)
+# endif
+	jmp	L(zfill_from_page_cross)
+
+L(page_cross_small):
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(page_cross_copy_only)
+
+	/* Do a zfill of the tail before copying.  */
+	movq	%rdi, %r9
+	xorl	%eax, %eax
+
+	movl	%ecx, %r8d
+
+	subl	%ecx, %edx
+	leaq	CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+	movl	%edx, %ecx
+	REP_STOS
+	movq	%r9, %rdi
+	movl	%r8d, %edx
+L(page_cross_copy_only):
+	leal	1(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcl	$0, %edx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# else
+	movq	%rdi, %rax
+# endif
+	REP_MOVS
+	ret
+
+
+L(best_effort_strncpy):
+	movq	%rdx, %rcx
+	xorl	%eax, %eax
+	movq	%rdi, %r8
+	/* The length is >= 2^63. We very much expect to segfault at
+	   rep stos. If that doesn't happen then just strcpy to finish.
+	 */
+	REP_STOS
+	movq	%r8, %rdi
+	jmp	OVERFLOW_STRCPY
+END(STRNCPY)
+#endif
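
A note on the many L(zfill*) paths above: strncpy must pad the
destination with NULs out to n once the source string ends, and that
padding is where most of the branching comes from.  The semantics being
matched, as a minimal C reference (strncpy_ref is just an illustrative
name, not part of the patch):

#include <stddef.h>

char *
strncpy_ref (char *dst, const char *src, size_t n)
{
  size_t i = 0;

  for (; i < n && src[i] != '\0'; i++)	/* the copy paths */
    dst[i] = src[i];
  for (; i < n; i++)			/* the L(zfill*) paths */
    dst[i] = '\0';
  return dst;
}
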
diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
new file mode 100644
index 0000000000..d5ff4cbe50
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
@@ -0,0 +1,65 @@
+#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
+#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
+
+#if defined USE_MULTIARCH && IS_IN(libc)
+#  define UNDERSCORES __
+#  ifdef USE_WITH_SSE2
+#    define ISA_EXT _sse2
+#  elif defined USE_WITH_AVX
+#    ifdef USE_WITH_RTM
+#      define ISA_EXT _avx_rtm
+#    else
+#      define ISA_EXT _avx
+#    endif
+#  elif defined USE_WITH_AVX2
+#    ifdef USE_WITH_RTM
+#      define ISA_EXT _avx2_rtm
+#    else
+#      define ISA_EXT _avx2
+#    endif
+
+#  elif defined USE_WITH_EVEX256
+#    define ISA_EXT _evex
+#  elif defined USE_WITH_EVEX512
+#    define ISA_EXT _evex512
+#  endif
+#else
+#  define UNDERSCORES
+#  define ISA_EXT
+#endif
+
+#ifdef USE_AS_WCSCPY
+#  define STRCPY_PREFIX wc
+#  define STRCAT_PREFIX wcs
+#  ifdef USE_AS_STPCPY
+#    define STRCPY_POSTFIX pcpy
+#  else
+#    define STRCPY_POSTFIX scpy
+#  endif
+#else
+#  define STRCPY_PREFIX st
+#  define STRCAT_PREFIX str
+#  ifdef USE_AS_STPCPY
+#    define STRCPY_POSTFIX pcpy
+#  else
+#    define STRCPY_POSTFIX rcpy
+#  endif
+#endif
+#define STRCAT_POSTFIX cat
+
+#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
+  underscores##prefix##postfix##ext
+
+#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
+
+#ifndef OVERFLOW_STRCPY
+#  define OVERFLOW_STRCPY                                                     \
+    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
+#endif
+
+#ifndef OVERFLOW_STRCAT
+#  define OVERFLOW_STRCAT                                                     \
+    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
+#endif
+
+#endif
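
To make the token pasting above easier to check, these are the names it
produces for a few representative configurations (derived by hand from
the header, assuming USE_MULTIARCH && IS_IN (libc)):

    defines (plus USE_WITH_EVEX256)      OVERFLOW_STRCPY   OVERFLOW_STRCAT
    (none)                               __strcpy_evex     __strcat_evex
    USE_AS_STPCPY                        __stpcpy_evex     __strcat_evex
    USE_AS_WCSCPY                        __wcscpy_evex     __wcscat_evex
    USE_AS_WCSCPY + USE_AS_STPCPY        __wcpcpy_evex     __wcscat_evex

Outside of a multiarch build UNDERSCORES and ISA_EXT are both empty, so
the same macros collapse to the plain strcpy/stpcpy/wcscpy/wcpcpy and
strcat/wcscat names.
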
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v1 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions
  2022-11-03  8:53 [PATCH v1 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json Noah Goldstein
  2022-11-03  8:53 ` [PATCH v1 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
@ 2022-11-03  8:53 ` Noah Goldstein
  2022-11-03  8:55   ` Noah Goldstein
  2022-11-03  8:53 ` [PATCH v1 4/4] x86: Add optimized functions for the wide-character strcpy family Noah Goldstein
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 42+ messages in thread
From: Noah Goldstein @ 2022-11-03  8:53 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
    1. Use more overlapping stores to avoid branches.
    2. Reduce how unrolled the aligning copies are (this is more of a
       code-size save; it's a negative for some sizes in terms of
       perf).
    3. For st{r|p}n{cat|cpy} re-order the branches to minimize the
       number that are taken.

Performance Changes:

    Times are from N = 10 runs of the benchmark suite and are
    reported as geometric mean of all ratios of
    New Implementation / Old Implementation.

    strcat-avx2      -> 0.998
    strcpy-avx2      -> 0.937
    stpcpy-avx2      -> 0.971

    strncpy-avx2     -> 0.793
    stpncpy-avx2     -> 0.775

    strncat-avx2     -> 0.993

Code Size Changes:
    function         -> Bytes New / Bytes Old -> Ratio

    strcat-avx2      -> 685  / 1639 -> 0.418
    strcpy-avx2      -> 560  / 903  -> 0.620
    stpcpy-avx2      -> 592  / 939  -> 0.630

    strncpy-avx2     -> 1176 / 2390 -> 0.492
    stpncpy-avx2     -> 1268 / 2438 -> 0.520

    strncat-avx2     -> 981  / 2563 -> 0.383

Notes:
    Because of the significant differences between the
    implementations, they are split into three files.

    strcpy-avx2.S    -> strcpy, stpcpy, strcat
    strncpy-avx2.S   -> strncpy
    strncat-avx2.S   -> strncat

    I couldn't find a way to merge them without making the ifdefs
    incredibly difficult to follow.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
Results attached.
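
As a quick illustration of optimization (1), "overlapping stores to
avoid branches", here is a scalar sketch of the idea for an 8..16 byte
copy (copy_8_to_16 is a made-up name, not something from the patch):

#include <stdint.h>
#include <string.h>

/* Copy len bytes where 8 <= len <= 16: one 8-byte load/store from each
   end of the buffer.  The two stores overlap whenever len < 16, so no
   branch on the exact length is needed.  */
static void
copy_8_to_16 (char *dst, const char *src, size_t len)
{
  uint64_t head, tail;

  memcpy (&head, src, 8);
  memcpy (&tail, src + len - 8, 8);
  memcpy (dst, &head, 8);
  memcpy (dst + len - 8, &tail, 8);
}

The vector code applies the same trick with wider XMM/YMM chunks, as in
the L(copy_8_15)/L(copy_16_31) paths of the evex versions earlier in
this series.
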

 sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S    |    6 +-
 sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S   |    7 +-
 sysdeps/x86_64/multiarch/stpncpy-avx2.S       |    5 +-
 sysdeps/x86_64/multiarch/strcat-avx2-rtm.S    |   13 +-
 sysdeps/x86_64/multiarch/strcat-avx2.S        |  268 +---
 sysdeps/x86_64/multiarch/strcat-strlen-avx2.S |   76 +
 sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S    |   13 +-
 sysdeps/x86_64/multiarch/strcpy-avx2.S        | 1236 +++++------------
 sysdeps/x86_64/multiarch/strncat-avx2-rtm.S   |    6 +-
 sysdeps/x86_64/multiarch/strncat-avx2.S       |  477 ++++++-
 sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S   |    6 +-
 sysdeps/x86_64/multiarch/strncpy-avx2.S       |  743 +++++++++-
 sysdeps/x86_64/multiarch/x86-avx-vecs.h       |    5 +-
 sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h  |   26 +
 sysdeps/x86_64/multiarch/x86-avx2-vecs.h      |   27 +
 15 files changed, 1680 insertions(+), 1234 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-vecs.h

diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
index 2b9c07a59f..189a288053 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPCPY	__stpcpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "stpcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
index 60a2ccfe53..1b252985e7 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
@@ -1,4 +1,3 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPNCPY	__stpncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "stpncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
index b2f8c19143..a46a8edbe2 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
@@ -3,6 +3,5 @@
 #endif
 
 #define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY	STPNCPY
-#include "strcpy-avx2.S"
+#define STRNCPY	STPNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
index 637fb557c4..94d51d10bd 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
@@ -1,12 +1,3 @@
-#ifndef STRCAT
-# define STRCAT __strcat_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCAT	__strcat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
 #include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
index d9b7fb2a43..3f914fa342 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
@@ -16,266 +16,10 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (3)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-#  define STRCAT  __strcat_avx2
-# endif
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE	32
-
-# ifndef SECTION
-#  define SECTION(p)	p##.avx
-# endif
-
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCAT)
-	mov	%rdi, %r9
-# ifdef USE_AS_STRNCAT
-	mov	%rdx, %r8
-# endif
-
-	xor	%eax, %eax
-	mov	%edi, %ecx
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	vpxor	%xmm6, %xmm6, %xmm6
-	cmp	$(VEC_SIZE * 3), %ecx
-	ja	L(fourth_vector_boundary)
-	vpcmpeqb (%rdi), %ymm6, %ymm0
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_first_vector)
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	jmp	L(align_vec_size_start)
-L(fourth_vector_boundary):
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	vpcmpeqb	(%rax), %ymm6, %ymm0
-	mov	$-1, %r10d
-	sub	%rax, %rcx
-	shl	%cl, %r10d
-	vpmovmskb %ymm0, %edx
-	and	%r10d, %edx
-	jnz	L(exit)
-
-L(align_vec_size_start):
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 5), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	add	$VEC_SIZE, %rax
-
-	.p2align 4
-L(align_four_vec_loop):
-	vmovaps	(%rax),	%ymm4
-	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
-	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
-	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
-	add	$(VEC_SIZE * 4),	%rax
-	vpminub	%ymm4,	%ymm5, %ymm5
-	vpcmpeqb %ymm5,	%ymm6, %ymm5
-	vpmovmskb %ymm5,	%edx
-	test	%edx,	%edx
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
-	sub	$(VEC_SIZE * 5),	%rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit):
-	sub	%rdi, %rax
-L(exit_null_on_first_vector):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_second_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$VEC_SIZE, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_third_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 2), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fourth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 3), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fifth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-
-	.p2align 4
-L(StartStrcpyPart):
-	lea	(%r9, %rax), %rdi
-	mov	%rsi, %rcx
-	mov	%r9, %rax      /* save result */
-
-# ifdef USE_AS_STRNCAT
-	test	%r8, %r8
-	jz	L(ExitZero)
-#  define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-avx2.S"
+#ifndef STRCAT
+# define STRCAT	__strcat_avx2
 #endif
+
+#define USE_AS_STRCAT
+#define STRCPY	STRCAT
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
new file mode 100644
index 0000000000..e0fc286826
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
@@ -0,0 +1,76 @@
+    /* Simple strlen implementation that ends at L(strcat_strlen_done).  */
+	movq	%rdi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	VEC_SIZE(%r8), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	leaq	(VEC_SIZE)(%r8), %rdi
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v1)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v2)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v3)
+
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	.p2align 4,, 8
+L(strlen_loop_4x_vec):
+	VMOVA	(VEC_SIZE * 0 + 1)(%rdi), %VMM(0)
+	VPMIN	(VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rdi), %VMM(2)
+	VPMIN	(VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3)
+	VPMIN	%VMM(1), %VMM(3), %VMM(3)
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(3)
+	vpmovmskb %VMM(3), %r8d
+	subq	$(VEC_SIZE * -4), %rdi
+	testl	%r8d, %r8d
+	jz	L(strlen_loop_4x_vec)
+
+	addq	$(VEC_SIZE * -4 + 1), %rdi
+
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(1)
+	vpmovmskb %VMM(1), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v1)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(2)
+	vpmovmskb %VMM(2), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v2)
+
+	movl	%r8d, %ecx
+L(bsf_and_done_v3):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v2):
+	bsfl	%ecx, %ecx
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rdi
+	jmp	L(strcat_strlen_done)
+
+	.p2align 4,, 4
+L(bsf_and_done_v1):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v0):
+	bsfl	%ecx, %ecx
+	addq	%rcx, %rdi
+L(strcat_strlen_done):
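
The fragment above is pulled in with a plain #include from strcpy-avx2.S (when USE_AS_STRCAT is defined) and from strncat-avx2.S, and falls through to L(strcat_strlen_done) with %rdi pointing at the destination's null terminator.  A hedged C sketch of the same idea, using AVX2 intrinsics and scanning one vector per iteration instead of the unrolled 4x vpminub loop (the function name is illustrative):

#include <immintrin.h>
#include <stdint.h>

/* Align down to a 32-byte boundary, mask off the bytes before DST,
   then scan one aligned vector at a time.  Aligned 32-byte loads never
   cross a page boundary, so reading before/after DST is safe.  */
static char *
strcat_find_end (char *dst)
{
  const __m256i zero = _mm256_setzero_si256 ();
  char *p = (char *) ((uintptr_t) dst & ~(uintptr_t) 31);
  unsigned int mask = (unsigned int) _mm256_movemask_epi8 (
      _mm256_cmpeq_epi8 (_mm256_load_si256 ((const __m256i *) p), zero));
  /* Drop null matches that precede DST (the shrx above).  */
  mask >>= (uintptr_t) dst & 31;
  if (mask != 0)
    return dst + __builtin_ctz (mask);
  for (;;)
    {
      p += 32;
      mask = (unsigned int) _mm256_movemask_epi8 (
          _mm256_cmpeq_epi8 (_mm256_load_si256 ((const __m256i *) p), zero));
      if (mask != 0)
        return p + __builtin_ctz (mask);
    }
}
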
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
index c2c581ecf7..fe80ffd265 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
@@ -1,12 +1,3 @@
-#ifndef STRCPY
-# define STRCPY __strcpy_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCPY	__strcpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
 #include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index c725834929..b87a1722d5 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -20,984 +20,378 @@
 
 #if ISA_SHOULD_BUILD (3)
 
+# include <sysdep.h>
 
-# ifndef USE_AS_STRCAT
-#  include <sysdep.h>
-
-#  ifndef STRCPY
-#   define STRCPY  __strcpy_avx2
-#  endif
-
-# endif
-
-/* Number of bytes in a vector register */
 # ifndef VEC_SIZE
-#  define VEC_SIZE	32
-# endif
-
-# ifndef VZEROUPPER
-#  define VZEROUPPER	vzeroupper
-# endif
-
-# ifndef SECTION
-#  define SECTION(p)	p##.avx
-# endif
-
-/* zero register */
-#define xmmZ	xmm0
-#define ymmZ	ymm0
-
-/* mask register */
-#define ymmM	ymm1
-
-# ifndef USE_AS_STRCAT
-
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCPY)
-#  ifdef USE_AS_STRNCPY
-	mov	%RDX_LP, %R8_LP
-	test	%R8_LP, %R8_LP
-	jz	L(ExitZero)
-#  endif
-	mov	%rsi, %rcx
-#  ifndef USE_AS_STPCPY
-	mov	%rdi, %rax      /* save result */
-#  endif
-
+#  include "x86-avx2-vecs.h"
 # endif
 
-	vpxor	%xmmZ, %xmmZ, %xmmZ
-
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	cmp	$(VEC_SIZE * 2), %ecx
-	jbe	L(SourceStringAlignmentLessTwoVecSize)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-
-	vpcmpeqb (%rsi), %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	shr	%cl, %rdx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	mov	$VEC_SIZE, %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  else
-	mov	$(VEC_SIZE + 1), %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  endif
-	jbe	L(CopyVecSizeTailCase2OrCase3)
+# ifndef STRCPY
+#  define STRCPY	__strcpy_avx2
 # endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail)
 
-	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
-	vpmovmskb %ymm2, %edx
+	/* Use movsb in page cross case to save code size.  */
+# define USE_MOVSB_IN_PAGE_CROSS	1
 
-# ifdef USE_AS_STRNCPY
-	add	$VEC_SIZE, %r10
-	cmp	%r10, %r8
-	jbe	L(CopyTwoVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize)
-
-	vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
-	vmovdqu %ymm2, (%rdi)
-
-/* If source address alignment != destination address alignment */
-	.p2align 4
-L(UnalignVecSizeBoth):
-	sub	%rcx, %rdi
-# ifdef USE_AS_STRNCPY
-	add	%rcx, %r8
-	sbb	%rcx, %rcx
-	or	%rcx, %r8
-# endif
-	mov	$VEC_SIZE, %rcx
-	vmovdqa (%rsi, %rcx), %ymm2
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 3), %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_WCSCPY
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
-	jnz	L(CopyVecSize)
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define PAGE_SIZE	4096
 
-	vmovdqu %ymm3, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
+# ifdef USE_AS_STPCPY
+#  define END_REG	rax
 # else
-	jnz	L(CopyVecSize)
+#  define END_REG	rdi, %rdx
 # endif
 
-	vmovdqu %ymm4, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STRCAT
+#  define PAGE_ALIGN_REG	ecx
 # else
-	jnz	L(CopyVecSize)
+#  define PAGE_ALIGN_REG	eax
 # endif
 
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
 
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
 
-	vmovdqu %ymm3, (%rdi, %rcx)
-	mov	%rsi, %rdx
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	and	$-(VEC_SIZE * 4), %rsi
-	sub	%rsi, %rdx
-	sub	%rdx, %rdi
-# ifdef USE_AS_STRNCPY
-	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
-	vmovdqa (%rsi), %ymm4
-	vmovdqa VEC_SIZE(%rsi), %ymm5
-	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
-	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
-	vpminub %ymm5, %ymm4, %ymm2
-	vpminub %ymm7, %ymm6, %ymm3
-	vpminub %ymm2, %ymm3, %ymm3
-	vpcmpeqb %ymmM, %ymm3, %ymm3
-	vpmovmskb %ymm3, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
-	add	$(VEC_SIZE * 4), %rdi
-	add	$(VEC_SIZE * 4), %rsi
-	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
-	vmovdqa (%rsi), %ymm4
-	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
-	vmovdqa VEC_SIZE(%rsi), %ymm5
-	vpminub %ymm5, %ymm4, %ymm2
-	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
-	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
-	vmovdqu %ymm7, -VEC_SIZE(%rdi)
-	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
-	vpminub %ymm7, %ymm6, %ymm3
-	vpminub %ymm2, %ymm3, %ymm3
-	vpcmpeqb %ymmM, %ymm3, %ymm3
-	vpmovmskb %ymm3, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jz	L(UnalignedFourVecSizeLoop_start)
-
-L(UnalignedFourVecSizeLeave):
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_0)
-
-	vpcmpeqb %ymm5, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %ecx
-	test	%ecx, %ecx
-	jnz	L(CopyVecSizeUnaligned_16)
-
-	vpcmpeqb %ymm6, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_32)
-
-	vpcmpeqb %ymm7, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %ecx
-	bsf	%ecx, %edx
-	vmovdqu %ymm4, (%rdi)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 3), %rsi
-	add	$(VEC_SIZE * 3), %rdi
-	jmp	L(CopyVecSizeExit)
+# ifdef USE_AS_STRCAT
+	movq	%rdi, %rax
+#  include "strcat-strlen-avx2.S"
 # endif
 
-/* If source address alignment == destination address alignment */
-
-L(SourceStringAlignmentLessTwoVecSize):
-	vmovdqu (%rsi), %ymm3
-	vmovdqu VEC_SIZE(%rsi), %ymm2
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$VEC_SIZE, %r8
-#  else
-	cmp	$(VEC_SIZE + 1), %r8
-#  endif
-	jbe	L(CopyVecSizeTail1Case2OrCase3)
+	movl	%esi, %PAGE_ALIGN_REG
+	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
+	ja	L(page_cross)
+L(page_cross_continue):
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 # endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail1)
-
-	vmovdqu %ymm3, (%rdi)
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$(VEC_SIZE * 2), %r8
-#  else
-	cmp	$((VEC_SIZE * 2) + 1), %r8
-#  endif
-	jbe	L(CopyTwoVecSize1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize1)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-	jmp	L(UnalignVecSizeBoth)
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
 
-/*------End of main part with loops---------------------*/
+	testl	%ecx, %ecx
+	jz	L(more_1x_vec)
 
-/* Case1 */
+	/* No longer need the ymm registers, so vzeroupper now so that it
+	   doesn't need to be duplicated at each return statement.  */
+	COND_VZEROUPPER
 
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
-	.p2align 4
-L(CopyVecSize):
-	add	%rcx, %rdi
-# endif
-L(CopyVecSizeTail):
-	add	%rcx, %rsi
-L(CopyVecSizeTail1):
-	bsf	%edx, %edx
-L(CopyVecSizeExit):
-	cmp	$32, %edx
-	jae	L(Exit32_63)
-	cmp	$16, %edx
-	jae	L(Exit16_31)
-	cmp	$8, %edx
-	jae	L(Exit8_15)
-	cmp	$4, %edx
-	jae	L(Exit4_7)
-	cmp	$3, %edx
-	je	L(Exit3)
-	cmp	$1, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	movb	$0, (%rdi)
+	xorl	%edx, %edx
+	bsfl	%ecx, %edx
 # ifdef USE_AS_STPCPY
-	lea	(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$1, %r8
-	lea	1(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
-
-	.p2align 4
-L(CopyTwoVecSize1):
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$VEC_SIZE, %r8
-# endif
-	jmp	L(CopyVecSizeTail1)
-
-	.p2align 4
-L(CopyTwoVecSize):
-	bsf	%edx, %edx
-	add	%rcx, %rsi
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	jmp	L(CopyVecSizeExit)
-
-	.p2align 4
-L(CopyVecSizeUnaligned_0):
-	bsf	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm4, (%rdi)
-	add	$((VEC_SIZE * 4) - 1), %r8
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	jmp	L(CopyVecSizeExit)
-# endif
-
-	.p2align 4
-L(CopyVecSizeUnaligned_16):
-	bsf	%ecx, %edx
-	vmovdqu %ymm4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	VEC_SIZE(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	add	$((VEC_SIZE * 3) - 1), %r8
-	sub	%rdx, %r8
-	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
+	leaq	(%rdi, %rdx), %rax
+# endif
+
+	/* Use mask bits in rcx to detect which copy we need. If the low
+	   mask is zero then there must be a bit set in the upper half.
+	   I.e if ecx != 0 and cx == 0, then match must be upper 16
+	   bits so we use L(copy_16_31).  */
+	testw	%cx, %cx
+	jz	L(copy_16_31)
+
+	testb	%cl, %cl
+	jz	L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+	vmovd	%xmm0, (%rdi)
+	movl	$0, (%END_REG)
+	ret
 # else
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
-
-	.p2align 4
-L(CopyVecSizeUnaligned_32):
-	bsf	%edx, %edx
-	vmovdqu %ymm4, (%rdi)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	add	$((VEC_SIZE * 2) - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
+	testb	$0x7, %cl
+	jz	L(copy_4_7)
+
+	testl	%edx, %edx
+	jz	L(set_null_term)
+	vmovd	%xmm0, %ecx
+	movw	%cx, (%rdi)
+
+	.p2align 4,, 2
+L(set_null_term):
+	movb	$0, (%END_REG)
+	ret
+
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-3(%rsi, %rdx), %ecx
+	vmovd	%xmm0, (%rdi)
+	movl	%ecx, -3(%END_REG)
+	ret
+# endif
+
+	.p2align 4,, 10
+L(copy_16_31):
+	VMOVU	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
+	ret
+
+	.p2align 4,, 10
+L(copy_8_15):
+# ifdef USE_AS_WCSCPY
+	movl	-(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
 # else
-	add	$(VEC_SIZE * 2), %rsi
-	add	$(VEC_SIZE * 2), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
-
-# ifdef USE_AS_STRNCPY
-#  ifndef USE_AS_STRCAT
-	.p2align 4
-L(CopyVecSizeUnalignedVec6):
-	vmovdqu %ymm6, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec5):
-	vmovdqu %ymm5, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec4):
-	vmovdqu %ymm4, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec3):
-	vmovdqu %ymm3, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-#  endif
-
-/* Case2 */
-
-	.p2align 4
-L(CopyVecSizeCase2):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTailCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTail1Case2):
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-/* Case2 or Case3,  Case3 */
-
-	.p2align 4
-L(CopyVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeCase2)
-L(CopyVecSizeCase3):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyTwoVecSizeCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyVecSizeTailCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTailCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSize1Case2OrCase3):
-	add	$VEC_SIZE, %rdi
-	add	$VEC_SIZE, %rsi
-	sub	$VEC_SIZE, %r8
-L(CopyVecSizeTail1Case2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTail1Case2)
-	jmp	L(StrncpyExit)
-# endif
-
-/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
-
-	.p2align 4
-L(Exit1):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$2, %r8
-	lea	2(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit2):
-	movzwl	(%rsi), %ecx
-	mov	%cx, (%rdi)
-	movb	$0, 2(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$3, %r8
-	lea	3(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit3):
-	mov	(%rsi), %edx
-	mov	%edx, (%rdi)
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
+# endif
+	vmovq	%xmm0, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
+	ret
+
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rdi)
+# endif
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	addq	%rsi, %rdi
+	VMOVA	1(%rsi), %VMM(1)
+
+	/* Try and order stores after as many loads as is reasonable to
+	   avoid potential false dependencies.  */
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rax)
+# endif
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE + 1)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), 1(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE + 1)(%rdi)
+
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %edx
+	testl	%edx, %edx
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 3 + 1)(%rdi)
+
+	/* Subtract rsi from rdi before aligning. Adding back rsi will
+	   get proper rdi (dst) for new src.  */
+	subq	%rsi, %rdi
+	incq	%rsi
+	orq	$(VEC_SIZE * 4 - 1), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %edx
+	addq	%rsi, %rdi
+
+	testl	%edx, %edx
+	jnz	L(loop_4x_done)
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 1)(%rdi)
+
+
+	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %edx
+	subq	$(VEC_SIZE * -4), %rdi
+	testl	%edx, %edx
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
+L(ret_vec_x4):
+	bsfl	%edx, %edx
+	VMOVU	((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
 # ifdef USE_AS_STPCPY
-	lea	3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$4, %r8
-	lea	4(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
 # endif
+L(return_end):
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(Exit4_7):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	-3(%rsi, %rdx), %ecx
-	mov	%ecx, -3(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x1):
+	bsfl	%ecx, %ecx
+	VMOVU	(1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	1(%rcx, %rdi), %rax
 # endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit8_15):
-	mov	(%rsi), %rcx
-	mov	-7(%rsi, %rdx), %r9
-	mov	%rcx, (%rdi)
-	mov	%r9, -7(%rdi, %rdx)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(Exit16_31):
-	vmovdqu (%rsi), %xmm2
-	vmovdqu -15(%rsi, %rdx), %xmm3
-	vmovdqu %xmm2, (%rdi)
-	vmovdqu %xmm3, -15(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x2):
+	bsfl	%ecx, %ecx
+	VMOVU	((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub %rdx, %r8
-	sub $1, %r8
-	lea 1(%rdi, %rdx), %rdi
-	jnz L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
 # endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(Exit32_63):
-	vmovdqu (%rsi), %ymm2
-	vmovdqu -31(%rsi, %rdx), %ymm3
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, -31(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsfl	%ecx, %ecx
+	VMOVU	((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
 # endif
 	VZEROUPPER_RETURN
 
-# ifdef USE_AS_STRNCPY
 
-	.p2align 4
-L(StrncpyExit1):
-	movzbl	(%rsi), %edx
-	mov	%dl, (%rdi)
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %rcx
+	andq	$(VEC_SIZE * -1), %rcx
+
+	VPCMPEQ	(%rcx), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+# if USE_MOVSB_IN_PAGE_CROSS
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* The shift adds one char to the later bsf result, giving the
+	   correct copy bound. NB: this can never zero out a non-zero
+	   RCX: in the page cross case rsi cannot be aligned and we
+	   already right-shift rcx by the misalignment.  */
+	shll	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsfl	%ecx, %ecx
+#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
+#  endif
+	rep	movsb
 #  ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 1(%rdi)
+	leaq	-CHAR_SIZE(%rdi), %rax
 #  endif
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(StrncpyExit2):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 2(%rdi)
-#  endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(StrncpyExit3_4):
-	movzwl	(%rsi), %ecx
-	movzwl	-2(%rsi, %r8), %edx
-	mov	%cx, (%rdi)
-	mov	%dx, -2(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit5_8):
-	mov	(%rsi), %ecx
-	mov	-4(%rsi, %r8), %edx
-	mov	%ecx, (%rdi)
-	mov	%edx, -4(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit9_16):
-	mov	(%rsi), %rcx
-	mov	-8(%rsi, %r8), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit17_32):
-	vmovdqu (%rsi), %xmm2
-	vmovdqu -16(%rsi, %r8), %xmm3
-	vmovdqu %xmm2, (%rdi)
-	vmovdqu %xmm3, -16(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit33_64):
-	/*  0/32, 31/16 */
-	vmovdqu (%rsi), %ymm2
-	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit65):
-	/* 0/32, 32/32, 64/1 */
-	vmovdqu (%rsi), %ymm2
-	vmovdqu 32(%rsi), %ymm3
-	mov	64(%rsi), %cl
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, 32(%rdi)
-	mov	%cl, 64(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	65(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 65(%rdi)
-#  endif
-	VZEROUPPER_RETURN
+# else
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
 
+	/* Traditional copy case, essentially same as used in non-page-
+	   cross case but since we can't reuse VMM(0) we need twice as
+	   many loads from rsi.  */
 #  ifndef USE_AS_STRCAT
-
-	.p2align 4
-L(Fill1):
-	mov	%dl, (%rdi)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill2):
-	mov	%dx, (%rdi)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill3_4):
-	mov	%dx, (%rdi)
-	mov     %dx, -2(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill5_8):
-	mov	%edx, (%rdi)
-	mov     %edx, -4(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill9_16):
-	mov	%rdx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill17_32):
-	vmovdqu %xmmZ, (%rdi)
-	vmovdqu %xmmZ, -16(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec2):
-	vmovdqu %ymm2, (%rdi, %rcx)
-
-	.p2align 4
-L(CopyVecSizeVecExit):
-	bsf	%edx, %edx
-	add	$(VEC_SIZE - 1), %r8
-	add	%rcx, %rdi
-#   ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-#   endif
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-
-	.p2align 4
-L(StrncpyFillTailWithZero):
-	xor	%edx, %edx
-	sub	$VEC_SIZE, %r8
-	jbe	L(StrncpyFillExit)
-
-	vmovdqu %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-
-	mov	%rdi, %rsi
-	and	$(VEC_SIZE - 1), %esi
-	sub	%rsi, %rdi
-	add	%rsi, %r8
-	sub	$(VEC_SIZE * 4), %r8
-	jb	L(StrncpyFillLessFourVecSize)
-
-L(StrncpyFillLoopVmovdqa):
-	vmovdqa %ymmZ, (%rdi)
-	vmovdqa %ymmZ, VEC_SIZE(%rdi)
-	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
-	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE * 4), %rdi
-	sub	$(VEC_SIZE * 4), %r8
-	jae	L(StrncpyFillLoopVmovdqa)
-
-L(StrncpyFillLessFourVecSize):
-	add	$(VEC_SIZE * 2), %r8
-	jl	L(StrncpyFillLessTwoVecSize)
-	vmovdqa %ymmZ, (%rdi)
-	vmovdqa %ymmZ, VEC_SIZE(%rdi)
-	add	$(VEC_SIZE * 2), %rdi
-	sub	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	vmovdqa %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillLessTwoVecSize):
-	add	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	vmovdqa %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillExit):
-	add	$VEC_SIZE, %r8
-L(Fill):
-	cmp	$17, %r8d
-	jae	L(Fill17_32)
-	cmp	$9, %r8d
-	jae	L(Fill9_16)
-	cmp	$5, %r8d
-	jae	L(Fill5_8)
-	cmp	$3, %r8d
-	jae	L(Fill3_4)
-	cmp	$1, %r8d
-	ja	L(Fill2)
-	je	L(Fill1)
-	VZEROUPPER_RETURN
-
-/* end of ifndef USE_AS_STRCAT */
+	xorl	%edx, %edx
 #  endif
-
-	.p2align 4
-L(UnalignedLeaveCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(UnalignedFourVecSizeLeaveCase2)
-L(UnalignedFourVecSizeLeaveCase3):
-	lea	(VEC_SIZE * 4)(%r8), %rcx
-	and	$-VEC_SIZE, %rcx
-	add	$(VEC_SIZE * 3), %r8
-	jl	L(CopyVecSizeCase3)
-	vmovdqu %ymm4, (%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+	bsfl	%ecx, %edx
 #  ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 4)(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (VEC_SIZE * 4)(%rdi)
+	leaq	(%rdi, %rdx), %rax
+#  elif !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(UnalignedFourVecSizeLeaveCase2):
-	xor	%ecx, %ecx
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$(VEC_SIZE * 3), %r8
-	jle	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-	vpcmpeqb %ymm5, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm4, (%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec5)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
+	/* vzeroupper early to avoid duplicating at each return.  */
+	COND_VZEROUPPER
 
-	vpcmpeqb %ymm6, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec6)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
+	testw	%cx, %cx
+	jz	L(page_cross_copy_16_31)
 
-	vpcmpeqb %ymm7, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	lea	VEC_SIZE(%rdi, %rcx), %rdi
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-L(StrncpyExit):
-	cmp	$65, %r8d
-	je	L(StrncpyExit65)
-	cmp	$33, %r8d
-	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8d
-	jae	L(StrncpyExit17_32)
-	cmp	$9, %r8d
-	jae	L(StrncpyExit9_16)
-	cmp	$5, %r8d
-	jae	L(StrncpyExit5_8)
-	cmp	$3, %r8d
-	jae	L(StrncpyExit3_4)
-	cmp	$1, %r8d
-	ja	L(StrncpyExit2)
-	je	L(StrncpyExit1)
-#  ifdef USE_AS_STPCPY
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(ExitZero):
-#  ifndef USE_AS_STRCAT
-	mov	%rdi, %rax
-#  endif
-	VZEROUPPER_RETURN
+	testb	%cl, %cl
+	jz	L(page_cross_copy_8_15)
 
-# endif
+	testl	$0x7, %cl
+	jz	L(page_cross_copy_4_7)
 
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
-# endif
+	testl	%edx, %edx
+	jz	L(page_cross_set_null_term)
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+L(page_cross_set_null_term):
+	movb	$0, (%END_REG)
+	ret
+
+	.p2align 4,, 4
+L(page_cross_copy_4_7):
+	movl	(%rsi), %ecx
+	movl	-3(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, -3(%END_REG)
+	ret
+
+	.p2align 4,, 4
+L(page_cross_copy_8_15):
+	movq	(%rsi), %rcx
+	movq	-7(%rsi, %rdx), %rsi
+	movq	%rcx, (%rdi)
+	movq	%rsi, -7(%END_REG)
+	ret
+
+
+	.p2align 4,, 3
+L(page_cross_copy_16_31):
+	VMOVU	(%rsi), %xmm0
+	VMOVU	-15(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -15(%END_REG)
+	ret
+# endif
+
+END(STRCPY)
 #endif
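
To make the new control flow easier to review: when the null byte is found in the first vector, the copy is dispatched on the compare mask (testw %cx, testb %cl, testb $0x7, %cl) and performed with two fixed-size, possibly overlapping moves, one anchored at the start and one at the end, so the terminator is copied without a byte loop.  A hedged C model of that dispatch (names are illustrative; LEN is the bsf of the mask, i.e. the index of the null byte):

#include <string.h>
#include <stddef.h>

static void
copy_small (char *dst, const char *src, size_t len)
{
  if (len >= 16)                /* null in bytes [16, 31] */
    {
      memcpy (dst, src, 16);
      memcpy (dst + len - 15, src + len - 15, 16);
    }
  else if (len >= 8)            /* null in bytes [8, 15] */
    {
      memcpy (dst, src, 8);
      memcpy (dst + len - 7, src + len - 7, 8);
    }
  else if (len >= 3)            /* null in bytes [3, 7] */
    {
      memcpy (dst, src, 4);
      memcpy (dst + len - 3, src + len - 3, 4);
    }
  else                          /* null in bytes [0, 2] */
    {
      if (len != 0)
        memcpy (dst, src, 2);
      dst[len] = '\0';
    }
}
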
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
index 0dcea18dbb..2bbdbb91ab 100644
--- a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_avx2_rtm
-#include "strcat-avx2-rtm.S"
+#define STRNCAT	__strncat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "strncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
index 52ecbca943..99d094af63 100644
--- a/sysdeps/x86_64/multiarch/strncat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
@@ -1,7 +1,472 @@
-#ifndef STRNCAT
-# define STRNCAT	__strncat_avx2
-#endif
+/* strncat with AVX2
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-avx2-vecs.h"
+# endif
+
+# ifndef STRNCAT
+#  define STRNCAT	__strncat_avx2
+# endif
+
+
+	/* Use movsb in page cross case to save code size.  */
+# define USE_MOVSB_IN_PAGE_CROSS	1
+
+# ifdef USE_AS_WCSCPY
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+	/* Filter zero length strings and very long strings.  Zero
+	   length strings just return; very long strings are handled by
+	   using the non-length variant {wcs|str}cat.  */
+# ifdef USE_AS_WCSCPY
+	decq	%rdx
+	movq	%rdx, %rcx
+	shr	$56, %rcx
+	jnz	L(zero_len)
+	salq	$2, %rdx
+# else
+	decq	%rdx
+	/* `dec` can macrofuse with `jl`. If the branch needs to become
+	   `jb`, replace `dec` with `sub`.  */
+	jl	L(zero_len)
+# endif
+	movq	%rdi, %rax
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
+
+# include "strcat-strlen-avx2.S"
+
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	cmpq	$VEC_SIZE, %rdx
+	/* `jb` because length rdx is now length - 1.  */
+	jb	L(less_1x_vec)
+
+	testl	%ecx, %ecx
+	jz	L(more_1x_vec)
+
+	/* Hoist this to save code size.  */
+	COND_VZEROUPPER
+	bsfl	%ecx, %edx
+
+	/* Use mask bits in rcx to detect which copy we need. If the low
+	   mask is zero then there must be a bit set in the upper half.
+	   I.e if ecx != 0 and cx == 0, then match must be upper 16
+	   bits so we use L(copy_16_31).  */
+	testw	%cx, %cx
+	jz	L(copy_16_31)
+
+	.p2align 4,, 2
+L(copy_less_16):
+	testb	%cl, %cl
+	jz	L(copy_8_15)
+
+# ifndef USE_AS_WCSCPY
+	testb	$0x7, %cl
+	jz	L(copy_4_7)
+
+	vmovd	%xmm0, %ecx
+	testl	%edx, %edx
+	jz	L(set_null_term)
+	movw	%cx, (%rdi)
+	movzbl	(%rsi, %rdx), %ecx
+L(set_null_term):
+	movb	%cl, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 11
+L(copy_4_7):
+# endif
+	movl	-(4 - CHAR_SIZE)(%rsi, %rdx), %ecx
+	vmovd	%xmm0, (%rdi)
+	movl	%ecx, -(4 - CHAR_SIZE)(%rdi, %rdx)
+	ret
+
+
+	.p2align 4,, 4
+L(less_1x_vec):
+	btsl	%edx, %ecx
+	COND_VZEROUPPER
+	/* edx already a dependency.  */
+	bsfl	%ecx, %edx
+	testw	%cx, %cx
+	jnz	L(copy_less_16)
+
+	.p2align 4,, 10
+L(copy_16_31):
+	VMOVU	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 10
+L(copy_8_15):
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
+	vmovq	%xmm0, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+	.p2align 6,, 14
+L(more_1x_vec):
+	VMOVU	%VMM(0), (%rdi)
+
+	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	incq	%rsi
+	addq	%rsi, %rdi
+L(loop_last_4x_vec):
+	subq	%rsi, %rdx
+L(last_4x_vec):
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jae	L(more_2x_vec)
+L(last_2x_vec):
+	tzcnt	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jb	L(ret_vec_x1_len)
+
+	cmpl	$VEC_SIZE, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE + 0)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), 0(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+L(ret_vec_x2_len):
+	/* btsl will automatically mask lower VEC_SIZE - 1 bits from
+	   edx.  */
+	btsl	%edx, %ecx
+	.p2align 4,, 3
+L(ret_vec_x2):
+	bsfl	%ecx, %ecx
+	VMOVU	((VEC_SIZE + 0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+
+	.p2align 4,, 12
+L(ret_vec_x1_len):
+	movl	%edx, %ecx
+L(ret_vec_x1):
+	VMOVU	(0 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), (0 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(ret_vec_x3_len):
+	btsl	%edx, %ecx
+L(ret_vec_x3):
+	bsfl	%ecx, %ecx
+	VMOVU	((VEC_SIZE * 2 + 0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 2 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+	.p2align 6,, 14
+L(more_2x_vec):
+	/* L(ret_vec_x1) expects ecx to have position of first match so
+	   test with bsf.  */
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE + 0)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), 0(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE + 0)(%rdi)
+
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
 
-#define USE_AS_STRNCAT
-#define STRCAT	STRNCAT
-#include "strcat-avx2.S"
+	/* Check if length is greater than 4x VEC.  */
+	addq	$(VEC_SIZE * -4), %rdx
+	jbe	L(more_4x_vec)
+
+	/* Check if length was between [VEC_SIZE * 2 + 1, VEC_SIZE * 3].
+	 */
+	cmpl	$((VEC_SIZE * 3 - 1)-(VEC_SIZE * 4)), %edx
+	jle	L(ret_vec_x3_len)
+
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+L(ret_vec_x4_len):
+	btsl	%edx, %ecx
+L(ret_vec_x4):
+	bsfl	%ecx, %ecx
+	VMOVU	((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
+	VZEROUPPER_RETURN
+
+
+
+
+	.p2align 4,, 8
+	.p2align 6,, 10
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 3 + 0)(%rdi)
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	/* Recheck length before aligning.  */
+	cmpq	$(VEC_SIZE * 4 - 1), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %r8d
+	addq	%rsi, %rdi
+	testl	%r8d, %r8d
+	jnz	L(loop_4x_done)
+
+	/* Use r9 for end of region before handling last 4x VEC
+	   specially.  */
+	leaq	-(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %r8d
+
+	testl	%r8d, %r8d
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	/* L(ret_vec_x1) expects ecx to have position of first match so
+	   test with bsf.  */
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	bsfl	%r8d, %r8d
+	VMOVU	((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %r8), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %r8)
+L(return_end):
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+
+	VPCMPEQ	(%r8), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+
+	subl	%esi, %r8d
+	andl	$(VEC_SIZE - 1), %r8d
+	cmpq	%r8, %rdx
+	jb	L(page_cross_small)
+# if USE_MOVSB_IN_PAGE_CROSS
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* The shift adds one char to the later bsf result, giving the
+	   correct copy bound. NB: this can never zero out a non-zero
+	   RCX: in the page cross case rsi cannot be aligned and we
+	   already right-shift rcx by the misalignment.  */
+	shll	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsfl	%ecx, %ecx
+	rep	movsb
+	VZEROUPPER_RETURN
+
+L(page_cross_small):
+	tzcntl	%ecx, %ecx
+	cmpq	%rdx, %rcx
+	cmova	%edx, %ecx
+#  ifdef USE_AS_WCSCPY
+	addl	$CHAR_SIZE, %ecx
+#  else
+	incl	%ecx
+#  endif
+	rep	movsb
+	VZEROUPPER_RETURN
+
+# else
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	xorl	%edx, %edx
+
+
+	/* Traditional copy case, essentially same as used in non-page-
+	   cross case but since we can't reuse VMM(0) we need twice as
+	   many loads from rsi.  */
+	bsfl	%ecx, %edx
+	COND_VZEROUPPER
+
+	testw	%cx, %cx
+	jz	L(page_cross_copy_16_31)
+
+	testb	%cl, %cl
+	jz	L(page_cross_copy_8_15)
+
+	testb	$0x7, %cl
+	jz	L(page_cross_copy_4_7)
+
+	testl	%edx, %edx
+	jz	L(page_cross_set_null_term)
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+L(page_cross_set_null_term):
+	movb	$0, (%rdi, %rdx)
+	ret
+
+
+	.p2align 4,, 4
+L(page_cross_copy_4_7):
+	movl	(%rsi), %ecx
+	movl	-3(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, -3(%rdi, %rdx)
+	ret
+	.p2align 4,, 4
+L(page_cross_copy_8_15):
+	movq	(%rsi), %rcx
+	movq	-7(%rsi, %rdx), %rsi
+	movq	%rcx, (%rdi)
+	movq	%rsi, -7(%rdi, %rdx)
+	ret
+
+
+	.p2align 4,, 3
+L(page_cross_copy_16_31):
+	VMOVU	(%rsi), %xmm0
+	VMOVU	-15(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -15(%rdi, %rdx)
+	ret
+
+# endif
+
+L(zero_len):
+	incq	%rdx
+	jnz	OVERFLOW_STRCAT
+	movq	%rdi, %rax
+	ret
+
+
+END(STRNCAT)
+#endif
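
The zero-length/overflow filter at the top of STRNCAT relies on `dec`+`jl`: n == 0 and absurdly large n both reach L(zero_len), where `inc`+`jnz` separates the two cases.  A hedged C model of that filtering (strcat stands in for the OVERFLOW_STRCAT alias from strncpy-or-cat-overflow-def.h; the function name is illustrative):

#include <string.h>
#include <stddef.h>
#include <stdint.h>

char *
strncat_model (char *dst, const char *src, size_t n)
{
  if (n == 0)                          /* decq; jl; incq; jz -> return  */
    return dst;
  if (n - 1 > (SIZE_MAX >> 1))         /* cannot describe a real buffer  */
    return strcat (dst, src);          /* stands in for OVERFLOW_STRCAT  */
  size_t dlen = strlen (dst);
  size_t i;
  for (i = 0; i < n && src[i] != '\0'; i++)
    dst[dlen + i] = src[i];
  dst[dlen + i] = '\0';
  return dst;
}
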
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
index 79e7083299..b582a4a7a1 100644
--- a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STRNCPY	__strncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
index ce634e94fa..dfdde74751 100644
--- a/sysdeps/x86_64/multiarch/strncpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
@@ -1,7 +1,738 @@
-#ifndef STRNCPY
-# define STRNCPY	__strncpy_avx2
-#endif
+/* strncpy with AVX2
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+
+# ifndef VEC_SIZE
+#  include "x86-avx2-vecs.h"
+# endif
+
+# ifndef STRNCPY
+#  define STRNCPY	__strncpy_avx2
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+	/* Filter zero length strings and very long strings.  Zero
+	   length strings just return; very long strings are handled by
+	   running rep stos{b|l} to zero-fill the destination (which will
+	   almost certainly segfault) and, if that somehow succeeds, by
+	   calling OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
+# ifdef USE_AS_WCSCPY
+	decq	%rdx
+	movq	%rdx, %rax
+	/* 2^56 is the upper bound of the supported address space.  */
+	shr	$56, %rax
+	jnz	L(zero_len)
+	salq	$2, %rdx
+# else
+	decq	%rdx
+	/* `dec` can macrofuse with `jl`. If the branch needs to become
+	   `jb`, replace `dec` with `sub`.  */
+	jl	L(zero_len)
+# endif
+
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
+	movl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* If not STPCPY just save the return value ahead of time.  */
+# ifndef USE_AS_STPCPY
+	movq	%rdi, %rax
+# elif defined USE_AS_WCSCPY
+	/* Clear dependency as nearly all return code for wcpncpy uses
+	   `setc %al`.  */
+	xorl	%eax, %eax
+# endif
+
+	cmpq	$(VEC_SIZE - CHAR_SIZE), %rdx
+	/* `jb` because length rdx is now length - CHAR_SIZE.  */
+	jbe	L(less_1x_vec)
+
+	/* This may store past the copy length but that's fine because we
+	   still need to zero fill.  */
+	VMOVU	%VMM(0), (%rdi)
+
+	testl	%ecx, %ecx
+	jnz	L(zfill)
+
+	/* Align.  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	incq	%rsi
+L(last_4x_vec):
+	addq	%rsi, %rdi
+L(loop_last_4x_vec):
+	subq	%rsi, %rdx
+
+
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jae	L(more_2x_vec)
+
+	cmpl	$(VEC_SIZE), %edx
+	jb	L(ret_vec_x1_len)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VPCMPEQ	VEC_SIZE(%rsi), %VZERO, %VMM(6)
+	VMOVU	%VMM(1), (%rdi)
+	vpmovmskb %VMM(6), %ecx
+	shlq	$VEC_SIZE, %rcx
+L(ret_vec_x1_len):
+	tzcntq	%rcx, %rcx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x1_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+	VMOVU	((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	.p2align 4,, 6
+L(ret_vec_x1):
+	bsfl	%ecx, %ecx
+	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+	subl	%ecx, %edx
+	/* Check if we need to reload/store.  */
+	cmpl	$VEC_SIZE, %edx
+	jb	L(ret_vec_x1_len_no_zfill_mov)
+	/* Otherwise safe to just store directly.  */
+	VMOVU	%VMM(1), (%rdi)
+	VMOVU	%VZERO, (%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 12
+L(more_2x_vec):
+	VMOVU	%VMM(1), (%rdi)
+	testl	%ecx, %ecx
+	/* Must fill at least 2x VEC.  */
+	jnz	L(zfill_vec1)
+
+	VMOVA	VEC_SIZE(%rsi), %VMM(2)
+	VMOVU	%VMM(2), VEC_SIZE(%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	/* Must fill at least 1x VEC.  */
+	jnz	L(zfill_vec2)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* Check if len is more than 4x VEC. -CHAR_SIZE because rdx is
+	   len - CHAR_SIZE.  */
+	cmpq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+	ja	L(more_4x_vec)
+
+	subl	$(VEC_SIZE * 3), %edx
+	jb	L(ret_vec_x3_len)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	vpmovmskb %VMM(6), %ecx
+	tzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x4_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+	movl	%ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 3 + 0)(%edx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+
+L(ret_vec_x3_len):
+	addl	$(VEC_SIZE * 1), %edx
+	tzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x3_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+	.p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 2 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsfl	%ecx, %ecx
+	VMOVU	%VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
+	subl	%ecx, %edx
+	jl	L(ret_vec_x3_len_no_zfill_mov)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(more_4x_vec):
+
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec3)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec4)
+
+	movq	%rdx, %rcx
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	/* Recheck length before aligning.  */
+	cmpq	$(VEC_SIZE * 8 - CHAR_SIZE), %rcx
+	jbe	L(last_4x_vec)
+
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %r8d
+	addq	%rsi, %rdi
+	testl	%r8d, %r8d
+	jnz	L(loop_4x_done)
+
+	/* Use r9 as end register.  */
+	leaq	-(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
 
-#define USE_AS_STRNCPY
-#define STRCPY	STRNCPY
-#include "strcpy-avx2.S"
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %r8d
+
+	testl	%r8d, %r8d
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	subq	%rsi, %rdx
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec1)
+
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec2)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec3)
+
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+	movl	%r8d, %ecx
+
+	/* Fall through into L(zfill_vec4) to zero-fill the rest.  */
+
+	.p2align 4,, 4
+L(zfill_vec4):
+	addq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2), %rdx
+L(zfill_vec2):
+	shlq	$VEC_SIZE, %rcx
+L(zfill):
+	bsfq	%rcx, %rcx
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_from_page_cross):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(zfill_less_vec_vzeroupper)
+
+L(zfill_more_1x_vec):
+	VMOVU	%VZERO, CHAR_SIZE(%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jae	L(zfill_more_2x_vec)
+L(zfill_done0):
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(zfill_vec3):
+	addq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2), %rdx
+	.p2align 4,, 2
+L(zfill_vec1):
+	bsfl	%ecx, %ecx
+	addq	%rcx, %rdi
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+	/* zfill from vec1/vec3 always has to set at least 2x VECS.  */
+
+	VMOVU	%VZERO, CHAR_SIZE(%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jb	L(zfill_done0)
+L(zfill_more_2x_vec):
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
+	subq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+	jbe	L(zfill_done)
+
+	addq	%rdi, %rdx
+	VMOVU	%VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
+
+
+	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+	subq	$-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
+	cmpq	%rdi, %rdx
+	jbe	L(zfill_done)
+
+	andq	$-(VEC_SIZE), %rdi
+	.p2align 4,, 12
+L(zfill_loop_4x_vec):
+	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	ja	L(zfill_loop_4x_vec)
+L(zfill_done):
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(copy_1x):
+	VMOVU	%VMM(0), (%rdi)
+	testl	%ecx, %ecx
+	jz	L(ret_32_32)
+L(zfill_less_vec):
+	bsfl	%ecx, %ecx
+L(zfill_less_vec_no_bsf):
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_less_vec_vzeroupper):
+	COND_VZEROUPPER
+	/* We are taking advantage of the fact that to be here we must
+	   be writing the null-term at (%rdi, %rcx), so we have a byte
+	   of leeway for overwriting.  */
+	cmpl	$16, %edx
+	jb	L(zfill_less_16)
+	VMOVU	%VZERO_128, (%rdi)
+	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
+	ret
+# ifdef USE_AS_STPCPY
+L(ret_32_32):
+	leaq	CHAR_SIZE(%rdi, %rdx), %rax
+	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 4
+L(copy_16_31):
+	/* Overfill to avoid branches.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	ja	L(zfill_less_vec_no_bsf)
+# ifndef USE_AS_STPCPY
+L(ret_32_32):
+# else
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 4
+L(copy_8_15):
+	/* Overfill to avoid branches.  */
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
+	vmovq	%xmm0, (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_8_15)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+	.p2align 4,, 8
+L(zfill_less_16):
+	xorl	%ecx, %ecx
+	cmpl	$8, %edx
+	jb	L(zfill_less_8)
+	movq	%rcx, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
+# ifndef USE_AS_STPCPY
+L(ret_8_15):
+# endif
+	ret
+
+
+	.p2align 4,, 8
+L(less_1x_vec):
+	/* Reuse flags from `cmp $VEC_SIZE, %rdx`.  The idea is that
+	   many buffer sizes are conventionally aligned.  */
+	je	L(copy_1x)
+
+	tzcntl	%ecx, %ecx
+	cmpl	$16, %edx
+	jae	L(copy_16_31)
+
+	COND_VZEROUPPER
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+	testl	%ecx, %ecx
+	jz	L(zfill_less_8_set_ret)
+
+	movl	(%rsi, %rdx), %esi
+	vmovd	%xmm0, (%rdi)
+	movl	%esi, (%rdi, %rdx)
+
+#  ifdef USE_AS_STPCPY
+	cmpl	%ecx, %edx
+L(ret_8_15):
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  endif
+	ret
+L(zfill_less_8_set_ret):
+	xorl	%ecx, %ecx
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+L(zfill_less_8):
+	movl	%ecx, (%rdi)
+	movl	%ecx, (%rdi, %rdx)
+	ret
+
+# else
+	cmpl	$3, %edx
+	jb	L(copy_0_3)
+	/* Overfill to avoid branches.  */
+	movl	-3(%rsi, %rdx), %esi
+	vmovd	%xmm0, (%rdi)
+	movl	%esi, -3(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_4_7)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+	xorl	%ecx, %ecx
+	.p2align 4,, 8
+L(zfill_less_8):
+	cmpl	$3, %edx
+	jb	L(zfill_less_3)
+	movl	%ecx, (%rdi)
+	movl	%ecx, -3(%rdi, %rdx)
+#  ifdef USE_AS_STPCPY
+	ret
+#  endif
+
+L(ret_4_7):
+#  ifdef USE_AS_STPCPY
+L(ret_8_15):
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+	ret
+
+	.p2align 4,, 4
+L(zfill_less_3):
+	testl	%edx, %edx
+	jz	L(zfill_1)
+	movw	%cx, (%rdi)
+L(zfill_1):
+	movb	%cl, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_0_3):
+	vmovd	%xmm0, %r8d
+	testl	%edx, %edx
+	jz	L(copy_1)
+	movw	%r8w, (%rdi)
+	cmpl	%ecx, %edx
+	ja	L(zfill_from_1)
+	movzbl	(%rsi, %rdx), %r8d
+#  ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+	movb	%r8b, (%rdi, %rdx)
+	ret
+#  endif
+
+L(copy_1):
+#  ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	cmpl	%ecx, %edx
+	adcq	%rdi, %rax
+#  endif
+#  ifdef USE_AS_WCSCPY
+	vmovd	%xmm0, (%rdi)
+#  else
+	movb	%r8b, (%rdi, %rdx)
+#  endif
+	ret
+# endif
+
+	.p2align 4,, 2
+L(zero_len):
+	movq	%rdi, %rax
+	ret
+# ifndef USE_AS_WCSCPY
+	.p2align 4,, 8
+L(zfill_from_1):
+#  ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+#  endif
+	movw	$0, -1(%rdi, %rdx)
+	ret
+# endif
+
+	.p2align 4,, 4
+	.p2align 6,, 8
+L(page_cross):
+	movq	%rsi, %rax
+	andq	$(VEC_SIZE * -1), %rax
+
+	VPCMPEQ	(%rax), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+
+	subl	%esi, %eax
+	andl	$(VEC_SIZE - 1), %eax
+	cmpq	%rax, %rdx
+	jb	L(page_cross_small)
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* If rcx is non-zero then continue.  */
+	shl	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsf	%ecx, %ecx
+
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	leaq	-CHAR_SIZE(%rdi, %rcx), %rax
+# else
+	movq	%rdi, %rax
+# endif
+
+	rep	movsb
+# ifdef USE_AS_WCSCPY
+	movl	$0, (%rdi)
+# else
+	movb	$0, (%rdi)
+# endif
+	jmp	L(zfill_from_page_cross)
+
+L(page_cross_small):
+	tzcntl	%ecx, %ecx
+	xorl	%eax, %eax
+	cmpl	%ecx, %edx
+	jbe	L(page_cross_copy_only)
+
+	/* Do a zfill of the tail before copying.  */
+	movq	%rdi, %r9
+	movl	%ecx, %r8d
+
+	subl	%ecx, %edx
+	leaq	CHAR_SIZE(%rdi, %rcx), %rdi
+	movl	%edx, %ecx
+	rep	stosb
+	movq	%r9, %rdi
+	movl	%r8d, %edx
+L(page_cross_copy_only):
+	leal	CHAR_SIZE(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# else
+	movq	%rdi, %rax
+# endif
+	rep	movsb
+	ret
+
+
+L(best_effort_strncpy):
+	movq	%rdx, %rcx
+	xorl	%eax, %eax
+	movq	%rdi, %r8
+	/* The length is >= 2^63.  We fully expect to segfault at
+	   rep stos.  If that doesn't happen then just strcpy to
+	   finish.  */
+# ifdef USE_AS_WCSCPY
+	rep	stosl
+# else
+	rep	stosb
+# endif
+	movq	%r8, %rdi
+	jmp	OVERFLOW_STRCPY
+
+
+
+END(STRNCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
index dca1089060..01bead1435 100644
--- a/sysdeps/x86_64/multiarch/x86-avx-vecs.h
+++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
@@ -27,7 +27,10 @@
 #define VEC_SIZE			32
 #include "x86-vec-macros.h"
 
-#define USE_WITH_AVX		1
+#ifndef USE_WITH_AVX2
+# define USE_WITH_AVX		1
+#endif
+
 #define SECTION(p)			p##.avx
 
 /* 4-byte mov instructions with AVX2.  */
diff --git a/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
new file mode 100644
index 0000000000..a5966701ec
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
@@ -0,0 +1,26 @@
+/* Common config for AVX2-RTM VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX2_RTM_VECS_H
+#define _X86_AVX2_RTM_VECS_H			1
+
+#define USE_WITH_AVX2		1
+#include "x86-avx-rtm-vecs.h"
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-avx2-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
new file mode 100644
index 0000000000..16d7ae5147
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
@@ -0,0 +1,27 @@
+/* Common config for AVX2 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX2_VECS_H
+#define _X86_AVX2_VECS_H			1
+
+#define USE_WITH_AVX2		1
+
+#include "x86-avx-vecs.h"
+
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v1 4/4] x86: Add optimized functions for the wide-character strcpy family
  2022-11-03  8:53 [PATCH v1 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json Noah Goldstein
  2022-11-03  8:53 ` [PATCH v1 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
  2022-11-03  8:53 ` [PATCH v1 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
@ 2022-11-03  8:53 ` Noah Goldstein
  2022-11-03  9:06 ` [PATCH v1 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json Noah Goldstein
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-03  8:53 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Implemented:
    wcscat-avx2{+rtm}
    wcscpy-avx2{+rtm}
    wcpcpy-avx2{+rtm}
    wcsncpy-avx2{+rtm}
    wcpncpy-avx2{+rtm}
    wcsncat-avx2{+rtm}
    wcscat-evex
    wcscpy-evex
    wcpcpy-evex
    wcsncpy-evex
    wcpncpy-evex
    wcsncat-evex

Performance Changes:
    Times are from N = 10 runs of the benchmark suite and are reported
    as the geometric mean of all ratios of New Implementation / Best
    Old Implementation, where the Best Old Implementation is the
    existing implementation at the highest ISA level supported by the
    machine (a sketch of this aggregation follows the results below).

    wcscat-avx2     -> 0.975
    wcscpy-avx2     -> 0.591
    wcpcpy-avx2     -> 0.698
    wcsncpy-avx2    -> 0.730
    wcpncpy-avx2    -> 0.711
    wcsncat-avx2    -> 0.938
    wcscat-evex     -> 0.991
    wcscpy-evex     -> 0.587
    wcpcpy-evex     -> 0.695
    wcsncpy-evex    -> 0.719
    wcpncpy-evex    -> 0.694
    wcsncat-evex    -> 0.959
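
    As a rough illustration (not part of the benchmark suite; the
    helper name and arrays below are hypothetical), the reported
    values correspond to something like:

        /* Geometric mean of new/old timing ratios.  Assumes n > 0
           and all timings are positive.  */
        #include <math.h>
        #include <stddef.h>

        static double
        geomean_of_ratios (const double *new_times,
                           const double *old_times, size_t n)
        {
          double log_sum = 0.0;
          for (size_t i = 0; i < n; i++)
            log_sum += log (new_times[i] / old_times[i]);
          return exp (log_sum / (double) n);
        }

    A value below 1.0 means the new implementation is faster on
    (geometric) average.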

Code Size Changes:
    This change (compared with the last two commits without it)
    increases the size of libc.so by 19392 bytes. For reference, the
    entire patchset increases libc.so by 2624 bytes, so without the
    wide-character functions libc.so would shrink by 16768 bytes
    (19392 - 2624 = 16768).

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
Results attached to the evex/avx2 patches.
    
 sysdeps/x86_64/Makefile                     |   5 +
 sysdeps/x86_64/multiarch/Makefile           |  26 +++-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c  | 135 +++++++++++++++++++-
 sysdeps/x86_64/multiarch/ifunc-wcs.h        |  60 +++++++++
 sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S  |   3 +
 sysdeps/x86_64/multiarch/wcpcpy-avx2.S      |   8 ++
 sysdeps/x86_64/multiarch/wcpcpy-evex.S      |   8 ++
 sysdeps/x86_64/multiarch/wcpcpy-generic.c   |  27 ++++
 sysdeps/x86_64/multiarch/wcpcpy.c           |  37 ++++++
 sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S |   3 +
 sysdeps/x86_64/multiarch/wcpncpy-avx2.S     |   8 ++
 sysdeps/x86_64/multiarch/wcpncpy-evex.S     |   8 ++
 sysdeps/x86_64/multiarch/wcpncpy-generic.c  |  27 ++++
 sysdeps/x86_64/multiarch/wcpncpy.c          |  37 ++++++
 sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S  |   3 +
 sysdeps/x86_64/multiarch/wcscat-avx2.S      |  10 ++
 sysdeps/x86_64/multiarch/wcscat-evex.S      |   9 ++
 sysdeps/x86_64/multiarch/wcscat-generic.c   |  27 ++++
 sysdeps/x86_64/multiarch/wcscat.c           |  37 ++++++
 sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S  |   3 +
 sysdeps/x86_64/multiarch/wcscpy-avx2.S      |   7 +
 sysdeps/x86_64/multiarch/wcscpy-evex.S      |   7 +
 sysdeps/x86_64/multiarch/wcscpy-generic.c   |   3 +-
 sysdeps/x86_64/multiarch/wcscpy.c           |  21 +++
 sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S |   3 +
 sysdeps/x86_64/multiarch/wcsncat-avx2.S     |   9 ++
 sysdeps/x86_64/multiarch/wcsncat-evex.S     |   9 ++
 sysdeps/x86_64/multiarch/wcsncat-generic.c  |  27 ++++
 sysdeps/x86_64/multiarch/wcsncat.c          |  34 +++++
 sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S |   3 +
 sysdeps/x86_64/multiarch/wcsncpy-avx2.S     |   7 +
 sysdeps/x86_64/multiarch/wcsncpy-evex.S     |   7 +
 sysdeps/x86_64/multiarch/wcsncpy-generic.c  |  27 ++++
 sysdeps/x86_64/multiarch/wcsncpy.c          |  37 ++++++
 sysdeps/x86_64/wcpcpy-generic.c             |  31 +++++
 sysdeps/x86_64/wcpcpy.S                     |  41 ++++++
 sysdeps/x86_64/wcpncpy-generic.c            |  31 +++++
 sysdeps/x86_64/wcpncpy.S                    |  41 ++++++
 sysdeps/x86_64/wcscat-generic.c             |  31 +++++
 sysdeps/x86_64/wcscat.S                     |  41 ++++++
 sysdeps/x86_64/wcscpy.S                     |   2 +
 sysdeps/x86_64/wcsncat-generic.c            |  31 +++++
 sysdeps/x86_64/wcsncat.S                    |  39 ++++++
 sysdeps/x86_64/wcsncpy-generic.c            |  31 +++++
 sysdeps/x86_64/wcsncpy.S                    |  41 ++++++
 45 files changed, 1036 insertions(+), 6 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcs.h
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscat.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy.c
 create mode 100644 sysdeps/x86_64/wcpcpy-generic.c
 create mode 100644 sysdeps/x86_64/wcpcpy.S
 create mode 100644 sysdeps/x86_64/wcpncpy-generic.c
 create mode 100644 sysdeps/x86_64/wcpncpy.S
 create mode 100644 sysdeps/x86_64/wcscat-generic.c
 create mode 100644 sysdeps/x86_64/wcscat.S
 create mode 100644 sysdeps/x86_64/wcsncat-generic.c
 create mode 100644 sysdeps/x86_64/wcsncat.S
 create mode 100644 sysdeps/x86_64/wcsncpy-generic.c
 create mode 100644 sysdeps/x86_64/wcsncpy.S

diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 3627c5659f..688eb2d7c4 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -188,8 +188,13 @@ endif
 ifeq ($(subdir),wcsmbs)
 
 sysdep_routines += \
+  wcpcpy-generic \
+  wcpncpy-generic \
+  wcscat-generic \
   wcscpy-generic \
+  wcsncat-generic \
   wcsncmp-generic \
+  wcsncpy-generic \
   wcsnlen-generic \
 # sysdep_routines
 
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 597ac9d5e9..8b5a3a4ee5 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -130,6 +130,18 @@ endif
 
 ifeq ($(subdir),wcsmbs)
 sysdep_routines += \
+  wcpcpy-avx2 \
+  wcpcpy-avx2-rtm \
+  wcpcpy-evex \
+  wcpcpy-generic \
+  wcpncpy-avx2 \
+  wcpncpy-avx2-rtm \
+  wcpncpy-evex \
+  wcpncpy-generic \
+  wcscat-avx2 \
+  wcscat-avx2-rtm \
+  wcscat-evex \
+  wcscat-generic \
   wcschr-avx2 \
   wcschr-avx2-rtm \
   wcschr-evex \
@@ -139,6 +151,10 @@ sysdep_routines += \
   wcscmp-avx2-rtm \
   wcscmp-evex \
   wcscmp-sse2 \
+  wcscpy-avx2 \
+  wcscpy-avx2-rtm \
+  wcscpy-evex \
+  wcscpy-generic \
   wcscpy-ssse3 \
   wcslen-avx2 \
   wcslen-avx2-rtm \
@@ -146,9 +162,17 @@ sysdep_routines += \
   wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
+  wcsncat-avx2 \
+  wcsncat-avx2-rtm \
+  wcsncat-evex \
+  wcsncat-generic \
   wcsncmp-avx2 \
   wcsncmp-avx2-rtm \
   wcsncmp-evex \
+  wcsncpy-avx2 \
+  wcsncpy-avx2-rtm \
+  wcsncpy-evex \
+  wcsncpy-generic \
   wcsnlen-avx2 \
   wcsnlen-avx2-rtm \
   wcsnlen-evex \
@@ -161,8 +185,8 @@ sysdep_routines += \
   wmemchr-avx2 \
   wmemchr-avx2-rtm \
   wmemchr-evex \
-  wmemchr-evex512 \
   wmemchr-evex-rtm \
+  wmemchr-evex512 \
   wmemchr-sse2 \
   wmemcmp-avx2-movbe \
   wmemcmp-avx2-movbe-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c3d75a09f4..f31fdf4d05 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -891,16 +891,145 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/wcscpy.c.  */
   IFUNC_IMPL (i, name, wcscpy,
-	      /* ISA V4 wrapper for SSSE3 implementation because
-	         the SSSE3 implementation is also used at ISA
-	         level 3/4.  */
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcscpy_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcscpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __wcscpy_ssse3)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
 				     1,
 				     __wcscpy_generic))
 
+  /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
+  IFUNC_IMPL (i, name, wcsncpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcsncpy_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncpy,
+				     1,
+				     __wcsncpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcpcpy.c.  */
+  IFUNC_IMPL (i, name, wcpcpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpcpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpcpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcpcpy_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcpcpy,
+				     1,
+				     __wcpcpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcpncpy.c.  */
+  IFUNC_IMPL (i, name, wcpncpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpncpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcpncpy_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcpncpy,
+				     1,
+				     __wcpncpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcscat.c.  */
+  IFUNC_IMPL (i, name, wcscat,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscat_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscat_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcscat_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscat,
+				     1,
+				     __wcscat_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcsncat.c.  */
+  IFUNC_IMPL (i, name, wcsncat,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncat_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncat_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcsncat_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncat,
+				     1,
+				     __wcsncat_generic))
+
   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
   IFUNC_IMPL (i, name, wcslen,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcslen,
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
new file mode 100644
index 0000000000..cda633d8fb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
@@ -0,0 +1,60 @@
+/* Common definition for ifunc selections optimized wide-character
+   string copy functions.
+
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+#ifndef GENERIC
+# define GENERIC generic
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				      AVX_Fast_Unaligned_Load, ))
+    {
+      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				       Prefer_No_VZEROUPPER, !))
+	return OPTIMIZE (avx2);
+
+    }
+
+  return OPTIMIZE (GENERIC);
+}
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
new file mode 100644
index 0000000000..756280a3ab
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCPCPY	__wcpcpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcpcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
new file mode 100644
index 0000000000..0fffd912d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
@@ -0,0 +1,8 @@
+#ifndef WCPCPY
+# define WCPCPY	__wcpcpy_avx2
+#endif
+
+#define USE_AS_STPCPY
+#define USE_AS_WCSCPY
+#define STRCPY	WCPCPY
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-evex.S b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
new file mode 100644
index 0000000000..ac6429cc07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
@@ -0,0 +1,8 @@
+#ifndef WCPCPY
+# define WCPCPY	__wcpcpy_evex
+#endif
+
+#define USE_AS_STPCPY
+#define USE_AS_WCSCPY
+#define STRCPY	WCPCPY
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
new file mode 100644
index 0000000000..0ba29b081f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
@@ -0,0 +1,27 @@
+/* wcpcpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation so that the ifunc
+   resolver can fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (2)
+
+# define WCPCPY __wcpcpy_generic
+# include <wcsmbs/wcpcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpcpy.c b/sysdeps/x86_64/multiarch/wcpcpy.c
new file mode 100644
index 0000000000..8f96ddbc99
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcpcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcpcpy __redirect_wcpcpy
+# include <wchar.h>
+# undef __wcpcpy
+
+# define SYMBOL_NAME wcpcpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcpcpy, __wcpcpy, IFUNC_SELECTOR ());
+weak_alias (__wcpcpy, wcpcpy)
+# ifdef SHARED
+__hidden_ver1 (__wcpcpy, __GI___wcpcpy, __redirect_wcpcpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpcpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
new file mode 100644
index 0000000000..80600d6b01
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCPNCPY	__wcpncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcpncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
new file mode 100644
index 0000000000..b7e594f7b7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
@@ -0,0 +1,8 @@
+#ifndef WCPNCPY
+# define WCPNCPY	__wcpncpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STPCPY
+#define STRNCPY	WCPNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-evex.S b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
new file mode 100644
index 0000000000..62ddb694fe
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
@@ -0,0 +1,8 @@
+#ifndef WCPNCPY
+# define WCPNCPY	__wcpncpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STPCPY
+#define STRNCPY	WCPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
new file mode 100644
index 0000000000..4aab4ecdd2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
@@ -0,0 +1,27 @@
+/* wcpncpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation so that the ifunc
+   resolver can fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (2)
+
+# define WCPNCPY __wcpncpy_generic
+# include <wcsmbs/wcpncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpncpy.c b/sysdeps/x86_64/multiarch/wcpncpy.c
new file mode 100644
index 0000000000..ed8f307e07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcpncpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcpncpy __redirect_wcpncpy
+# include <wchar.h>
+# undef __wcpncpy
+
+# define SYMBOL_NAME wcpncpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcpncpy, __wcpncpy, IFUNC_SELECTOR ());
+weak_alias (__wcpncpy, wcpncpy)
+# ifdef SHARED
+__hidden_ver1 (__wcpncpy, __GI___wcpncpy, __redirect_wcpncpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpncpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
new file mode 100644
index 0000000000..e99449a2dc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSCAT	__wcscat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcscat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2.S b/sysdeps/x86_64/multiarch/wcscat-avx2.S
new file mode 100644
index 0000000000..a20f23c09d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-avx2.S
@@ -0,0 +1,10 @@
+#ifndef WCSCAT
+# define WCSCAT	__wcscat_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRCPY	WCSCAT
+
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-evex.S b/sysdeps/x86_64/multiarch/wcscat-evex.S
new file mode 100644
index 0000000000..1d017e4899
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-evex.S
@@ -0,0 +1,9 @@
+#ifndef WCSCAT
+# define WCSCAT	__wcscat_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRCPY	WCSCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
new file mode 100644
index 0000000000..6476f85bbb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
@@ -0,0 +1,27 @@
+/* wcscat.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation so that the ifunc
+   resolver can fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (2)
+
+# define WCSCAT __wcscat_generic
+# include <wcsmbs/wcscat.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscat.c b/sysdeps/x86_64/multiarch/wcscat.c
new file mode 100644
index 0000000000..3277c44561
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcscat.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcscat __redirect_wcscat
+# include <wchar.h>
+# undef __wcscat
+
+# define SYMBOL_NAME wcscat
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcscat, __wcscat, IFUNC_SELECTOR ());
+weak_alias (__wcscat, wcscat)
+# ifdef SHARED
+__hidden_ver1 (__wcscat, __GI___wcscat, __redirect_wcscat)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcscat);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
new file mode 100644
index 0000000000..2f800c8d3e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSCPY	__wcscpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcscpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2.S b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
new file mode 100644
index 0000000000..6bc509da07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
@@ -0,0 +1,7 @@
+#ifndef WCSCPY
+# define WCSCPY	__wcscpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define STRCPY	WCSCPY
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-evex.S b/sysdeps/x86_64/multiarch/wcscpy-evex.S
new file mode 100644
index 0000000000..1069a8e224
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-evex.S
@@ -0,0 +1,7 @@
+#ifndef WCSCPY
+# define WCSCPY	__wcscpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define STRCPY	WCSCPY
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
index 93d314aaad..600d606c45 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
@@ -18,8 +18,7 @@
 
 
 #include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (1)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSCPY  __wcscpy_generic
 # include <wcsmbs/wcscpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
index 92c917b6b4..7f6387817b 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.c
+++ b/sysdeps/x86_64/multiarch/wcscpy.c
@@ -26,6 +26,11 @@
 # define SYMBOL_NAME wcscpy
 # include <init-arch.h>
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
@@ -35,6 +40,22 @@ IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load, ))
+    {
+      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !))
+	return OPTIMIZE (avx2);
+    }
+
   if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
     return OPTIMIZE (ssse3);
 
diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
new file mode 100644
index 0000000000..609d6e69c0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSNCAT	__wcsncat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcsncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2.S b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
new file mode 100644
index 0000000000..a72105b7e9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
@@ -0,0 +1,9 @@
+#ifndef WCSNCAT
+# define WCSNCAT	__wcsncat_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRNCAT	WCSNCAT
+#include "strncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S
new file mode 100644
index 0000000000..392215950a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S
@@ -0,0 +1,9 @@
+#ifndef WCSNCAT
+# define WCSNCAT	__wcsncat_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRNCAT	WCSNCAT
+#include "strncat-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
new file mode 100644
index 0000000000..9ced02b35e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
@@ -0,0 +1,27 @@
+/* wcsncat.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation so that the ifunc
+   resolver can fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (2)
+
+# define WCSNCAT __wcsncat_generic
+# include <wcsmbs/wcsncat.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncat.c b/sysdeps/x86_64/multiarch/wcsncat.c
new file mode 100644
index 0000000000..49c46aef08
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat.c
@@ -0,0 +1,34 @@
+/* Multiple versions of wcsncat.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define wcsncat __redirect_wcsncat
+# include <wchar.h>
+# undef wcsncat
+
+# define SYMBOL_NAME wcsncat
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcsncat, wcsncat, IFUNC_SELECTOR ());
+# ifdef SHARED
+__hidden_ver1 (wcsncat, __GI_wcsncat, __redirect_wcsncat)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncat);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
new file mode 100644
index 0000000000..cab5a6b820
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSNCPY	__wcsncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcsncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
new file mode 100644
index 0000000000..3a1a8a372c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
@@ -0,0 +1,7 @@
+#ifndef WCSNCPY
+# define WCSNCPY	__wcsncpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define STRNCPY	WCSNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-evex.S b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
new file mode 100644
index 0000000000..2debb8fd6b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
@@ -0,0 +1,7 @@
+#ifndef WCSNCPY
+# define WCSNCPY	__wcsncpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define STRNCPY	WCSNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
new file mode 100644
index 0000000000..693521713b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
@@ -0,0 +1,27 @@
+/* wcsncpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation so that the ifunc
+   resolver can fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (2)
+
+# define WCSNCPY __wcsncpy_generic
+# include <wcsmbs/wcsncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncpy.c b/sysdeps/x86_64/multiarch/wcsncpy.c
new file mode 100644
index 0000000000..5b89dd4d27
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcsncpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcsncpy __redirect_wcsncpy
+# include <wchar.h>
+# undef __wcsncpy
+
+# define SYMBOL_NAME wcsncpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcsncpy, __wcsncpy, IFUNC_SELECTOR ());
+weak_alias (__wcsncpy, wcsncpy)
+# ifdef SHARED
+__hidden_ver1 (__wcsncpy, __GI___wcsncpy, __redirect_wcsncpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
new file mode 100644
index 0000000000..d52525f288
--- /dev/null
+++ b/sysdeps/x86_64/wcpcpy-generic.c
@@ -0,0 +1,31 @@
+/* wcpcpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpcpy non-multiarch build is split into two files,
+   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
+   for ISA level <= 3 and just uses multiarch/wcpcpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcpcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
new file mode 100644
index 0000000000..ec32dc070a
--- /dev/null
+++ b/sysdeps/x86_64/wcpcpy.S
@@ -0,0 +1,41 @@
+/* wcpcpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpcpy non-multiarch build is split into two files,
+   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
+   for ISA level <= 3 and just uses multiarch/wcpcpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCPCPY	__wcpcpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcpcpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcpcpy-avx2.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcpcpy, wcpcpy)
+libc_hidden_def (__wcpcpy)
+#endif
diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
new file mode 100644
index 0000000000..871219a445
--- /dev/null
+++ b/sysdeps/x86_64/wcpncpy-generic.c
@@ -0,0 +1,31 @@
+/* wcpncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpncpy non-multiarch build is split into two files,
+   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
+   for ISA level <= 3 and just uses multiarch/wcpncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcpncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
new file mode 100644
index 0000000000..68e6ff1836
--- /dev/null
+++ b/sysdeps/x86_64/wcpncpy.S
@@ -0,0 +1,41 @@
+/* wcpncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpncpy non-multiarch build is split into two files,
+   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
+   for ISA level <= 3 and just uses multiarch/wcpncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCPNCPY	__wcpncpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcpncpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcpncpy-avx2.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcpncpy, wcpncpy)
+libc_hidden_def (__wcpncpy)
+#endif
diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
new file mode 100644
index 0000000000..85f981a81f
--- /dev/null
+++ b/sysdeps/x86_64/wcscat-generic.c
@@ -0,0 +1,31 @@
+/* wcscat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcscat non-multiarch build is split into two files,
+   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
+   for ISA level <= 3 and just uses multiarch/wcscat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcscat.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
new file mode 100644
index 0000000000..007de3c40c
--- /dev/null
+++ b/sysdeps/x86_64/wcscat.S
@@ -0,0 +1,41 @@
+/* wcscat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcscat non-multiarch build is split into two files,
+   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
+   for ISA level <= 3 and just uses multiarch/wcscat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSCAT	__wcscat
+
+# define DEFAULT_IMPL_V4	"multiarch/wcscat-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcscat-avx2.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcscat, wcscat)
+libc_hidden_def (__wcscat)
+#endif
diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
index 11d0bb4bab..ab9288ed74 100644
--- a/sysdeps/x86_64/wcscpy.S
+++ b/sysdeps/x86_64/wcscpy.S
@@ -28,6 +28,8 @@
 
 # define WCSCPY	__wcscpy
 
+# define DEFAULT_IMPL_V4	"multiarch/wcscpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcscpy-avx2.S"
 # define DEFAULT_IMPL_V2	"multiarch/wcscpy-ssse3.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
new file mode 100644
index 0000000000..2cc0f7b11a
--- /dev/null
+++ b/sysdeps/x86_64/wcsncat-generic.c
@@ -0,0 +1,31 @@
+/* wcsncat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncat non-multiarch build is split into two files,
+   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
+   for ISA level <= 3 and just uses multiarch/wcsncat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcsncat.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
new file mode 100644
index 0000000000..3f4c7948db
--- /dev/null
+++ b/sysdeps/x86_64/wcsncat.S
@@ -0,0 +1,39 @@
+/* wcsncat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncat non-multiarch build is split into two files,
+   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
+   for ISA level <= 3 and just uses multiarch/wcsncat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSNCAT	wcsncat
+
+# define DEFAULT_IMPL_V4	"multiarch/wcsncat-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcsncat-avx2.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+#endif
diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
new file mode 100644
index 0000000000..49d06b8ae8
--- /dev/null
+++ b/sysdeps/x86_64/wcsncpy-generic.c
@@ -0,0 +1,31 @@
+/* wcsncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncpy non-multiarch build is split into two files,
+   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
+   for ISA level <= 3 and just uses multiarch/wcsncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcsncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
new file mode 100644
index 0000000000..e1428fd4c1
--- /dev/null
+++ b/sysdeps/x86_64/wcsncpy.S
@@ -0,0 +1,41 @@
+/* wcsncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncpy non-multiarch build is split into two files,
+   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
+   for ISA level <= 3 and just uses multiarch/wcsncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSNCPY	__wcsncpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcsncpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcsncpy-avx2.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcsncpy, wcsncpy)
+libc_hidden_def (__wcsncpy)
+#endif
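
As context for the new wrapper files above: isa-default-impl.h chooses
which DEFAULT_IMPL_Vn to include based on MINIMUM_X86_ISA_LEVEL.  A
simplified sketch of the idea (not the actual header) is:

    /* Simplified sketch only; the real selection logic lives in
       isa-default-impl.h.  */
    #ifndef DEFAULT_IMPL_V1
    # error "DEFAULT_IMPL_V1 must always be defined"
    #endif

    #if MINIMUM_X86_ISA_LEVEL >= 4
    # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V4
    #elif MINIMUM_X86_ISA_LEVEL == 3
    # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V3
    #elif MINIMUM_X86_ISA_LEVEL == 2
    # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V2
    #else
    # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V1
    #endif

    #include ISA_DEFAULT_IMPL

This is also why each .S wrapper defines DEFAULT_IMPL_V1 to an error
string: the header expects it to be defined, but behind the
MINIMUM_X86_ISA_LEVEL >= 4 guard it is never actually included.
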
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v1 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions
  2022-11-03  8:53 ` [PATCH v1 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
@ 2022-11-03  8:55   ` Noah Goldstein
  0 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-03  8:55 UTC (permalink / raw)
  To: libc-alpha; +Cc: hjl.tools, carlos

[-- Attachment #1: Type: text/plain, Size: 95944 bytes --]

On Thu, Nov 3, 2022 at 1:54 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>     1. Use more overlapping stores to avoid branches (see the sketch
>        after this list).
>     2. Reduce how unrolled the aligning copies are (this is more of a
>        code-size save, it's a negative for some sizes in terms of
>        perf).
>     3. For st{r|p}n{cat|cpy} re-order the branches to minimize the
>        number that are taken.
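>
>     A minimal C sketch of (1), for illustration only; the patch itself
>     implements this in assembly, and the helper below (its name and the
>     8-16 byte size range) is hypothetical, not code from this series:
>
>         #include <stdint.h>
>         #include <string.h>
>
>         /* Copy 8 <= len <= 16 bytes with two possibly-overlapping
>            8-byte stores instead of branching on the exact length.
>            Together the two stores cover [dst, dst + len).  */
>         static void
>         copy_8_to_16 (char *dst, const char *src, size_t len)
>         {
>           uint64_t head, tail;
>           memcpy (&head, src, 8);
>           memcpy (&tail, src + len - 8, 8);  /* May overlap head.  */
>           memcpy (dst, &head, 8);
>           memcpy (dst + len - 8, &tail, 8);
>         }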
>
> Performance Changes:
>
>     Times are from N = 10 runs of the benchmark suite and are
>     reported as geometric mean of all ratios of
>     New Implementation / Old Implementation.
>
>     strcat-avx2      -> 0.998
>     strcpy-avx2      -> 0.937
>     stpcpy-avx2      -> 0.971
>
>     strncpy-avx2     -> 0.793
>     stpncpy-avx2     -> 0.775
>
>     strncat-avx2     -> 0.993
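>
>     Each number above is the geometric mean of the new/old ratios
>     across runs; a minimal sketch of that computation (the ratio
>     values below are placeholders, not measurements from this series):
>
>         #include <math.h>
>         #include <stdio.h>
>
>         int
>         main (void)
>         {
>           /* new_time / old_time ratios collected across runs.  */
>           double ratios[] = { 0.95, 0.90, 1.02, 0.88 };
>           size_t n = sizeof (ratios) / sizeof (ratios[0]);
>           double log_sum = 0.0;
>           for (size_t i = 0; i < n; i++)
>             log_sum += log (ratios[i]);
>           /* Geometric mean == exp (mean of logs).  */
>           printf ("geomean = %.3f\n", exp (log_sum / n));
>           return 0;
>         }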
>
> Code Size Changes:
>     function         -> Bytes New / Bytes Old -> Ratio
>
>     strcat-avx2      -> 685  / 1639 -> 0.418
>     strcpy-avx2      -> 560  / 903  -> 0.620
>     stpcpy-avx2      -> 592  / 939  -> 0.630
>
>     strncpy-avx2     -> 1176 / 2390 -> 0.492
>     stpncpy-avx2     -> 1268 / 2438 -> 0.520
>
>     strncat-avx2     -> 981  / 2563 -> 0.383
>
> Notes:
>     Because of the significant difference between the
>     implementations they are split into three files.
>
>     strcpy-avx2.S    -> strcpy, stpcpy, strcat
>     strncpy-avx2.S   -> strncpy
>     strncat-avx2.S   -> strncat
>
>     I couldn't find a way to merge them without making the ifdefs
>     incredibly difficult to follow.
>
> Full check passes on x86-64 and build succeeds for all ISA levels w/
> and w/o multiarch.
> ---
> Results attached.
>
>  sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S    |    6 +-
>  sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S   |    7 +-
>  sysdeps/x86_64/multiarch/stpncpy-avx2.S       |    5 +-
>  sysdeps/x86_64/multiarch/strcat-avx2-rtm.S    |   13 +-
>  sysdeps/x86_64/multiarch/strcat-avx2.S        |  268 +---
>  sysdeps/x86_64/multiarch/strcat-strlen-avx2.S |   76 +
>  sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S    |   13 +-
>  sysdeps/x86_64/multiarch/strcpy-avx2.S        | 1236 +++++------------
>  sysdeps/x86_64/multiarch/strncat-avx2-rtm.S   |    6 +-
>  sysdeps/x86_64/multiarch/strncat-avx2.S       |  477 ++++++-
>  sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S   |    6 +-
>  sysdeps/x86_64/multiarch/strncpy-avx2.S       |  743 +++++++++-
>  sysdeps/x86_64/multiarch/x86-avx-vecs.h       |    5 +-
>  sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h  |   26 +
>  sysdeps/x86_64/multiarch/x86-avx2-vecs.h      |   27 +
>  15 files changed, 1680 insertions(+), 1234 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-vecs.h
>
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> index 2b9c07a59f..189a288053 100644
> --- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> @@ -1,3 +1,3 @@
> -#define USE_AS_STPCPY
> -#define STRCPY __stpcpy_avx2_rtm
> -#include "strcpy-avx2-rtm.S"
> +#define STPCPY __stpcpy_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "stpcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> index 60a2ccfe53..1b252985e7 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> @@ -1,4 +1,3 @@
> -#define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY __stpncpy_avx2_rtm
> -#include "strcpy-avx2-rtm.S"
> +#define STPNCPY        __stpncpy_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "stpncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> index b2f8c19143..a46a8edbe2 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> @@ -3,6 +3,5 @@
>  #endif
>
>  #define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY STPNCPY
> -#include "strcpy-avx2.S"
> +#define STRNCPY        STPNCPY
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> index 637fb557c4..94d51d10bd 100644
> --- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> @@ -1,12 +1,3 @@
> -#ifndef STRCAT
> -# define STRCAT __strcat_avx2_rtm
> -#endif
> -
> -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> -  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> -
> -#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
> -
> -#define SECTION(p) p##.avx.rtm
> -
> +#define STRCAT __strcat_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
>  #include "strcat-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
> index d9b7fb2a43..3f914fa342 100644
> --- a/sysdeps/x86_64/multiarch/strcat-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
> @@ -16,266 +16,10 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (3)
> -
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_avx2
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -/* Number of bytes in a vector register */
> -# define VEC_SIZE      32
> -
> -# ifndef SECTION
> -#  define SECTION(p)   p##.avx
> -# endif
> -
> -       .section SECTION(.text),"ax",@progbits
> -ENTRY (STRCAT)
> -       mov     %rdi, %r9
> -# ifdef USE_AS_STRNCAT
> -       mov     %rdx, %r8
> -# endif
> -
> -       xor     %eax, %eax
> -       mov     %edi, %ecx
> -       and     $((VEC_SIZE * 4) - 1), %ecx
> -       vpxor   %xmm6, %xmm6, %xmm6
> -       cmp     $(VEC_SIZE * 3), %ecx
> -       ja      L(fourth_vector_boundary)
> -       vpcmpeqb (%rdi), %ymm6, %ymm0
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_first_vector)
> -       mov     %rdi, %rax
> -       and     $-VEC_SIZE, %rax
> -       jmp     L(align_vec_size_start)
> -L(fourth_vector_boundary):
> -       mov     %rdi, %rax
> -       and     $-VEC_SIZE, %rax
> -       vpcmpeqb        (%rax), %ymm6, %ymm0
> -       mov     $-1, %r10d
> -       sub     %rax, %rcx
> -       shl     %cl, %r10d
> -       vpmovmskb %ymm0, %edx
> -       and     %r10d, %edx
> -       jnz     L(exit)
> -
> -L(align_vec_size_start):
> -       vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -       vpmovmskb %ymm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -       vpmovmskb %ymm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -       vpmovmskb %ymm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> -       add     $(VEC_SIZE * 4), %rax
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -       vpmovmskb %ymm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -       vpmovmskb %ymm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -       vpmovmskb %ymm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> -       add     $(VEC_SIZE * 4), %rax
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -       vpmovmskb %ymm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -       vpmovmskb %ymm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -       vpmovmskb %ymm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> -       add     $(VEC_SIZE * 4), %rax
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -       vpmovmskb %ymm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -       vpmovmskb %ymm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -       vpmovmskb %ymm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> -       add     $(VEC_SIZE * 5), %rax
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
> -       add     $VEC_SIZE, %rax
> -       vpmovmskb %ymm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
> -       add     $VEC_SIZE, %rax
> -       vpmovmskb %ymm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
> -       add     $VEC_SIZE, %rax
> -       vpmovmskb %ymm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       add     $VEC_SIZE, %rax
> -
> -       .p2align 4
> -L(align_four_vec_loop):
> -       vmovaps (%rax), %ymm4
> -       vpminub VEC_SIZE(%rax), %ymm4, %ymm4
> -       vmovaps (VEC_SIZE * 2)(%rax),   %ymm5
> -       vpminub (VEC_SIZE * 3)(%rax),   %ymm5, %ymm5
> -       add     $(VEC_SIZE * 4),        %rax
> -       vpminub %ymm4,  %ymm5, %ymm5
> -       vpcmpeqb %ymm5, %ymm6, %ymm5
> -       vpmovmskb %ymm5,        %edx
> -       test    %edx,   %edx
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
> -       sub     $(VEC_SIZE * 5),        %rax
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -       vpmovmskb %ymm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -       vpmovmskb %ymm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -       vpmovmskb %ymm3, %edx
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 4), %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit):
> -       sub     %rdi, %rax
> -L(exit_null_on_first_vector):
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_second_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $VEC_SIZE, %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_third_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 2), %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_fourth_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 3), %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_fifth_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 4), %rax
> -
> -       .p2align 4
> -L(StartStrcpyPart):
> -       lea     (%r9, %rax), %rdi
> -       mov     %rsi, %rcx
> -       mov     %r9, %rax      /* save result */
> -
> -# ifdef USE_AS_STRNCAT
> -       test    %r8, %r8
> -       jz      L(ExitZero)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-avx2.S"
> +#ifndef STRCAT
> +# define STRCAT        __strcat_avx2
>  #endif
> +
> +#define USE_AS_STRCAT
> +#define STRCPY STRCAT
> +#include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
> new file mode 100644
> index 0000000000..e0fc286826
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
> @@ -0,0 +1,76 @@
> +    /* Simple strlen implementation that ends at L(strcat_strlen_done).  */
> +       movq    %rdi, %r8
> +       andq    $(VEC_SIZE * -1), %r8
> +       VPCMPEQ (%r8), %VZERO, %VMM(0)
> +       vpmovmskb %VMM(0), %ecx
> +       shrxl   %edi, %ecx, %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v0)
> +
> +       VPCMPEQ VEC_SIZE(%r8), %VZERO, %VMM(0)
> +       vpmovmskb %VMM(0), %ecx
> +       leaq    (VEC_SIZE)(%r8), %rdi
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v0)
> +
> +       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0)
> +       vpmovmskb %VMM(0), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v1)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0)
> +       vpmovmskb %VMM(0), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v2)
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0)
> +       vpmovmskb %VMM(0), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v3)
> +
> +       orq     $(VEC_SIZE * 4 - 1), %rdi
> +       .p2align 4,, 8
> +L(strlen_loop_4x_vec):
> +       VMOVA   (VEC_SIZE * 0 + 1)(%rdi), %VMM(0)
> +       VPMIN   (VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 1)(%rdi), %VMM(2)
> +       VPMIN   (VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3)
> +       VPMIN   %VMM(1), %VMM(3), %VMM(3)
> +       VPCMPEQ %VMM(3), %VZERO, %VMM(3)
> +       vpmovmskb %VMM(3), %r8d
> +       subq    $(VEC_SIZE * -4), %rdi
> +       testl   %r8d, %r8d
> +       jz      L(strlen_loop_4x_vec)
> +
> +       addq    $(VEC_SIZE * -4 + 1), %rdi
> +
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(0)
> +       vpmovmskb %VMM(0), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v0)
> +
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(1)
> +       vpmovmskb %VMM(1), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v1)
> +
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(2)
> +       vpmovmskb %VMM(2), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v2)
> +
> +       movl    %r8d, %ecx
> +L(bsf_and_done_v3):
> +       addq    $VEC_SIZE, %rdi
> +L(bsf_and_done_v2):
> +       bsfl    %ecx, %ecx
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx), %rdi
> +       jmp     L(strcat_strlen_done)
> +
> +       .p2align 4,, 4
> +L(bsf_and_done_v1):
> +       addq    $VEC_SIZE, %rdi
> +L(bsf_and_done_v0):
> +       bsfl    %ecx, %ecx
> +       addq    %rcx, %rdi
> +L(strcat_strlen_done):
> diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> index c2c581ecf7..fe80ffd265 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> @@ -1,12 +1,3 @@
> -#ifndef STRCPY
> -# define STRCPY __strcpy_avx2_rtm
> -#endif
> -
> -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> -  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> -
> -#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
> -
> -#define SECTION(p) p##.avx.rtm
> -
> +#define STRCPY __strcpy_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
>  #include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
> index c725834929..b87a1722d5 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
> @@ -20,984 +20,378 @@
>
>  #if ISA_SHOULD_BUILD (3)
>
> +# include <sysdep.h>
>
> -# ifndef USE_AS_STRCAT
> -#  include <sysdep.h>
> -
> -#  ifndef STRCPY
> -#   define STRCPY  __strcpy_avx2
> -#  endif
> -
> -# endif
> -
> -/* Number of bytes in a vector register */
>  # ifndef VEC_SIZE
> -#  define VEC_SIZE     32
> -# endif
> -
> -# ifndef VZEROUPPER
> -#  define VZEROUPPER   vzeroupper
> -# endif
> -
> -# ifndef SECTION
> -#  define SECTION(p)   p##.avx
> -# endif
> -
> -/* zero register */
> -#define xmmZ   xmm0
> -#define ymmZ   ymm0
> -
> -/* mask register */
> -#define ymmM   ymm1
> -
> -# ifndef USE_AS_STRCAT
> -
> -       .section SECTION(.text),"ax",@progbits
> -ENTRY (STRCPY)
> -#  ifdef USE_AS_STRNCPY
> -       mov     %RDX_LP, %R8_LP
> -       test    %R8_LP, %R8_LP
> -       jz      L(ExitZero)
> -#  endif
> -       mov     %rsi, %rcx
> -#  ifndef USE_AS_STPCPY
> -       mov     %rdi, %rax      /* save result */
> -#  endif
> -
> +#  include "x86-avx2-vecs.h"
>  # endif
>
> -       vpxor   %xmmZ, %xmmZ, %xmmZ
> -
> -       and     $((VEC_SIZE * 4) - 1), %ecx
> -       cmp     $(VEC_SIZE * 2), %ecx
> -       jbe     L(SourceStringAlignmentLessTwoVecSize)
> -
> -       and     $-VEC_SIZE, %rsi
> -       and     $(VEC_SIZE - 1), %ecx
> -
> -       vpcmpeqb (%rsi), %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       shr     %cl, %rdx
> -
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -       mov     $VEC_SIZE, %r10
> -       sub     %rcx, %r10
> -       cmp     %r10, %r8
> -#  else
> -       mov     $(VEC_SIZE + 1), %r10
> -       sub     %rcx, %r10
> -       cmp     %r10, %r8
> -#  endif
> -       jbe     L(CopyVecSizeTailCase2OrCase3)
> +# ifndef STRCPY
> +#  define STRCPY       __strcpy_avx2
>  # endif
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeTail)
>
> -       vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
> -       vpmovmskb %ymm2, %edx
> +       /* Use movsb in page cross case to save code size.  */
> +# define USE_MOVSB_IN_PAGE_CROSS       1
>
> -# ifdef USE_AS_STRNCPY
> -       add     $VEC_SIZE, %r10
> -       cmp     %r10, %r8
> -       jbe     L(CopyTwoVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyTwoVecSize)
> -
> -       vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
> -       vmovdqu %ymm2, (%rdi)
> -
> -/* If source address alignment != destination address alignment */
> -       .p2align 4
> -L(UnalignVecSizeBoth):
> -       sub     %rcx, %rdi
> -# ifdef USE_AS_STRNCPY
> -       add     %rcx, %r8
> -       sbb     %rcx, %rcx
> -       or      %rcx, %r8
> -# endif
> -       mov     $VEC_SIZE, %rcx
> -       vmovdqa (%rsi, %rcx), %ymm2
> -       vmovdqu %ymm2, (%rdi, %rcx)
> -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> -       vpcmpeqb %ymm2, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $(VEC_SIZE * 3), %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec2)
> +# ifdef USE_AS_WCSCPY
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPMIN        vpminud
> +#  define CHAR_SIZE    4
>  # else
> -       jnz     L(CopyVecSize)
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPMIN        vpminub
> +#  define CHAR_SIZE    1
>  # endif
>
> -       vmovdqu %ymm2, (%rdi, %rcx)
> -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
> -       vpcmpeqb %ymm3, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec3)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
> +# define PAGE_SIZE     4096
>
> -       vmovdqu %ymm3, (%rdi, %rcx)
> -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
> -       vpcmpeqb %ymm4, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec4)
> +# ifdef USE_AS_STPCPY
> +#  define END_REG      rax
>  # else
> -       jnz     L(CopyVecSize)
> +#  define END_REG      rdi, %rdx
>  # endif
>
> -       vmovdqu %ymm4, (%rdi, %rcx)
> -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> -       vpcmpeqb %ymm2, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec2)
> +# ifdef USE_AS_STRCAT
> +#  define PAGE_ALIGN_REG       ecx
>  # else
> -       jnz     L(CopyVecSize)
> +#  define PAGE_ALIGN_REG       eax
>  # endif
>
> -       vmovdqu %ymm2, (%rdi, %rcx)
> -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> -       vpcmpeqb %ymm2, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec2)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
> +# define VZERO VMM(7)
> +# define VZERO_128     VMM_128(7)
>
> -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
> -       vmovdqu %ymm2, (%rdi, %rcx)
> -       vpcmpeqb %ymm3, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec3)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY(STRCPY)
> +       vpxor   %VZERO_128, %VZERO_128, %VZERO_128
>
> -       vmovdqu %ymm3, (%rdi, %rcx)
> -       mov     %rsi, %rdx
> -       lea     VEC_SIZE(%rsi, %rcx), %rsi
> -       and     $-(VEC_SIZE * 4), %rsi
> -       sub     %rsi, %rdx
> -       sub     %rdx, %rdi
> -# ifdef USE_AS_STRNCPY
> -       lea     (VEC_SIZE * 8)(%r8, %rdx), %r8
> -# endif
> -L(UnalignedFourVecSizeLoop):
> -       vmovdqa (%rsi), %ymm4
> -       vmovdqa VEC_SIZE(%rsi), %ymm5
> -       vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
> -       vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
> -       vpminub %ymm5, %ymm4, %ymm2
> -       vpminub %ymm7, %ymm6, %ymm3
> -       vpminub %ymm2, %ymm3, %ymm3
> -       vpcmpeqb %ymmM, %ymm3, %ymm3
> -       vpmovmskb %ymm3, %edx
> -# ifdef USE_AS_STRNCPY
> -       sub     $(VEC_SIZE * 4), %r8
> -       jbe     L(UnalignedLeaveCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(UnalignedFourVecSizeLeave)
> -
> -L(UnalignedFourVecSizeLoop_start):
> -       add     $(VEC_SIZE * 4), %rdi
> -       add     $(VEC_SIZE * 4), %rsi
> -       vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
> -       vmovdqa (%rsi), %ymm4
> -       vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
> -       vmovdqa VEC_SIZE(%rsi), %ymm5
> -       vpminub %ymm5, %ymm4, %ymm2
> -       vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
> -       vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
> -       vmovdqu %ymm7, -VEC_SIZE(%rdi)
> -       vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
> -       vpminub %ymm7, %ymm6, %ymm3
> -       vpminub %ymm2, %ymm3, %ymm3
> -       vpcmpeqb %ymmM, %ymm3, %ymm3
> -       vpmovmskb %ymm3, %edx
> -# ifdef USE_AS_STRNCPY
> -       sub     $(VEC_SIZE * 4), %r8
> -       jbe     L(UnalignedLeaveCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jz      L(UnalignedFourVecSizeLoop_start)
> -
> -L(UnalignedFourVecSizeLeave):
> -       vpcmpeqb %ymm4, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeUnaligned_0)
> -
> -       vpcmpeqb %ymm5, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %ecx
> -       test    %ecx, %ecx
> -       jnz     L(CopyVecSizeUnaligned_16)
> -
> -       vpcmpeqb %ymm6, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeUnaligned_32)
> -
> -       vpcmpeqb %ymm7, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %ecx
> -       bsf     %ecx, %edx
> -       vmovdqu %ymm4, (%rdi)
> -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> -       vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     (VEC_SIZE * 3)(%rdi, %rdx), %rax
> -# endif
> -       vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
> -       add     $(VEC_SIZE - 1), %r8
> -       sub     %rdx, %r8
> -       lea     ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> -# else
> -       add     $(VEC_SIZE * 3), %rsi
> -       add     $(VEC_SIZE * 3), %rdi
> -       jmp     L(CopyVecSizeExit)
> +# ifdef USE_AS_STRCAT
> +       movq    %rdi, %rax
> +#  include "strcat-strlen-avx2.S"
>  # endif
>
> -/* If source address alignment == destination address alignment */
> -
> -L(SourceStringAlignmentLessTwoVecSize):
> -       vmovdqu (%rsi), %ymm3
> -       vmovdqu VEC_SIZE(%rsi), %ymm2
> -       vpcmpeqb %ymm3, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -       cmp     $VEC_SIZE, %r8
> -#  else
> -       cmp     $(VEC_SIZE + 1), %r8
> -#  endif
> -       jbe     L(CopyVecSizeTail1Case2OrCase3)
> +       movl    %esi, %PAGE_ALIGN_REG
> +       andl    $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> +       ja      L(page_cross)
> +L(page_cross_continue):
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +       movq    %rdi, %rax
>  # endif
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeTail1)
> -
> -       vmovdqu %ymm3, (%rdi)
> -       vpcmpeqb %ymm2, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -       cmp     $(VEC_SIZE * 2), %r8
> -#  else
> -       cmp     $((VEC_SIZE * 2) + 1), %r8
> -#  endif
> -       jbe     L(CopyTwoVecSize1Case2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyTwoVecSize1)
> -
> -       and     $-VEC_SIZE, %rsi
> -       and     $(VEC_SIZE - 1), %ecx
> -       jmp     L(UnalignVecSizeBoth)
> +       VMOVU   (%rsi), %VMM(0)
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
>
> -/*------End of main part with loops---------------------*/
> +       testl   %ecx, %ecx
> +       jz      L(more_1x_vec)
>
> -/* Case1 */
> +       /* No longer need ymm registers so just vzeroupper so it doesn't
> +          need to be duplicated at each return statement.  */
> +       COND_VZEROUPPER
>
> -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> -       .p2align 4
> -L(CopyVecSize):
> -       add     %rcx, %rdi
> -# endif
> -L(CopyVecSizeTail):
> -       add     %rcx, %rsi
> -L(CopyVecSizeTail1):
> -       bsf     %edx, %edx
> -L(CopyVecSizeExit):
> -       cmp     $32, %edx
> -       jae     L(Exit32_63)
> -       cmp     $16, %edx
> -       jae     L(Exit16_31)
> -       cmp     $8, %edx
> -       jae     L(Exit8_15)
> -       cmp     $4, %edx
> -       jae     L(Exit4_7)
> -       cmp     $3, %edx
> -       je      L(Exit3)
> -       cmp     $1, %edx
> -       ja      L(Exit2)
> -       je      L(Exit1)
> -       movb    $0, (%rdi)
> +       xorl    %edx, %edx
> +       bsfl    %ecx, %edx
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $1, %r8
> -       lea     1(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> -# endif
> -L(return_vzeroupper):
> -       ZERO_UPPER_VEC_REGISTERS_RETURN
> -
> -       .p2align 4
> -L(CopyTwoVecSize1):
> -       add     $VEC_SIZE, %rsi
> -       add     $VEC_SIZE, %rdi
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $VEC_SIZE, %r8
> -# endif
> -       jmp     L(CopyVecSizeTail1)
> -
> -       .p2align 4
> -L(CopyTwoVecSize):
> -       bsf     %edx, %edx
> -       add     %rcx, %rsi
> -       add     $VEC_SIZE, %edx
> -       sub     %ecx, %edx
> -       jmp     L(CopyVecSizeExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnaligned_0):
> -       bsf     %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -       vmovdqu %ymm4, (%rdi)
> -       add     $((VEC_SIZE * 4) - 1), %r8
> -       sub     %rdx, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> -# else
> -       jmp     L(CopyVecSizeExit)
> -# endif
> -
> -       .p2align 4
> -L(CopyVecSizeUnaligned_16):
> -       bsf     %ecx, %edx
> -       vmovdqu %ymm4, (%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     VEC_SIZE(%rdi, %rdx), %rax
> -# endif
> -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> -       add     $((VEC_SIZE * 3) - 1), %r8
> -       sub     %rdx, %r8
> -       lea     (VEC_SIZE + 1)(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> +       leaq    (%rdi, %rdx), %rax
> +# endif
> +
> +       /* Use mask bits in rcx to detect which copy we need. If the low
> +          mask is zero then there must be a bit set in the upper half.
> +          I.e. if ecx != 0 and cx == 0, then the match must be in the
> +          upper 16 bits, so we use L(copy_16_31).  */
> +       testw   %cx, %cx
> +       jz      L(copy_16_31)
> +
> +       testb   %cl, %cl
> +       jz      L(copy_8_15)
> +# ifdef USE_AS_WCSCPY
> +       vmovd   %xmm0, (%rdi)
> +       movl    $0, (%END_REG)
> +       ret
>  # else
> -       add     $VEC_SIZE, %rsi
> -       add     $VEC_SIZE, %rdi
> -       jmp     L(CopyVecSizeExit)
> -# endif
> -
> -       .p2align 4
> -L(CopyVecSizeUnaligned_32):
> -       bsf     %edx, %edx
> -       vmovdqu %ymm4, (%rdi)
> -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     (VEC_SIZE * 2)(%rdi, %rdx), %rax
> -# endif
> -       vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> -       add     $((VEC_SIZE * 2) - 1), %r8
> -       sub     %rdx, %r8
> -       lea     ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> +       testb   $0x7, %cl
> +       jz      L(copy_4_7)
> +
> +       testl   %edx, %edx
> +       jz      L(set_null_term)
> +       vmovd   %xmm0, %ecx
> +       movw    %cx, (%rdi)
> +
> +       .p2align 4,, 2
> +L(set_null_term):
> +       movb    $0, (%END_REG)
> +       ret
> +
> +       .p2align 4,, 12
> +L(copy_4_7):
> +       movl    -3(%rsi, %rdx), %ecx
> +       vmovd   %xmm0, (%rdi)
> +       movl    %ecx, -3(%END_REG)
> +       ret
> +# endif
> +
> +       .p2align 4,, 10
> +L(copy_16_31):
> +       VMOVU   -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
> +       VMOVU   %xmm0, (%rdi)
> +       VMOVU   %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> +       ret
> +
> +       .p2align 4,, 10
> +L(copy_8_15):
> +# ifdef USE_AS_WCSCPY
> +       movl    -(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
>  # else
> -       add     $(VEC_SIZE * 2), %rsi
> -       add     $(VEC_SIZE * 2), %rdi
> -       jmp     L(CopyVecSizeExit)
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> -#  ifndef USE_AS_STRCAT
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec6):
> -       vmovdqu %ymm6, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec5):
> -       vmovdqu %ymm5, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec4):
> -       vmovdqu %ymm4, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec3):
> -       vmovdqu %ymm3, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -#  endif
> -
> -/* Case2 */
> -
> -       .p2align 4
> -L(CopyVecSizeCase2):
> -       add     $VEC_SIZE, %r8
> -       add     %rcx, %rdi
> -       add     %rcx, %rsi
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyTwoVecSizeCase2):
> -       add     %rcx, %rsi
> -       bsf     %edx, %edx
> -       add     $VEC_SIZE, %edx
> -       sub     %ecx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -L(CopyVecSizeTailCase2):
> -       add     %rcx, %rsi
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -L(CopyVecSizeTail1Case2):
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -/* Case2 or Case3,  Case3 */
> -
> -       .p2align 4
> -L(CopyVecSizeCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyVecSizeCase2)
> -L(CopyVecSizeCase3):
> -       add     $VEC_SIZE, %r8
> -       add     %rcx, %rdi
> -       add     %rcx, %rsi
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyTwoVecSizeCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyTwoVecSizeCase2)
> -       add     %rcx, %rsi
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyVecSizeTailCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyVecSizeTailCase2)
> -       add     %rcx, %rsi
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyTwoVecSize1Case2OrCase3):
> -       add     $VEC_SIZE, %rdi
> -       add     $VEC_SIZE, %rsi
> -       sub     $VEC_SIZE, %r8
> -L(CopyVecSizeTail1Case2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyVecSizeTail1Case2)
> -       jmp     L(StrncpyExit)
> -# endif
> -
> -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
> -
> -       .p2align 4
> -L(Exit1):
> -       movzwl  (%rsi), %edx
> -       mov     %dx, (%rdi)
> -# ifdef USE_AS_STPCPY
> -       lea     1(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $2, %r8
> -       lea     2(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> -# endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Exit2):
> -       movzwl  (%rsi), %ecx
> -       mov     %cx, (%rdi)
> -       movb    $0, 2(%rdi)
> -# ifdef USE_AS_STPCPY
> -       lea     2(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $3, %r8
> -       lea     3(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> -# endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Exit3):
> -       mov     (%rsi), %edx
> -       mov     %edx, (%rdi)
> +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
> +# endif
> +       vmovq   %xmm0, (%rdi)
> +       movq    %rcx, -(8 - CHAR_SIZE)(%END_REG)
> +       ret
> +
> +
> +       .p2align 4,, 8
> +L(more_1x_vec):
> +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> +       VMOVU   %VMM(0), (%rdi)
> +# endif
> +       subq    %rsi, %rdi
> +       orq     $(VEC_SIZE - 1), %rsi
> +       addq    %rsi, %rdi
> +       VMOVA   1(%rsi), %VMM(1)
> +
> +       /* Try and order stores after as many loads as is reasonable to
> +          avoid potential false dependencies.  */
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +       VMOVU   %VMM(0), (%rax)
> +# endif
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x1)
> +
> +       VMOVA   (VEC_SIZE + 1)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), 1(%rdi)
> +
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x2)
> +
> +       VMOVA   (VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
> +       VMOVU   %VMM(2), (VEC_SIZE + 1)(%rdi)
> +
> +       VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x3)
> +
> +       VMOVA   (VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
> +       VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %edx
> +       testl   %edx, %edx
> +       jnz     L(ret_vec_x4)
> +
> +       VMOVU   %VMM(4), (VEC_SIZE * 3 + 1)(%rdi)
> +
> +       /* Subtract rsi from rdi before aligning. Adding back rsi will
> +          get proper rdi (dst) for new src.  */
> +       subq    %rsi, %rdi
> +       incq    %rsi
> +       orq     $(VEC_SIZE * 4 - 1), %rsi
> +
> +       /* Do first half of loop ahead of time so loop can just start by
> +          storing.  */
> +       VMOVA   (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %edx
> +       addq    %rsi, %rdi
> +
> +       testl   %edx, %edx
> +       jnz     L(loop_4x_done)
> +
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
> +       subq    $(VEC_SIZE * -4), %rsi
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 1)(%rdi)
> +
> +
> +       VMOVA   (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +
> +       vpmovmskb %VMM(6), %edx
> +       subq    $(VEC_SIZE * -4), %rdi
> +       testl   %edx, %edx
> +       jz      L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x1)
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
> +
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x2)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
> +
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x3)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
> +L(ret_vec_x4):
> +       bsfl    %edx, %edx
> +       VMOVU   ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
>  # ifdef USE_AS_STPCPY
> -       lea     3(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $4, %r8
> -       lea     4(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    (VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
>  # endif
> +L(return_end):
>         VZEROUPPER_RETURN
>
> -       .p2align 4
> -L(Exit4_7):
> -       mov     (%rsi), %ecx
> -       mov     %ecx, (%rdi)
> -       mov     -3(%rsi, %rdx), %ecx
> -       mov     %ecx, -3(%rdi, %rdx)
> +       .p2align 4,, 8
> +L(ret_vec_x1):
> +       bsfl    %ecx, %ecx
> +       VMOVU   (1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> +       VMOVU   %VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     %rdx, %r8
> -       sub     $1, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    1(%rcx, %rdi), %rax
>  # endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Exit8_15):
> -       mov     (%rsi), %rcx
> -       mov     -7(%rsi, %rdx), %r9
> -       mov     %rcx, (%rdi)
> -       mov     %r9, -7(%rdi, %rdx)
> -# ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     %rdx, %r8
> -       sub     $1, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> -# endif
> -       VZEROUPPER_RETURN
> +L(return_vzeroupper):
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
>
> -       .p2align 4
> -L(Exit16_31):
> -       vmovdqu (%rsi), %xmm2
> -       vmovdqu -15(%rsi, %rdx), %xmm3
> -       vmovdqu %xmm2, (%rdi)
> -       vmovdqu %xmm3, -15(%rdi, %rdx)
> +       .p2align 4,, 8
> +L(ret_vec_x2):
> +       bsfl    %ecx, %ecx
> +       VMOVU   ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub %rdx, %r8
> -       sub $1, %r8
> -       lea 1(%rdi, %rdx), %rdi
> -       jnz L(StrncpyFillTailWithZero)
> +       leaq    (VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
>  # endif
>         VZEROUPPER_RETURN
>
> -       .p2align 4
> -L(Exit32_63):
> -       vmovdqu (%rsi), %ymm2
> -       vmovdqu -31(%rsi, %rdx), %ymm3
> -       vmovdqu %ymm2, (%rdi)
> -       vmovdqu %ymm3, -31(%rdi, %rdx)
> +       .p2align 4,, 8
> +L(ret_vec_x3):
> +       bsfl    %ecx, %ecx
> +       VMOVU   ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     %rdx, %r8
> -       sub     $1, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    (VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
>  # endif
>         VZEROUPPER_RETURN
>
> -# ifdef USE_AS_STRNCPY
>
> -       .p2align 4
> -L(StrncpyExit1):
> -       movzbl  (%rsi), %edx
> -       mov     %dl, (%rdi)
> +       .p2align 4,, 4
> +L(page_cross):
> +       movq    %rsi, %rcx
> +       andq    $(VEC_SIZE * -1), %rcx
> +
> +       VPCMPEQ (%rcx), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       shrxl   %esi, %ecx, %ecx
> +# if USE_MOVSB_IN_PAGE_CROSS
> +       /* Optimizing more aggressively for space as this is very cold
> +          code. This saves 2x cache lines.  */
> +
> +       /* This adds CHAR_SIZE to the later bsf result, which gives the
> +          correct copy bounds (the null terminator is included).  NB:
> +          this can never zero out a non-zero RCX, because to be in the
> +          page cross case rsi cannot be aligned and we have already
> +          right-shifted rcx by the misalignment.  */
> +       shll    $CHAR_SIZE, %ecx
> +       jz      L(page_cross_continue)
> +       bsfl    %ecx, %ecx
> +#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +       movq    %rdi, %rax
> +#  endif
> +       rep     movsb
>  #  ifdef USE_AS_STPCPY
> -       lea     1(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, 1(%rdi)
> +       leaq    -CHAR_SIZE(%rdi), %rax
>  #  endif
> -       VZEROUPPER_RETURN
>
> -       .p2align 4
> -L(StrncpyExit2):
> -       movzwl  (%rsi), %edx
> -       mov     %dx, (%rdi)
> -#  ifdef USE_AS_STPCPY
> -       lea     2(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, 2(%rdi)
> -#  endif
>         VZEROUPPER_RETURN
>
> -       .p2align 4
> -L(StrncpyExit3_4):
> -       movzwl  (%rsi), %ecx
> -       movzwl  -2(%rsi, %r8), %edx
> -       mov     %cx, (%rdi)
> -       mov     %dx, -2(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(StrncpyExit5_8):
> -       mov     (%rsi), %ecx
> -       mov     -4(%rsi, %r8), %edx
> -       mov     %ecx, (%rdi)
> -       mov     %edx, -4(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(StrncpyExit9_16):
> -       mov     (%rsi), %rcx
> -       mov     -8(%rsi, %r8), %rdx
> -       mov     %rcx, (%rdi)
> -       mov     %rdx, -8(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(StrncpyExit17_32):
> -       vmovdqu (%rsi), %xmm2
> -       vmovdqu -16(%rsi, %r8), %xmm3
> -       vmovdqu %xmm2, (%rdi)
> -       vmovdqu %xmm3, -16(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(StrncpyExit33_64):
> -       /*  0/32, 31/16 */
> -       vmovdqu (%rsi), %ymm2
> -       vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
> -       vmovdqu %ymm2, (%rdi)
> -       vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(StrncpyExit65):
> -       /* 0/32, 32/32, 64/1 */
> -       vmovdqu (%rsi), %ymm2
> -       vmovdqu 32(%rsi), %ymm3
> -       mov     64(%rsi), %cl
> -       vmovdqu %ymm2, (%rdi)
> -       vmovdqu %ymm3, 32(%rdi)
> -       mov     %cl, 64(%rdi)
> -#  ifdef USE_AS_STPCPY
> -       lea     65(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, 65(%rdi)
> -#  endif
> -       VZEROUPPER_RETURN
> +# else
> +       testl   %ecx, %ecx
> +       jz      L(page_cross_continue)
>
> +       /* Traditional copy case, essentially the same as in the
> +          non-page-cross case, but since we can't reuse VMM(0) we need
> +          twice as many loads from rsi.  */
>  #  ifndef USE_AS_STRCAT
> -
> -       .p2align 4
> -L(Fill1):
> -       mov     %dl, (%rdi)
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Fill2):
> -       mov     %dx, (%rdi)
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Fill3_4):
> -       mov     %dx, (%rdi)
> -       mov     %dx, -2(%rdi, %r8)
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Fill5_8):
> -       mov     %edx, (%rdi)
> -       mov     %edx, -4(%rdi, %r8)
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Fill9_16):
> -       mov     %rdx, (%rdi)
> -       mov     %rdx, -8(%rdi, %r8)
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Fill17_32):
> -       vmovdqu %xmmZ, (%rdi)
> -       vmovdqu %xmmZ, -16(%rdi, %r8)
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec2):
> -       vmovdqu %ymm2, (%rdi, %rcx)
> -
> -       .p2align 4
> -L(CopyVecSizeVecExit):
> -       bsf     %edx, %edx
> -       add     $(VEC_SIZE - 1), %r8
> -       add     %rcx, %rdi
> -#   ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -#   endif
> -       sub     %rdx, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -
> -       .p2align 4
> -L(StrncpyFillTailWithZero):
> -       xor     %edx, %edx
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(StrncpyFillExit)
> -
> -       vmovdqu %ymmZ, (%rdi)
> -       add     $VEC_SIZE, %rdi
> -
> -       mov     %rdi, %rsi
> -       and     $(VEC_SIZE - 1), %esi
> -       sub     %rsi, %rdi
> -       add     %rsi, %r8
> -       sub     $(VEC_SIZE * 4), %r8
> -       jb      L(StrncpyFillLessFourVecSize)
> -
> -L(StrncpyFillLoopVmovdqa):
> -       vmovdqa %ymmZ, (%rdi)
> -       vmovdqa %ymmZ, VEC_SIZE(%rdi)
> -       vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
> -       vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
> -       add     $(VEC_SIZE * 4), %rdi
> -       sub     $(VEC_SIZE * 4), %r8
> -       jae     L(StrncpyFillLoopVmovdqa)
> -
> -L(StrncpyFillLessFourVecSize):
> -       add     $(VEC_SIZE * 2), %r8
> -       jl      L(StrncpyFillLessTwoVecSize)
> -       vmovdqa %ymmZ, (%rdi)
> -       vmovdqa %ymmZ, VEC_SIZE(%rdi)
> -       add     $(VEC_SIZE * 2), %rdi
> -       sub     $VEC_SIZE, %r8
> -       jl      L(StrncpyFillExit)
> -       vmovdqa %ymmZ, (%rdi)
> -       add     $VEC_SIZE, %rdi
> -       jmp     L(Fill)
> -
> -       .p2align 4
> -L(StrncpyFillLessTwoVecSize):
> -       add     $VEC_SIZE, %r8
> -       jl      L(StrncpyFillExit)
> -       vmovdqa %ymmZ, (%rdi)
> -       add     $VEC_SIZE, %rdi
> -       jmp     L(Fill)
> -
> -       .p2align 4
> -L(StrncpyFillExit):
> -       add     $VEC_SIZE, %r8
> -L(Fill):
> -       cmp     $17, %r8d
> -       jae     L(Fill17_32)
> -       cmp     $9, %r8d
> -       jae     L(Fill9_16)
> -       cmp     $5, %r8d
> -       jae     L(Fill5_8)
> -       cmp     $3, %r8d
> -       jae     L(Fill3_4)
> -       cmp     $1, %r8d
> -       ja      L(Fill2)
> -       je      L(Fill1)
> -       VZEROUPPER_RETURN
> -
> -/* end of ifndef USE_AS_STRCAT */
> +       xorl    %edx, %edx
>  #  endif
> -
> -       .p2align 4
> -L(UnalignedLeaveCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(UnalignedFourVecSizeLeaveCase2)
> -L(UnalignedFourVecSizeLeaveCase3):
> -       lea     (VEC_SIZE * 4)(%r8), %rcx
> -       and     $-VEC_SIZE, %rcx
> -       add     $(VEC_SIZE * 3), %r8
> -       jl      L(CopyVecSizeCase3)
> -       vmovdqu %ymm4, (%rdi)
> -       sub     $VEC_SIZE, %r8
> -       jb      L(CopyVecSizeCase3)
> -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> -       sub     $VEC_SIZE, %r8
> -       jb      L(CopyVecSizeCase3)
> -       vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> -       sub     $VEC_SIZE, %r8
> -       jb      L(CopyVecSizeCase3)
> -       vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
> +       bsfl    %ecx, %edx
>  #  ifdef USE_AS_STPCPY
> -       lea     (VEC_SIZE * 4)(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (VEC_SIZE * 4)(%rdi)
> +       leaq    (%rdi, %rdx), %rax
> +#  elif !defined USE_AS_STRCAT
> +       movq    %rdi, %rax
>  #  endif
> -       VZEROUPPER_RETURN
>
> -       .p2align 4
> -L(UnalignedFourVecSizeLeaveCase2):
> -       xor     %ecx, %ecx
> -       vpcmpeqb %ymm4, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $(VEC_SIZE * 3), %r8
> -       jle     L(CopyVecSizeCase2OrCase3)
> -       test    %edx, %edx
> -#  ifndef USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec4)
> -#  else
> -       jnz     L(CopyVecSize)
> -#  endif
> -       vpcmpeqb %ymm5, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       vmovdqu %ymm4, (%rdi)
> -       add     $VEC_SIZE, %rcx
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -       test    %edx, %edx
> -#  ifndef USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec5)
> -#  else
> -       jnz     L(CopyVecSize)
> -#  endif
> +       /* vzeroupper early to avoid duplicating at each return.  */
> +       COND_VZEROUPPER
>
> -       vpcmpeqb %ymm6, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> -       add     $VEC_SIZE, %rcx
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -       test    %edx, %edx
> -#  ifndef USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec6)
> -#  else
> -       jnz     L(CopyVecSize)
> -#  endif
> +       testw   %cx, %cx
> +       jz      L(page_cross_copy_16_31)
>
> -       vpcmpeqb %ymm7, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> -       lea     VEC_SIZE(%rdi, %rcx), %rdi
> -       lea     VEC_SIZE(%rsi, %rcx), %rsi
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -L(StrncpyExit):
> -       cmp     $65, %r8d
> -       je      L(StrncpyExit65)
> -       cmp     $33, %r8d
> -       jae     L(StrncpyExit33_64)
> -       cmp     $17, %r8d
> -       jae     L(StrncpyExit17_32)
> -       cmp     $9, %r8d
> -       jae     L(StrncpyExit9_16)
> -       cmp     $5, %r8d
> -       jae     L(StrncpyExit5_8)
> -       cmp     $3, %r8d
> -       jae     L(StrncpyExit3_4)
> -       cmp     $1, %r8d
> -       ja      L(StrncpyExit2)
> -       je      L(StrncpyExit1)
> -#  ifdef USE_AS_STPCPY
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi)
> -#  endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(ExitZero):
> -#  ifndef USE_AS_STRCAT
> -       mov     %rdi, %rax
> -#  endif
> -       VZEROUPPER_RETURN
> +       testb   %cl, %cl
> +       jz      L(page_cross_copy_8_15)
>
> -# endif
> +       testl   $0x7, %cl
> +       jz      L(page_cross_copy_4_7)
>
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# else
> -END (STRCAT)
> -# endif
> +       testl   %edx, %edx
> +       jz      L(page_cross_set_null_term)
> +       movzwl  (%rsi), %ecx
> +       movw    %cx, (%rdi)
> +L(page_cross_set_null_term):
> +       movb    $0, (%END_REG)
> +       ret
> +
> +       .p2align 4,, 4
> +L(page_cross_copy_4_7):
> +       movl    (%rsi), %ecx
> +       movl    -3(%rsi, %rdx), %esi
> +       movl    %ecx, (%rdi)
> +       movl    %esi, -3(%END_REG)
> +       ret
> +
> +       .p2align 4,, 4
> +L(page_cross_copy_8_15):
> +       movq    (%rsi), %rcx
> +       movq    -7(%rsi, %rdx), %rsi
> +       movq    %rcx, (%rdi)
> +       movq    %rsi, -7(%END_REG)
> +       ret
> +
> +
> +       .p2align 4,, 3
> +L(page_cross_copy_16_31):
> +       VMOVU   (%rsi), %xmm0
> +       VMOVU   -15(%rsi, %rdx), %xmm1
> +       VMOVU   %xmm0, (%rdi)
> +       VMOVU   %xmm1, -15(%END_REG)
> +       ret
> +# endif
> +
> +END(STRCPY)
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> index 0dcea18dbb..2bbdbb91ab 100644
> --- a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> @@ -1,3 +1,3 @@
> -#define USE_AS_STRNCAT
> -#define STRCAT __strncat_avx2_rtm
> -#include "strcat-avx2-rtm.S"
> +#define STRNCAT        __strncat_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "strncat-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
> index 52ecbca943..99d094af63 100644
> --- a/sysdeps/x86_64/multiarch/strncat-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
> @@ -1,7 +1,472 @@
> -#ifndef STRNCAT
> -# define STRNCAT       __strncat_avx2
> -#endif
> +/* strncat with AVX2
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (3)
> +
> +# include <sysdep.h>
> +
> +# ifndef VEC_SIZE
> +#  include "x86-avx2-vecs.h"
> +# endif
> +
> +# ifndef STRNCAT
> +#  define STRNCAT      __strncat_avx2
> +# endif
> +
> +
> +       /* Use movsb in page cross case to save code size.  */
> +# define USE_MOVSB_IN_PAGE_CROSS       1
> +
> +# ifdef USE_AS_WCSCPY
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPMIN        vpminud
> +#  define CHAR_SIZE    4
> +# else
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPMIN        vpminub
> +#  define CHAR_SIZE    1
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# define PAGE_SIZE     4096
> +
> +# define VZERO VMM(7)
> +# define VZERO_128     VMM_128(7)
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCAT)
> +       /* Filter zero length strings and very long strings.  Zero
> +          length strings just return, very long strings are handled by
> +          length strings just return; very long strings are handled by
> +# ifdef USE_AS_WCSCPY
> +       decq    %rdx
> +       movq    %rdx, %rcx
> +       shr     $56, %rcx
> +       jnz     L(zero_len)
> +       salq    $2, %rdx
> +# else
> +       decq    %rdx
> +       /* `dec` can macrofuse with `jl`. If the flag needs to become
> +          `jb` replace `dec` with `sub`.  */
> +       jl      L(zero_len)
> +# endif
> +       movq    %rdi, %rax
> +       vpxor   %VZERO_128, %VZERO_128, %VZERO_128
> +
> +# include "strcat-strlen-avx2.S"
> +
> +       movl    %esi, %ecx
> +       andl    $(PAGE_SIZE - 1), %ecx
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> +       ja      L(page_cross)
> +L(page_cross_continue):
> +       VMOVU   (%rsi), %VMM(0)
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +
> +       cmpq    $VEC_SIZE, %rdx
> +       /* `jb` because length rdx is now length - 1.  */
> +       jb      L(less_1x_vec)
> +
> +       testl   %ecx, %ecx
> +       jz      L(more_1x_vec)
> +
> +       /* Hoist this to save code size.  */
> +       COND_VZEROUPPER
> +       bsfl    %ecx, %edx
> +
> +       /* Use mask bits in rcx to detect which copy we need. If the low
> +          mask is zero then there must be a bit set in the upper half.
> +          I.e if ecx != 0 and cx == 0, then match must be upper 16
> +          bits so we use L(copy_16_31).  */
> +       testw   %cx, %cx
> +       jz      L(copy_16_31)
> +
> +       .p2align 4,, 2
> +L(copy_less_16):
> +       testb   %cl, %cl
> +       jz      L(copy_8_15)
> +
> +# ifndef USE_AS_WCSCPY
> +       testb   $0x7, %cl
> +       jz      L(copy_4_7)
> +
> +       vmovd   %xmm0, %ecx
> +       testl   %edx, %edx
> +       jz      L(set_null_term)
> +       movw    %cx, (%rdi)
> +       movzbl  (%rsi, %rdx), %ecx
> +L(set_null_term):
> +       movb    %cl, (%rdi, %rdx)
> +       ret
> +
> +       .p2align 4,, 11
> +L(copy_4_7):
> +# endif
> +       movl    -(4 - CHAR_SIZE)(%rsi, %rdx), %ecx
> +       vmovd   %xmm0, (%rdi)
> +       movl    %ecx, -(4 - CHAR_SIZE)(%rdi, %rdx)
> +       ret
> +
> +
> +       .p2align 4,, 4
> +L(less_1x_vec):
> +       btsl    %edx, %ecx
> +       COND_VZEROUPPER
> +       /* edx already a dependency.  */
> +       bsfl    %ecx, %edx
> +       testw   %cx, %cx
> +       jnz     L(copy_less_16)
> +
> +       .p2align 4,, 10
> +L(copy_16_31):
> +       VMOVU   -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
> +       VMOVU   %xmm0, (%rdi)
> +       VMOVU   %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
> +       ret
> +
> +       .p2align 4,, 10
> +L(copy_8_15):
> +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
> +       vmovq   %xmm0, (%rdi)
> +       movq    %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
> +       ret
> +
> +       .p2align 4,, 8
> +       .p2align 6,, 14
> +L(more_1x_vec):
> +       VMOVU   %VMM(0), (%rdi)
> +
> +       /* Align rsi (src) and adjust rdx/rdi (length/dst).  */
> +       addq    %rsi, %rdx
> +       subq    %rsi, %rdi
> +       orq     $(VEC_SIZE - 1), %rsi
> +       incq    %rsi
> +       addq    %rsi, %rdi
> +L(loop_last_4x_vec):
> +       subq    %rsi, %rdx
> +L(last_4x_vec):
> +       VMOVA   0(%rsi), %VMM(1)
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +
> +
> +       cmpq    $(VEC_SIZE * 2), %rdx
> +       jae     L(more_2x_vec)
> +L(last_2x_vec):
> +       tzcnt   %ecx, %ecx
> +       cmpl    %ecx, %edx
> +       jb      L(ret_vec_x1_len)
> +
> +       cmpl    $VEC_SIZE, %ecx
> +       jnz     L(ret_vec_x1)
> +
> +       VMOVA   (VEC_SIZE + 0)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), 0(%rdi)
> +
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +L(ret_vec_x2_len):
> +       /* btsl will automatically mask lower VEC_SIZE - 1 bits from
> +          edx.  */
> +       btsl    %edx, %ecx
> +       .p2align 4,, 3
> +L(ret_vec_x2):
> +       bsfl    %ecx, %ecx
> +       VMOVU   ((VEC_SIZE + 0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
> +L(return_vzeroupper):
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
> +
> +
> +       .p2align 4,, 12
> +L(ret_vec_x1_len):
> +       movl    %edx, %ecx
> +L(ret_vec_x1):
> +       VMOVU   (0 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> +       VMOVU   %VMM(1), (0 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4,, 8
> +L(ret_vec_x3_len):
> +       btsl    %edx, %ecx
> +L(ret_vec_x3):
> +       bsfl    %ecx, %ecx
> +       VMOVU   ((VEC_SIZE * 2 + 0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE * 2 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4,, 8
> +       .p2align 6,, 14
> +L(more_2x_vec):
> +       /* L(ret_vec_x1) expects ecx to have position of first match so
> +          test with bsf.  */
> +       bsfl    %ecx, %ecx
> +       jnz     L(ret_vec_x1)
> +
> +       VMOVA   (VEC_SIZE + 0)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), 0(%rdi)
> +
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x2)
> +
> +
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(3)
> +       VMOVU   %VMM(2), (VEC_SIZE + 0)(%rdi)
> +
> +       VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
>
> -#define USE_AS_STRNCAT
> -#define STRCAT STRNCAT
> -#include "strcat-avx2.S"
> +       /* Check if length is greater than 4x VEC.  */
> +       addq    $(VEC_SIZE * -4), %rdx
> +       jbe     L(more_4x_vec)
> +
> +       /* Check if length was between [VEC_SIZE * 2 + 1, VEC_SIZE * 3].
> +        */
> +       cmpl    $((VEC_SIZE * 3 - 1)-(VEC_SIZE * 4)), %edx
> +       jle     L(ret_vec_x3_len)
> +
> +L(last_vec):
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x3)
> +
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
> +       VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +L(ret_vec_x4_len):
> +       btsl    %edx, %ecx
> +L(ret_vec_x4):
> +       bsfl    %ecx, %ecx
> +       VMOVU   ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
> +       VZEROUPPER_RETURN
> +
> +
> +
> +
> +       .p2align 4,, 8
> +       .p2align 6,, 10
> +L(more_4x_vec):
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x3)
> +
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
> +       VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x4)
> +
> +       VMOVU   %VMM(4), (VEC_SIZE * 3 + 0)(%rdi)
> +
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       subq    $-(VEC_SIZE * 4), %rsi
> +       /* Recheck length before aligning.  */
> +       cmpq    $(VEC_SIZE * 4 - 1), %rdx
> +       jbe     L(last_4x_vec)
> +
> +       /* Align rsi (src) and adjust rdx/rdi (length/dst).  */
> +       addq    %rsi, %rdx
> +       subq    %rsi, %rdi
> +       andq    $(VEC_SIZE * -4), %rsi
> +
> +       /* Do first half of loop ahead of time so loop can just start by
> +          storing.  */
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %r8d
> +       addq    %rsi, %rdi
> +       testl   %r8d, %r8d
> +       jnz     L(loop_4x_done)
> +
> +       /* Use r9 for end of region before handling last 4x VEC
> +          specially.  */
> +       leaq    -(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
> +
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +       subq    $(VEC_SIZE * -4), %rsi
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> +
> +       subq    $(VEC_SIZE * -4), %rdi
> +       cmpq    %rsi, %r9
> +       jbe     L(loop_last_4x_vec)
> +
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +
> +       vpmovmskb %VMM(6), %r8d
> +
> +       testl   %r8d, %r8d
> +       jz      L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       /* L(ret_vec_x1) expects ecx to have position of first match so
> +          test with bsf.  */
> +       bsfl    %ecx, %ecx
> +       jnz     L(ret_vec_x1)
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x2)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x3)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +       bsfl    %r8d, %r8d
> +       VMOVU   ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %r8), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %r8)
> +L(return_end):
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4,, 4
> +L(page_cross):
> +       movq    %rsi, %r8
> +       andq    $(VEC_SIZE * -1), %r8
> +
> +       VPCMPEQ (%r8), %VZERO, %VMM(6)
> +
> +       vpmovmskb %VMM(6), %ecx
> +       shrxl   %esi, %ecx, %ecx
> +
> +       subl    %esi, %r8d
> +       andl    $(VEC_SIZE - 1), %r8d
> +       cmpq    %r8, %rdx
> +       jb      L(page_cross_small)
> +# if USE_MOVSB_IN_PAGE_CROSS
> +       /* Optimizing more aggressively for space as this is very cold
> +          code. This saves 2x cache lines.  */
> +
> +       /* This adds one character to the later result, which will give
> +          the correct copy bounds.  NB: this can never zero out a
> +          non-zero RCX because, to be in the page cross case, rsi cannot
> +          be aligned and we have already right-shifted rcx by the
> +          misalignment.  */
> +       shll    $CHAR_SIZE, %ecx
> +       jz      L(page_cross_continue)
> +       bsfl    %ecx, %ecx
> +       rep     movsb
> +       VZEROUPPER_RETURN
> +
> +L(page_cross_small):
> +       tzcntl  %ecx, %ecx
> +       cmpq    %rdx, %rcx
> +       cmova   %edx, %ecx
> +#  ifdef USE_AS_WCSCPY
> +       addl    $CHAR_SIZE, %ecx
> +#  else
> +       incl    %ecx
> +#  endif
> +       rep     movsb
> +       VZEROUPPER_RETURN
> +
> +# else
> +       testl   %ecx, %ecx
> +       jz      L(page_cross_continue)
> +       xorl    %edx, %edx
> +
> +
> +       /* Traditional copy case, essentially the same as in the
> +          non-page-cross case, but since we can't reuse VMM(0) we need
> +          twice as many loads from rsi.  */
> +       bsfl    %ecx, %edx
> +       COND_VZEROUPPER
> +
> +       testw   %cx, %cx
> +       jz      L(page_cross_copy_16_31)
> +
> +       testb   %cl, %cl
> +       jz      L(page_cross_copy_8_15)
> +
> +       testb   $0x7, %cl
> +       jz      L(page_cross_copy_4_7)
> +
> +       testl   %edx, %edx
> +       jz      L(page_cross_set_null_term)
> +       movzwl  (%rsi), %ecx
> +       movw    %cx, (%rdi)
> +L(page_cross_set_null_term):
> +       movb    $0, (%rdi, %rdx)
> +       ret
> +
> +
> +       .p2align 4,, 4
> +L(page_cross_copy_4_7):
> +       movl    (%rsi), %ecx
> +       movl    -3(%rsi, %rdx), %esi
> +       movl    %ecx, (%rdi)
> +       movl    %esi, -3(%rdi, %rdx)
> +       ret
> +       .p2align 4,, 4
> +L(page_cross_copy_8_15):
> +       movq    (%rsi), %rcx
> +       movq    -7(%rsi, %rdx), %rsi
> +       movq    %rcx, (%rdi)
> +       movq    %rsi, -7(%rdi, %rdx)
> +       ret
> +
> +
> +       .p2align 4,, 3
> +L(page_cross_copy_16_31):
> +       VMOVU   (%rsi), %xmm0
> +       VMOVU   -15(%rsi, %rdx), %xmm1
> +       VMOVU   %xmm0, (%rdi)
> +       VMOVU   %xmm1, -15(%rdi, %rdx)
> +       ret
> +
> +# endif
> +
> +L(zero_len):
> +       incq    %rdx
> +       jnz     OVERFLOW_STRCAT
> +       movq    %rdi, %rax
> +       ret
> +
> +
> +END(STRNCAT)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> index 79e7083299..b582a4a7a1 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> @@ -1,3 +1,3 @@
> -#define USE_AS_STRNCPY
> -#define STRCPY __strncpy_avx2_rtm
> -#include "strcpy-avx2-rtm.S"
> +#define STRNCPY        __strncpy_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
> index ce634e94fa..dfdde74751 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
> @@ -1,7 +1,738 @@
> -#ifndef STRNCPY
> -# define STRNCPY       __strncpy_avx2
> -#endif
> +/* strncpy with AVX2
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (3)
> +
> +# include <sysdep.h>
> +
> +
> +# ifndef VEC_SIZE
> +#  include "x86-avx2-vecs.h"
> +# endif
> +
> +# ifndef STRNCPY
> +#  define STRNCPY      __strncpy_avx2
> +# endif
> +
> +
> +# ifdef USE_AS_WCSCPY
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPMIN        vpminud
> +#  define CHAR_SIZE    4
> +# else
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPMIN        vpminub
> +#  define CHAR_SIZE    1
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# define PAGE_SIZE     4096
> +
> +# define VZERO VMM(7)
> +# define VZERO_128     VMM_128(7)
> +
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCPY)
> +       /* Filter zero length strings and very long strings.  Zero
> +          length strings just return; very long strings are handled by
> +          first running rep stos{b|l} to zero-fill the destination
> +          (which will almost certainly segfault) and, if that succeeds,
> +          then calling OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy,
> +          wcpcpy).  */
> +# ifdef USE_AS_WCSCPY
> +       decq    %rdx
> +       movq    %rdx, %rax
> +       /* 56 is end of max supported address space.  */
> +       shr     $56, %rax
> +       jnz     L(zero_len)
> +       salq    $2, %rdx
> +# else
> +       decq    %rdx
> +       /* `dec` can macrofuse with `jl`. If the flag needs to become
> +          `jb` replace `dec` with `sub`.  */
> +       jl      L(zero_len)
> +# endif
> +
> +       vpxor   %VZERO_128, %VZERO_128, %VZERO_128
> +       movl    %esi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(page_cross)
> +
> +L(page_cross_continue):
> +       VMOVU   (%rsi), %VMM(0)
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +
> +       /* If no STPCPY just save end ahead of time.  */
> +# ifndef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# elif defined USE_AS_WCSCPY
> +       /* Clear dependency as nearly all return code for wcpncpy uses
> +          `setc %al`.  */
> +       xorl    %eax, %eax
> +# endif
> +
> +       cmpq    $(VEC_SIZE - CHAR_SIZE), %rdx
> +       /* `jb` because length rdx is now length - CHAR_SIZE.  */
> +       jbe     L(less_1x_vec)
> +
> +       /* This may store more than strictly needed, but that's fine
> +          because we still need to zero-fill.  */
> +       VMOVU   %VMM(0), (%rdi)
> +
> +       testl   %ecx, %ecx
> +       jnz     L(zfill)
> +
> +       /* Align.  */
> +       addq    %rsi, %rdx
> +       subq    %rsi, %rdi
> +       orq     $(VEC_SIZE - 1), %rsi
> +       incq    %rsi
> +L(last_4x_vec):
> +       addq    %rsi, %rdi
> +L(loop_last_4x_vec):
> +       subq    %rsi, %rdx
> +
> +
> +       VMOVA   0(%rsi), %VMM(1)
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +
> +       cmpq    $(VEC_SIZE * 2), %rdx
> +       jae     L(more_2x_vec)
> +
> +       cmpl    $(VEC_SIZE), %edx
> +       jb      L(ret_vec_x1_len)
> +
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x1)
> +
> +       VPCMPEQ VEC_SIZE(%rsi), %VZERO, %VMM(6)
> +       VMOVU   %VMM(1), (%rdi)
> +       vpmovmskb %VMM(6), %ecx
> +       shlq    $VEC_SIZE, %rcx
> +L(ret_vec_x1_len):
> +       tzcntq  %rcx, %rcx
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x1_len_no_zfill)
> +       /* Fall through (expectation) is copy len < buffer len.  */
> +       VMOVU   %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +L(ret_vec_x1_len_no_zfill_mov):
> +       movl    %ecx, %edx
> +# ifdef USE_AS_STPCPY
> +       /* clear flags.  */
> +       xorl    %ecx, %ecx
> +# endif
> +L(ret_vec_x1_len_no_zfill):
> +       VMOVU   ((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> +       VMOVU   %VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       setc    %al
> +       addq    %rdx, %rdi
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +L(return_vzeroupper):
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
> +
> +       .p2align 4,, 6
> +L(ret_vec_x1):
> +       bsfl    %ecx, %ecx
> +       VMOVU   %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +       subl    %ecx, %edx
> +       /* Check if we need to reload/store.  */
> +       cmpl    $VEC_SIZE, %edx
> +       jb      L(ret_vec_x1_len_no_zfill_mov)
> +       /* Otherwise safe to just store directly.  */
> +       VMOVU   %VMM(1), (%rdi)
> +       VMOVU   %VZERO, (%rdi, %rcx)
> +# ifdef USE_AS_STPCPY
> +       leaq    (%rdi, %rcx), %rax
> +# endif
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4,, 12
> +L(more_2x_vec):
> +       VMOVU   %VMM(1), (%rdi)
> +       testl   %ecx, %ecx
> +       /* Must fill at least 2x VEC.  */
> +       jnz     L(zfill_vec1)
> +
> +       VMOVA   VEC_SIZE(%rsi), %VMM(2)
> +       VMOVU   %VMM(2), VEC_SIZE(%rdi)
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       /* Must fill at least 1x VEC.  */
> +       jnz     L(zfill_vec2)
> +
> +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(3)
> +       VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +
> +       /* Check if len is more than 4x VEC.  -CHAR_SIZE because rdx is
> +          len - CHAR_SIZE.  */
> +       cmpq    $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
> +       ja      L(more_4x_vec)
> +
> +       subl    $(VEC_SIZE * 3), %edx
> +       jb      L(ret_vec_x3_len)
> +
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x3)
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
> +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi)
> +       vpmovmskb %VMM(6), %ecx
> +       tzcntl  %ecx, %ecx
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x4_len_no_zfill)
> +       /* Fall through (expectation) is copy len < buffer len.  */
> +       VMOVU   %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +       movl    %ecx, %edx
> +L(ret_vec_x4_len_no_zfill):
> +       VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       setc    %al
> +       addq    %rdx, %rdi
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       leal    (VEC_SIZE * 3 + 0)(%edx), %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +       VZEROUPPER_RETURN
> +
> +
> +L(ret_vec_x3_len):
> +       addl    $(VEC_SIZE * 1), %edx
> +       tzcntl  %ecx, %ecx
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x3_len_no_zfill)
> +       /* Fall through (expectation) is copy len < buffer len.  */
> +       VMOVU   %VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +L(ret_vec_x3_len_no_zfill_mov):
> +       movl    %ecx, %edx
> +# ifdef USE_AS_STPCPY
> +       /* clear flags.  */
> +       xorl    %ecx, %ecx
> +# endif
> +       .p2align 4,, 4
> +L(ret_vec_x3_len_no_zfill):
> +       VMOVU   ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       setc    %al
> +       addq    %rdx, %rdi
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       leal    (VEC_SIZE * 2 + 0)(%rdx), %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +       VZEROUPPER_RETURN
> +
> +
> +       .p2align 4,, 8
> +L(ret_vec_x3):
> +       bsfl    %ecx, %ecx
> +       VMOVU   %VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
> +       subl    %ecx, %edx
> +       jl      L(ret_vec_x3_len_no_zfill_mov)
> +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
> +# ifdef USE_AS_STPCPY
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx), %rax
> +# endif
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4,, 8
> +L(more_4x_vec):
> +
> +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi)
> +       testl   %ecx, %ecx
> +       jnz     L(zfill_vec3)
> +
> +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(4)
> +       VMOVU   %VMM(4), (VEC_SIZE * 3)(%rdi)
> +       VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(zfill_vec4)
> +
> +       movq    %rdx, %rcx
> +       addq    %rsi, %rdx
> +       subq    %rsi, %rdi
> +       subq    $-(VEC_SIZE * 4), %rsi
> +       /* Recheck length before aligning.  */
> +       cmpq    $(VEC_SIZE * 8 - CHAR_SIZE), %rcx
> +       jbe     L(last_4x_vec)
> +
> +       andq    $(VEC_SIZE * -4), %rsi
> +
> +       /* Do first half of loop ahead of time so loop can just start by
> +          storing.  */
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %r8d
> +       addq    %rsi, %rdi
> +       testl   %r8d, %r8d
> +       jnz     L(loop_4x_done)
> +
> +       /* Use r9 as end register.  */
> +       leaq    -(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
> +
> +       .p2align 4,, 11
> +L(loop_4x_vec):
>
> -#define USE_AS_STRNCPY
> -#define STRCPY STRNCPY
> -#include "strcpy-avx2.S"
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +       subq    $(VEC_SIZE * -4), %rsi
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> +
> +       subq    $(VEC_SIZE * -4), %rdi
> +       cmpq    %rsi, %r9
> +       jbe     L(loop_last_4x_vec)
> +
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +
> +       vpmovmskb %VMM(6), %r8d
> +
> +       testl   %r8d, %r8d
> +       jz      L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +       subq    %rsi, %rdx
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(zfill_vec1)
> +
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(zfill_vec2)
> +
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(zfill_vec3)
> +
> +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> +       movl    %r8d, %ecx
> +
> +       // Zfill more....
> +
> +       .p2align 4,, 4
> +L(zfill_vec4):
> +       addq    $(VEC_SIZE * 2), %rdi
> +       subq    $(VEC_SIZE * 2), %rdx
> +L(zfill_vec2):
> +       shlq    $VEC_SIZE, %rcx
> +L(zfill):
> +       bsfq    %rcx, %rcx
> +       subq    %rcx, %rdx
> +       addq    %rcx, %rdi
> +# ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +L(zfill_from_page_cross):
> +       cmpq    $VEC_SIZE, %rdx
> +       jb      L(zfill_less_vec_vzeroupper)
> +
> +L(zfill_more_1x_vec):
> +       VMOVU   %VZERO, CHAR_SIZE(%rdi)
> +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
> +       cmpq    $(VEC_SIZE * 2), %rdx
> +       jae     L(zfill_more_2x_vec)
> +L(zfill_done0):
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4,, 8
> +L(zfill_vec3):
> +       addq    $(VEC_SIZE * 2), %rdi
> +       subq    $(VEC_SIZE * 2), %rdx
> +       .p2align 4,, 2
> +L(zfill_vec1):
> +       bsfl    %ecx, %ecx
> +       addq    %rcx, %rdi
> +       subq    %rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +       /* zfill from vec1/vec3 must set at least 2x VECs.  */
> +
> +       VMOVU   %VZERO, CHAR_SIZE(%rdi)
> +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
> +       cmpq    $(VEC_SIZE * 2), %rdx
> +       jb      L(zfill_done0)
> +L(zfill_more_2x_vec):
> +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
> +       subq    $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
> +       jbe     L(zfill_done)
> +
> +       addq    %rdi, %rdx
> +       VMOVU   %VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
> +       VMOVU   %VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
> +
> +
> +       VMOVU   %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> +       VMOVU   %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> +
> +       subq    $-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
> +       cmpq    %rdi, %rdx
> +       jbe     L(zfill_done)
> +
> +       andq    $-(VEC_SIZE), %rdi
> +       .p2align 4,, 12
> +L(zfill_loop_4x_vec):
> +       VMOVA   %VZERO, (VEC_SIZE * 0)(%rdi)
> +       VMOVA   %VZERO, (VEC_SIZE * 1)(%rdi)
> +       VMOVA   %VZERO, (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VZERO, (VEC_SIZE * 3)(%rdi)
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       cmpq    %rdi, %rdx
> +       ja      L(zfill_loop_4x_vec)
> +L(zfill_done):
> +       VZEROUPPER_RETURN
> +
> +
> +       .p2align 4,, 8
> +L(copy_1x):
> +       VMOVU   %VMM(0), (%rdi)
> +       testl   %ecx, %ecx
> +       jz      L(ret_32_32)
> +L(zfill_less_vec):
> +       bsfl    %ecx, %ecx
> +L(zfill_less_vec_no_bsf):
> +       subq    %rcx, %rdx
> +       addq    %rcx, %rdi
> +# ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +L(zfill_less_vec_vzeroupper):
> +       COND_VZEROUPPER
> +       /* We are taking advantage of the fact that to be here we must
> +          be writing the null terminator at (%rdi, %rcx), so we have a
> +          byte of leeway for overwriting.  */
> +       cmpl    $16, %edx
> +       jb      L(zfill_less_16)
> +       VMOVU   %VZERO_128, (%rdi)
> +       VMOVU   %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
> +       ret
> +# ifdef USE_AS_STPCPY
> +L(ret_32_32):
> +       leaq    CHAR_SIZE(%rdi, %rdx), %rax
> +       VZEROUPPER_RETURN
> +# endif
> +
> +       .p2align 4,, 4
> +L(copy_16_31):
> +       /* Overfill to avoid branches.  */
> +       vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
> +       vmovdqu %xmm0, (%rdi)
> +       vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
> +       cmpl    %ecx, %edx
> +       ja      L(zfill_less_vec_no_bsf)
> +# ifndef USE_AS_STPCPY
> +L(ret_32_32):
> +# else
> +#  ifdef USE_AS_WCSCPY
> +       setc    %al
> +       addq    %rdx, %rdi
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4,, 4
> +L(copy_8_15):
> +       /* Overfill to avoid branches.  */
> +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
> +       vmovq   %xmm0, (%rdi)
> +       movq    %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_8_15)
> +       subq    %rcx, %rdx
> +       addq    %rcx, %rdi
> +# ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +       .p2align 4,, 8
> +L(zfill_less_16):
> +       xorl    %ecx, %ecx
> +       cmpl    $8, %edx
> +       jb      L(zfill_less_8)
> +       movq    %rcx, (%rdi)
> +       movq    %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
> +# ifndef USE_AS_STPCPY
> +L(ret_8_15):
> +# endif
> +       ret
> +
> +
> +       .p2align 4,, 8
> +L(less_1x_vec):
> +       /* Reuse flag from `cmp $VEC_SIZE, %rdx`.  The idea is that many
> +          buffer sizes are aligned conventionally.  */
> +       je      L(copy_1x)
> +
> +       tzcntl  %ecx, %ecx
> +       cmpl    $16, %edx
> +       jae     L(copy_16_31)
> +
> +       COND_VZEROUPPER
> +       cmpl    $8, %edx
> +       jae     L(copy_8_15)
> +# ifdef USE_AS_WCSCPY
> +       testl   %ecx, %ecx
> +       jz      L(zfill_less_8_set_ret)
> +
> +       movl    (%rsi, %rdx), %esi
> +       vmovd   %xmm0, (%rdi)
> +       movl    %esi, (%rdi, %rdx)
> +
> +#  ifdef USE_AS_STPCPY
> +       cmpl    %ecx, %edx
> +L(ret_8_15):
> +       setc    %al
> +       addq    %rdx, %rdi
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  endif
> +       ret
> +L(zfill_less_8_set_ret):
> +       xorl    %ecx, %ecx
> +#  ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#  endif
> +L(zfill_less_8):
> +       movl    %ecx, (%rdi)
> +       movl    %ecx, (%rdi, %rdx)
> +       ret
> +
> +# else
> +       cmpl    $3, %edx
> +       jb      L(copy_0_3)
> +       /* Overfill to avoid branches.  */
> +       movl    -3(%rsi, %rdx), %esi
> +       vmovd   %xmm0, (%rdi)
> +       movl    %esi, -3(%rdi, %rdx)
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_4_7)
> +       subq    %rcx, %rdx
> +       addq    %rcx, %rdi
> +#  ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#  endif
> +       xorl    %ecx, %ecx
> +       .p2align 4,, 8
> +L(zfill_less_8):
> +       cmpl    $3, %edx
> +       jb      L(zfill_less_3)
> +       movl    %ecx, (%rdi)
> +       movl    %ecx, -3(%rdi, %rdx)
> +#  ifdef USE_AS_STPCPY
> +       ret
> +#  endif
> +
> +L(ret_4_7):
> +#  ifdef USE_AS_STPCPY
> +L(ret_8_15):
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#  endif
> +       ret
> +
> +       .p2align 4,, 4
> +L(zfill_less_3):
> +       testl   %edx, %edx
> +       jz      L(zfill_1)
> +       movw    %cx, (%rdi)
> +L(zfill_1):
> +       movb    %cl, (%rdi, %rdx)
> +       ret
> +
> +       .p2align 4,, 8
> +L(copy_0_3):
> +       vmovd   %xmm0, %r8d
> +       testl   %edx, %edx
> +       jz      L(copy_1)
> +       movw    %r8w, (%rdi)
> +       cmpl    %ecx, %edx
> +       ja      L(zfill_from_1)
> +       movzbl  (%rsi, %rdx), %r8d
> +#  ifdef USE_AS_STPCPY
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +       movb    %r8b, (%rdi, %rdx)
> +       ret
> +#  endif
> +
> +L(copy_1):
> +#  ifdef USE_AS_STPCPY
> +       movl    %edx, %eax
> +       cmpl    %ecx, %edx
> +       adcq    %rdi, %rax
> +#  endif
> +#  ifdef USE_AS_WCSCPY
> +       vmovd   %xmm0, (%rdi)
> +#  else
> +       movb    %r8b, (%rdi, %rdx)
> +#  endif
> +       ret
> +# endif
> +
> +       .p2align 4,, 2
> +L(zero_len):
> +       movq    %rdi, %rax
> +       ret
> +# ifndef USE_AS_WCSCPY
> +       .p2align 4,, 8
> +L(zfill_from_1):
> +#  ifdef USE_AS_STPCPY
> +       leaq    (%rdi, %rcx), %rax
> +#  endif
> +       movw    $0, -1(%rdi, %rdx)
> +       ret
> +# endif
> +
> +       .p2align 4,, 4
> +       .p2align 6,, 8
> +L(page_cross):
> +       movq    %rsi, %rax
> +       andq    $(VEC_SIZE * -1), %rax
> +
> +       VPCMPEQ (%rax), %VZERO, %VMM(6)
> +
> +       vpmovmskb %VMM(6), %ecx
> +       shrxl   %esi, %ecx, %ecx
> +
> +       subl    %esi, %eax
> +       andl    $(VEC_SIZE - 1), %eax
> +       cmpq    %rax, %rdx
> +       jb      L(page_cross_small)
> +       /* Optimizing more aggressively for space as this is very cold
> +          code. This saves 2x cache lines.  */
> +
> +       /* If rcx is non-zero then continue.  */
> +       shl     $CHAR_SIZE, %ecx
> +       jz      L(page_cross_continue)
> +       bsf     %ecx, %ecx
> +
> +       subq    %rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +       leaq    -CHAR_SIZE(%rdi, %rcx), %rax
> +# else
> +       movq    %rdi, %rax
> +# endif
> +
> +       rep     movsb
> +# ifdef USE_AS_WCSCPY
> +       movl    $0, (%rdi)
> +# else
> +       movb    $0, (%rdi)
> +# endif
> +       jmp     L(zfill_from_page_cross)
> +
> +L(page_cross_small):
> +       tzcntl  %ecx, %ecx
> +       xorl    %eax, %eax
> +       cmpl    %ecx, %edx
> +       jbe     L(page_cross_copy_only)
> +
> +       /* Do a zfill of the tail before copying.  */
> +       movq    %rdi, %r9
> +       movl    %ecx, %r8d
> +
> +       subl    %ecx, %edx
> +       leaq    CHAR_SIZE(%rdi, %rcx), %rdi
> +       movl    %edx, %ecx
> +       rep     stosb
> +       movq    %r9, %rdi
> +       movl    %r8d, %edx
> +L(page_cross_copy_only):
> +       leal    CHAR_SIZE(%rdx), %ecx
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       setc    %al
> +       addq    %rdx, %rdi
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# else
> +       movq    %rdi, %rax
> +# endif
> +       rep     movsb
> +       ret
> +
> +
> +L(best_effort_strncpy):
> +       movq    %rdx, %rcx
> +       xorl    %eax, %eax
> +       movq    %rdi, %r8
> +       /* The length is >= 2^63.  We fully expect the rep stos to
> +          segfault; if that doesn't happen then just strcpy to finish.
> +        */
> +# ifdef USE_AS_WCSCPY
> +       rep     stosl
> +# else
> +       rep     stosb
> +# endif
> +       movq    %r8, %rdi
> +       jmp     OVERFLOW_STRCPY
> +
> +
> +
> +END(STRNCPY)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> index dca1089060..01bead1435 100644
> --- a/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> +++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> @@ -27,7 +27,10 @@
>  #define VEC_SIZE                       32
>  #include "x86-vec-macros.h"
>
> -#define USE_WITH_AVX           1
> +#ifndef USE_WITH_AVX2
> +# define USE_WITH_AVX          1
> +#endif
> +
>  #define SECTION(p)                     p##.avx
>
>  /* 4-byte mov instructions with AVX2.  */
> diff --git a/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
> new file mode 100644
> index 0000000000..a5966701ec
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
> @@ -0,0 +1,26 @@
> +/* Common config for AVX2-RTM VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_AVX2_RTM_VECS_H
> +#define _X86_AVX2_RTM_VECS_H                   1
> +
> +#define USE_WITH_AVX2          1
> +#include "x86-avx-rtm-vecs.h"
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-avx2-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
> new file mode 100644
> index 0000000000..16d7ae5147
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
> @@ -0,0 +1,27 @@
> +/* Common config for AVX2 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_AVX2_VECS_H
> +#define _X86_AVX2_VECS_H                       1
> +
> +#define USE_WITH_AVX2          1
> +
> +#include "x86-avx-vecs.h"
> +
> +#endif
> --
> 2.34.1
>

[-- Attachment #2: strcpy-avx2-results.tar.gz --]
[-- Type: application/gzip, Size: 102168 bytes --]


* Re: [PATCH v1 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-03  8:53 ` [PATCH v1 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
@ 2022-11-03  8:55   ` Noah Goldstein
  2022-11-04 23:04   ` [PATCH v4 1/4] " Noah Goldstein
  2022-11-09  1:38   ` [PATCH v5 " Noah Goldstein
  2 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-03  8:55 UTC (permalink / raw)
  To: libc-alpha; +Cc: hjl.tools, carlos

[-- Attachment #1: Type: text/plain, Size: 106358 bytes --]

On Thu, Nov 3, 2022 at 1:53 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>     1. Use more overlapping stores to avoid branches (see the first
>        sketch after this list).
>     2. Reduce how unrolled the aligning copies are (this is more of a
>        code-size save; it's a negative for some sizes in terms of
>        perf).
>     3. Improve the loop a bit (similar to what we do in strlen with
>        2x vpminu + kortest instead of 3x vpminu + kmov + test; see
>        the second sketch after this list).
>     4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
>        number that are taken.
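
As an illustration of optimization 1 (not from the patch itself): the
"overlapping stores" idea is to copy a short range by loading its head
and its tail and storing both, letting the two stores overlap rather
than branching on the exact length.  A minimal C sketch, with a made-up
helper name and the 4..8 byte range chosen only for illustration; the
copy_4_7/copy_8_15 style paths in the patch apply the same pattern at
other size ranges:

    #include <string.h>

    /* Copy n bytes, 4 <= n <= 8, with two possibly-overlapping 4-byte
       stores instead of a branch tree on n.  */
    static void
    copy_4_to_8 (char *dst, const char *src, size_t n)
    {
      unsigned int head, tail;
      memcpy (&head, src, 4);           /* First 4 bytes.  */
      memcpy (&tail, src + n - 4, 4);   /* Last 4 bytes.  */
      memcpy (dst, &head, 4);
      memcpy (dst + n - 4, &tail, 4);   /* May overlap the first store.  */
    }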
>
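
And a rough sketch of the zero-detection idea behind optimization 3:
fold several vectors together with byte-wise minimums so that a single
compare against zero (plus one mask test) decides whether any of them
contains a null byte.  This uses AVX2 intrinsics and a made-up function
name purely for illustration; the EVEX code itself works on mask
registers (vpminu + kortest), so this is an approximation of the idea,
not the implementation:

    #include <immintrin.h>

    /* Nonzero iff any of the four 32-byte vectors at p (32-byte
       aligned) contains a zero byte: three byte-wise minimums fold the
       four vectors into one, so a single compare + movemask covers the
       whole 128-byte block.  Build with -mavx2.  */
    static int
    block_has_zero_byte (const unsigned char *p)
    {
      __m256i v0 = _mm256_load_si256 ((const __m256i *) (p + 0));
      __m256i v1 = _mm256_load_si256 ((const __m256i *) (p + 32));
      __m256i v2 = _mm256_load_si256 ((const __m256i *) (p + 64));
      __m256i v3 = _mm256_load_si256 ((const __m256i *) (p + 96));
      __m256i m01 = _mm256_min_epu8 (v0, v1);
      __m256i m23 = _mm256_min_epu8 (v2, v3);
      __m256i m = _mm256_min_epu8 (m01, m23);
      __m256i z = _mm256_cmpeq_epi8 (m, _mm256_setzero_si256 ());
      return _mm256_movemask_epi8 (z);
    }
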
> Performance Changes:
>
>     Times are from N = 10 runs of the benchmark suite and are
>     reported as geometric mean of all ratios of
>     New Implementation / Old Implementation.
>
>     stpcpy-evex      -> 0.922
>     strcat-evex      -> 0.985
>     strcpy-evex      -> 0.880
>
>     strncpy-evex     -> 0.831
>     stpncpy-evex     -> 0.780
>
>     strncat-evex     -> 0.978
>
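
(The summary numbers here and in the code-size table below are
aggregated as a geometric mean of per-benchmark new/old ratios.  A
self-contained C example of that aggregation; the ratio values are
placeholders, not data from these runs:)

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Hypothetical per-benchmark time ratios (new / old).  */
      double ratios[] = { 0.91, 0.87, 0.95, 1.02 };
      size_t n = sizeof ratios / sizeof ratios[0];
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (ratios[i]);
      /* Geometric mean = exp of the mean of the logs.  Link with -lm.  */
      printf ("geomean = %.3f\n", exp (log_sum / (double) n));
      return 0;
    }
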
> Code Size Changes:
>     function         -> Bytes New / Bytes Old -> Ratio
>
>     strcat-evex      -> 819  / 1874 -> 0.437
>     strcpy-evex      -> 700  / 1074 -> 0.652
>     stpcpy-evex      -> 735  / 1094 -> 0.672
>
>     strncpy-evex     -> 1397 / 2611 -> 0.535
>     stpncpy-evex     -> 1489 / 2691 -> 0.553
>
>     strncat-evex     -> 1166 / 2832 -> 0.412
>
> Notes:
>     Because of the significant difference between the
>     implementations they are split into three files.
>
>     strcpy-evex.S    -> strcpy, stpcpy, strcat
>     strncpy-evex.S   -> strncpy
>     strncat-evex.S   -> strncat
>
>     I couldn't find a way to merge them without making the ifdefs
>     incredibly difficult to follow.
>
>     All implementations can be made evex512 by including
>     "x86-evex512-vecs.h" at the top.
>
> Full check passes on x86-64 and build succeeds for all ISA levels w/
> and w/o multiarch.
> ---
>
> Results attached.
>  sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
>  sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
>  sysdeps/x86_64/multiarch/strcat-strlen-evex.S |   88 ++
>  sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
>  sysdeps/x86_64/multiarch/strncat-evex.S       |  517 ++++++-
>  sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
>  .../multiarch/strncpy-or-cat-overflow-def.h   |   65 +
>  7 files changed, 2070 insertions(+), 1173 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
>
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> index 99ea76a372..3693491baa 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> @@ -3,6 +3,5 @@
>  #endif
>
>  #define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY STPNCPY
> -#include "strcpy-evex.S"
> +#define STRNCPY        STPNCPY
> +#include "strncpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
> index 0e2df947e9..b4207b7889 100644
> --- a/sysdeps/x86_64/multiarch/strcat-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcat-evex.S
> @@ -1,286 +1,7 @@
> -/* strcat with 256-bit EVEX instructions.
> -   Copyright (C) 2021-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (4)
> -
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_evex
> -# endif
> -
> -# define VMOVU         vmovdqu64
> -# define VMOVA         vmovdqa64
> -
> -/* zero register */
> -# define XMMZERO       xmm16
> -# define YMMZERO       ymm16
> -# define YMM0          ymm17
> -# define YMM1          ymm18
> -
> -# define USE_AS_STRCAT
> -
> -/* Number of bytes in a vector register */
> -# define VEC_SIZE      32
> -
> -       .section .text.evex,"ax",@progbits
> -ENTRY (STRCAT)
> -       mov     %rdi, %r9
> -# ifdef USE_AS_STRNCAT
> -       mov     %rdx, %r8
> -# endif
> -
> -       xor     %eax, %eax
> -       mov     %edi, %ecx
> -       and     $((VEC_SIZE * 4) - 1), %ecx
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> -       cmp     $(VEC_SIZE * 3), %ecx
> -       ja      L(fourth_vector_boundary)
> -       vpcmpb  $0, (%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_first_vector)
> -       mov     %rdi, %rax
> -       and     $-VEC_SIZE, %rax
> -       jmp     L(align_vec_size_start)
> -L(fourth_vector_boundary):
> -       mov     %rdi, %rax
> -       and     $-VEC_SIZE, %rax
> -       vpcmpb  $0, (%rax), %YMMZERO, %k0
> -       mov     $-1, %r10d
> -       sub     %rax, %rcx
> -       shl     %cl, %r10d
> -       kmovd   %k0, %edx
> -       and     %r10d, %edx
> -       jnz     L(exit)
> -
> -L(align_vec_size_start):
> -       vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -       add     $(VEC_SIZE * 4), %rax
> -       kmovd   %k4, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -       kmovd   %k4, %edx
> -       add     $(VEC_SIZE * 4), %rax
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -       add     $(VEC_SIZE * 4), %rax
> -       kmovd   %k4, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -       add     $(VEC_SIZE * 5), %rax
> -       kmovd   %k4, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> -       add     $VEC_SIZE, %rax
> -       kmovd   %k0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> -       add     $VEC_SIZE, %rax
> -       kmovd   %k0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k1
> -       add     $VEC_SIZE, %rax
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       add     $VEC_SIZE, %rax
> -
> -       .p2align 4
> -L(align_four_vec_loop):
> -       VMOVA   (%rax), %YMM0
> -       VMOVA   (VEC_SIZE * 2)(%rax), %YMM1
> -       vpminub VEC_SIZE(%rax), %YMM0, %YMM0
> -       vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
> -       vpminub %YMM0, %YMM1, %YMM0
> -       /* If K0 != 0, there is a null byte.  */
> -       vpcmpb  $0, %YMM0, %YMMZERO, %k0
> -       add     $(VEC_SIZE * 4), %rax
> -       ktestd  %k0, %k0
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
> -       sub     $(VEC_SIZE * 5), %rax
> -       kmovd   %k0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 4), %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit):
> -       sub     %rdi, %rax
> -L(exit_null_on_first_vector):
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_second_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $VEC_SIZE, %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_third_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 2), %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_fourth_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 3), %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_fifth_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 4), %rax
> -
> -       .p2align 4
> -L(StartStrcpyPart):
> -       lea     (%r9, %rax), %rdi
> -       mov     %rsi, %rcx
> -       mov     %r9, %rax      /* save result */
> -
> -# ifdef USE_AS_STRNCAT
> -       test    %r8, %r8
> -       jz      L(ExitZero)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-evex.S"
> +#ifndef STRCAT
> +# define STRCAT        __strcat_evex
>  #endif
> +
> +#define USE_AS_STRCAT
> +#define STRCPY STRCAT
> +#include "strcpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> new file mode 100644
> index 0000000000..9813d38613
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> @@ -0,0 +1,88 @@
> +    /* Simple strlen implementation that ends at L(strcat_strlen_done).  */
> +       vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> +       /* Pared down strlen implementation.  We never commit to 4x
> +          loop as we are expecting a relatively short string and want
> +          to minimize code size.  */
> +       movq    %rdi, %r8
> +       andq    $(VEC_SIZE * -1), %r8
> +       VPCMPEQ (%r8), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +#ifdef USE_AS_WCSCPY
> +       subl    %r8d, %edi
> +       shrl    $2, %edi
> +#endif
> +       shrx    %VRDI, %VRCX, %VRCX
> +#ifdef USE_AS_WCSCPY
> +       movq    %rax, %rdi
> +#endif
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v0)
> +
> +
> +       VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +       leaq    (VEC_SIZE)(%r8), %rdi
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v0)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v1)
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v2)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v3)
> +
> +       andq    $-(VEC_SIZE * 4), %rdi
> +       .p2align 4,, 8
> +L(strlen_loop_4x_vec):
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(0)
> +       VPMIN   (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
> +       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(2)
> +       VPMIN   (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
> +       VPTESTN %VMM(1), %VMM(1), %k1
> +       VPTESTN %VMM(3), %VMM(3), %k3
> +       subq    $(VEC_SIZE * -4), %rdi
> +       KORTEST %k1, %k3
> +       jz      L(strlen_loop_4x_vec)
> +
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v0)
> +
> +       KMOV    %k1, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v1)
> +
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v2)
> +
> +       KMOV    %k3, %VRCX
> +L(bsf_and_done_v3):
> +       addq    $VEC_SIZE, %rdi
> +L(bsf_and_done_v2):
> +       bsf     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
> +       jmp     L(strcat_strlen_done)
> +
> +       .p2align 4,, 4
> +L(bsf_and_done_v1):
> +       addq    $VEC_SIZE, %rdi
> +L(bsf_and_done_v0):
> +       bsf     %VRCX, %VRCX
> +#ifdef USE_AS_WCSCPY
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +#else
> +       addq    %rcx, %rdi
> +#endif
> +L(strcat_strlen_done):
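The first-vector handling above is the usual align-down-and-shift trick: load the whole VEC_SIZE-aligned block containing the start of dst (which cannot cross into the next page), compare it against zero into a mask, then shift the mask right by the misalignment so bits belonging to bytes before the start are discarded.  A hedged intrinsics sketch of the same idea for the byte (strcat) case with VEC_SIZE == 64; the helper name is made up and it only covers the very first vector (wcscpy additionally divides the shift count by CHAR_SIZE since its mask bits are per dword):

#include <immintrin.h>
#include <stdint.h>

/* Null-byte mask for the first (possibly partial) vector, with bits
   below the misaligned start already shifted out.  A zero result
   means the caller must keep scanning the following vectors.
   Assumes AVX512BW (compile with -mavx512bw).  */
static uint64_t
first_vec_null_mask (const char *s)
{
  /* The aligned 64-byte load stays within s's page, so it cannot
     fault even though it reads before s.  */
  const void *aligned = (const void *) ((uintptr_t) s & ~(uintptr_t) 63);
  __m512i v = _mm512_load_si512 (aligned);
  uint64_t m = _mm512_cmpeq_epi8_mask (v, _mm512_setzero_si512 ());
  return m >> ((uintptr_t) s & 63);   /* the shrx in the code above */
}
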
> diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
> index 82e45ac675..1ba0195ed2 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
> @@ -1,4 +1,4 @@
> -/* strcpy with 256-bit EVEX instructions.
> +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
>     Copyright (C) 2021-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
> @@ -17,990 +17,526 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #include <isa-level.h>
> -
>  #if ISA_SHOULD_BUILD (4)
>
>
> -# ifndef USE_AS_STRCAT
> -#  include <sysdep.h>
> +       /* Use evex-masked stores for small sizes. Turned off at the
> +          moment.  */
> +# define USE_EVEX_MASKED_STORE 0
> +       /* Use movsb in page cross case to save code size.  */
> +# define USE_MOVSB_IN_PAGE_CROSS       1
>
> -#  ifndef STRCPY
> -#   define STRCPY  __strcpy_evex
> -#  endif
> +# include <sysdep.h>
>
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
>  # endif
>
> -# define VMOVU         vmovdqu64
> -# define VMOVA         vmovdqa64
> -
> -/* Number of bytes in a vector register */
> -# ifndef VEC_SIZE
> -#  define VEC_SIZE     32
> +# ifndef STRCPY
> +#  define STRCPY       __strcpy_evex
>  # endif
>
> -# define XMM2          xmm18
> -# define XMM3          xmm19
>
> -# define YMM2          ymm18
> -# define YMM3          ymm19
> -# define YMM4          ymm20
> -# define YMM5          ymm21
> -# define YMM6          ymm22
> -# define YMM7          ymm23
> +# ifdef USE_AS_WCSCPY
> +#  define VMOVU_MASK   vmovdqu32
> +#  define VPMIN        vpminud
> +#  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
> +#  define VPCMPEQ      vpcmpeqd
> +#  define CHAR_SIZE    4
>
> -# ifndef USE_AS_STRCAT
> +#  define REP_MOVS     rep movsd
>
> -/* zero register */
> -#  define XMMZERO      xmm16
> -#  define YMMZERO      ymm16
> -#  define YMM1         ymm17
> -
> -       .section .text.evex,"ax",@progbits
> -ENTRY (STRCPY)
> -#  ifdef USE_AS_STRNCPY
> -       mov     %RDX_LP, %R8_LP
> -       test    %R8_LP, %R8_LP
> -       jz      L(ExitZero)
> -#  endif
> -       mov     %rsi, %rcx
> -#  ifndef USE_AS_STPCPY
> -       mov     %rdi, %rax      /* save result */
> -#  endif
> +#  define USE_WIDE_CHAR
> +# else
> +#  define VMOVU_MASK   vmovdqu8
> +#  define VPMIN        vpminub
> +#  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
> +#  define VPCMPEQ      vpcmpeqb
> +#  define CHAR_SIZE    1
>
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> +#  define REP_MOVS     rep movsb
>  # endif
>
> -       and     $((VEC_SIZE * 4) - 1), %ecx
> -       cmp     $(VEC_SIZE * 2), %ecx
> -       jbe     L(SourceStringAlignmentLessTwoVecSize)
> -
> -       and     $-VEC_SIZE, %rsi
> -       and     $(VEC_SIZE - 1), %ecx
> -
> -       vpcmpb  $0, (%rsi), %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       shr     %cl, %rdx
> +# include "reg-macros.h"
>
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -       mov     $VEC_SIZE, %r10
> -       sub     %rcx, %r10
> -       cmp     %r10, %r8
> -#  else
> -       mov     $(VEC_SIZE + 1), %r10
> -       sub     %rcx, %r10
> -       cmp     %r10, %r8
> -#  endif
> -       jbe     L(CopyVecSizeTailCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeTail)
> -
> -       vpcmpb  $0, VEC_SIZE(%rsi), %YMMZERO, %k1
> -       kmovd   %k1, %edx
>
> -# ifdef USE_AS_STRNCPY
> -       add     $VEC_SIZE, %r10
> -       cmp     %r10, %r8
> -       jbe     L(CopyTwoVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyTwoVecSize)
> -
> -       VMOVU   (%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
> -       VMOVU   %YMM2, (%rdi)
> -
> -/* If source address alignment != destination address alignment */
> -       .p2align 4
> -L(UnalignVecSizeBoth):
> -       sub     %rcx, %rdi
> -# ifdef USE_AS_STRNCPY
> -       add     %rcx, %r8
> -       sbb     %rcx, %rcx
> -       or      %rcx, %r8
> -# endif
> -       mov     $VEC_SIZE, %rcx
> -       VMOVA   (%rsi, %rcx), %YMM2
> -       VMOVU   %YMM2, (%rdi, %rcx)
> -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> -       vpcmpb  $0, %YMM2, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $(VEC_SIZE * 3), %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec2)
> +# ifdef USE_AS_STPCPY
> +#  define END_REG      rax
>  # else
> -       jnz     L(CopyVecSize)
> +#  define END_REG      rdi, %rdx, CHAR_SIZE
>  # endif
>
> -       VMOVU   %YMM2, (%rdi, %rcx)
> -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM3
> -       vpcmpb  $0, %YMM3, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec3)
> +# ifdef USE_AS_STRCAT
> +#  define PAGE_ALIGN_REG       edx
> +#  define PAGE_ALIGN_REG_64    rdx
>  # else
> -       jnz     L(CopyVecSize)
> +#  define PAGE_ALIGN_REG       eax
> +#  define PAGE_ALIGN_REG_64    rax
>  # endif
>
> -       VMOVU   %YMM3, (%rdi, %rcx)
> -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM4
> -       vpcmpb  $0, %YMM4, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec4)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
> +# define VZERO VMM(7)
> +# define VZERO_128     VMM_128(7)
>
> -       VMOVU   %YMM4, (%rdi, %rcx)
> -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> -       vpcmpb  $0, %YMM2, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec2)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
>
> -       VMOVU   %YMM2, (%rdi, %rcx)
> -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> -       vpcmpb  $0, %YMM2, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec2)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
> -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM3
> -       VMOVU   %YMM2, (%rdi, %rcx)
> -       vpcmpb  $0, %YMM3, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec3)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
>
> -       VMOVU   %YMM3, (%rdi, %rcx)
> -       mov     %rsi, %rdx
> -       lea     VEC_SIZE(%rsi, %rcx), %rsi
> -       and     $-(VEC_SIZE * 4), %rsi
> -       sub     %rsi, %rdx
> -       sub     %rdx, %rdi
> -# ifdef USE_AS_STRNCPY
> -       lea     (VEC_SIZE * 8)(%r8, %rdx), %r8
> -# endif
> -L(UnalignedFourVecSizeLoop):
> -       VMOVA   (%rsi), %YMM4
> -       VMOVA   VEC_SIZE(%rsi), %YMM5
> -       VMOVA   (VEC_SIZE * 2)(%rsi), %YMM6
> -       VMOVA   (VEC_SIZE * 3)(%rsi), %YMM7
> -       vpminub %YMM5, %YMM4, %YMM2
> -       vpminub %YMM7, %YMM6, %YMM3
> -       vpminub %YMM2, %YMM3, %YMM2
> -       /* If K7 != 0, there is a null byte.  */
> -       vpcmpb  $0, %YMM2, %YMMZERO, %k7
> -       kmovd   %k7, %edx
> -# ifdef USE_AS_STRNCPY
> -       sub     $(VEC_SIZE * 4), %r8
> -       jbe     L(UnalignedLeaveCase2OrCase3)
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY(STRCPY)
> +# ifdef USE_AS_STRCAT
> +       movq    %rdi, %rax
> +#  include "strcat-strlen-evex.S"
>  # endif
> -       test    %edx, %edx
> -       jnz     L(UnalignedFourVecSizeLeave)
> -
> -L(UnalignedFourVecSizeLoop_start):
> -       add     $(VEC_SIZE * 4), %rdi
> -       add     $(VEC_SIZE * 4), %rsi
> -       VMOVU   %YMM4, -(VEC_SIZE * 4)(%rdi)
> -       VMOVA   (%rsi), %YMM4
> -       VMOVU   %YMM5, -(VEC_SIZE * 3)(%rdi)
> -       VMOVA   VEC_SIZE(%rsi), %YMM5
> -       vpminub %YMM5, %YMM4, %YMM2
> -       VMOVU   %YMM6, -(VEC_SIZE * 2)(%rdi)
> -       VMOVA   (VEC_SIZE * 2)(%rsi), %YMM6
> -       VMOVU   %YMM7, -VEC_SIZE(%rdi)
> -       VMOVA   (VEC_SIZE * 3)(%rsi), %YMM7
> -       vpminub %YMM7, %YMM6, %YMM3
> -       vpminub %YMM2, %YMM3, %YMM2
> -       /* If K7 != 0, there is a null byte.  */
> -       vpcmpb  $0, %YMM2, %YMMZERO, %k7
> -       kmovd   %k7, %edx
> -# ifdef USE_AS_STRNCPY
> -       sub     $(VEC_SIZE * 4), %r8
> -       jbe     L(UnalignedLeaveCase2OrCase3)
> +
> +       movl    %esi, %PAGE_ALIGN_REG
> +       andl    $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> +       ja      L(page_cross)
> +L(page_cross_continue):
> +       VMOVU   (%rsi), %VMM(0)
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +       movq    %rdi, %rax
>  # endif
> -       test    %edx, %edx
> -       jz      L(UnalignedFourVecSizeLoop_start)
>
> -L(UnalignedFourVecSizeLeave):
> -       vpcmpb  $0, %YMM4, %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeUnaligned_0)
>
> -       vpcmpb  $0, %YMM5, %YMMZERO, %k2
> -       kmovd   %k2, %ecx
> -       test    %ecx, %ecx
> -       jnz     L(CopyVecSizeUnaligned_16)
> +       /* Two short string implementations. One with traditional
> +          branching approach and one with masked instructions (which
> +          have potential for dramatically bad perf if dst splits a
> +          page and is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +       VPTEST  %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +#  ifdef USE_AS_WCSCPY
> +       subl    $((1 << CHAR_PER_VEC)- 1), %VRCX
> +#  else
> +       inc     %VRCX
> +#  endif
> +       jz      L(more_1x_vec)
> +       KMOV    %VRCX, %k1
> +       KXOR    %k0, %k1, %k1
>
> -       vpcmpb  $0, %YMM6, %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeUnaligned_32)
> -
> -       vpcmpb  $0, %YMM7, %YMMZERO, %k4
> -       kmovd   %k4, %ecx
> -       bsf     %ecx, %edx
> -       VMOVU   %YMM4, (%rdi)
> -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> -       VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     (VEC_SIZE * 3)(%rdi, %rdx), %rax
> -# endif
> -       VMOVU   %YMM7, (VEC_SIZE * 3)(%rdi)
> -       add     $(VEC_SIZE - 1), %r8
> -       sub     %rdx, %r8
> -       lea     ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> -# else
> -       add     $(VEC_SIZE * 3), %rsi
> -       add     $(VEC_SIZE * 3), %rdi
> -       jmp     L(CopyVecSizeExit)
> -# endif
> +       VMOVU_MASK %VMM(0), (%rdi){%k1}
>
> -/* If source address alignment == destination address alignment */
> +#  ifdef USE_AS_STPCPY
> +       bsf     %VRCX, %VRCX
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rax
> +#  endif
> +       ret
>
> -L(SourceStringAlignmentLessTwoVecSize):
> -       VMOVU   (%rsi), %YMM3
> -       VMOVU   VEC_SIZE(%rsi), %YMM2
> -       vpcmpb  $0, %YMM3, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> +# else
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jz      L(more_1x_vec)
>
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -       cmp     $VEC_SIZE, %r8
> +       xorl    %edx, %edx
> +       bsf     %VRCX, %VRDX
> +#  ifdef USE_AS_STPCPY
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#  endif
> +
> +       /* Use mask bits in rcx to detect which copy we need. If the low
> +          mask is zero then there must be a bit set in the upper half.
> +          I.e. if rcx != 0 and ecx == 0, then the match must be in
> +          the upper 32 bits so we use L(copy_32_63).  */
> +#  if VEC_SIZE == 64
> +#   ifdef USE_AS_WCSCPY
> +       testb   %cl, %cl
> +#   else
> +       testl   %ecx, %ecx
> +#   endif
> +       jz      L(copy_32_63)
> +#  endif
> +
> +#  ifdef USE_AS_WCSCPY
> +       testb   $0xf, %cl
>  #  else
> -       cmp     $(VEC_SIZE + 1), %r8
> +       testw   %cx, %cx
>  #  endif
> -       jbe     L(CopyVecSizeTail1Case2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeTail1)
> +       jz      L(copy_16_31)
>
> -       VMOVU   %YMM3, (%rdi)
> -       vpcmpb  $0, %YMM2, %YMMZERO, %k0
> -       kmovd   %k0, %edx
>
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -       cmp     $(VEC_SIZE * 2), %r8
> +#  ifdef USE_AS_WCSCPY
> +       testb   $0x3, %cl
>  #  else
> -       cmp     $((VEC_SIZE * 2) + 1), %r8
> +       testb   %cl, %cl
>  #  endif
> -       jbe     L(CopyTwoVecSize1Case2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyTwoVecSize1)
> -
> -       and     $-VEC_SIZE, %rsi
> -       and     $(VEC_SIZE - 1), %ecx
> -       jmp     L(UnalignVecSizeBoth)
> +       jz      L(copy_8_15)
>
> -/*------End of main part with loops---------------------*/
>
> -/* Case1 */
> +#  ifdef USE_AS_WCSCPY
> +       vmovd   %VMM_128(0), (%rdi)
> +       /* No need to copy, we know it's zero.  */
> +       movl    $0, (%END_REG)
>
> -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> -       .p2align 4
> -L(CopyVecSize):
> -       add     %rcx, %rdi
> -# endif
> -L(CopyVecSizeTail):
> -       add     %rcx, %rsi
> -L(CopyVecSizeTail1):
> -       bsf     %edx, %edx
> -L(CopyVecSizeExit):
> -       cmp     $32, %edx
> -       jae     L(Exit32_63)
> -       cmp     $16, %edx
> -       jae     L(Exit16_31)
> -       cmp     $8, %edx
> -       jae     L(Exit8_15)
> -       cmp     $4, %edx
> -       jae     L(Exit4_7)
> -       cmp     $3, %edx
> -       je      L(Exit3)
> -       cmp     $1, %edx
> -       ja      L(Exit2)
> -       je      L(Exit1)
> -       movb    $0, (%rdi)
> -# ifdef USE_AS_STPCPY
> -       lea     (%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $1, %r8
> -       lea     1(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> -# endif
>         ret
> +#  else
>
> -       .p2align 4
> -L(CopyTwoVecSize1):
> -       add     $VEC_SIZE, %rsi
> -       add     $VEC_SIZE, %rdi
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $VEC_SIZE, %r8
> -# endif
> -       jmp     L(CopyVecSizeTail1)
> -
> -       .p2align 4
> -L(CopyTwoVecSize):
> -       bsf     %edx, %edx
> -       add     %rcx, %rsi
> -       add     $VEC_SIZE, %edx
> -       sub     %ecx, %edx
> -       jmp     L(CopyVecSizeExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnaligned_0):
> -       bsf     %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -       VMOVU   %YMM4, (%rdi)
> -       add     $((VEC_SIZE * 4) - 1), %r8
> -       sub     %rdx, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> -# else
> -       jmp     L(CopyVecSizeExit)
> -# endif
> +       testb   $0x7, %cl
> +       jz      L(copy_4_7)
>
> -       .p2align 4
> -L(CopyVecSizeUnaligned_16):
> -       bsf     %ecx, %edx
> -       VMOVU   %YMM4, (%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     VEC_SIZE(%rdi, %rdx), %rax
> -# endif
> -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> -       add     $((VEC_SIZE * 3) - 1), %r8
> -       sub     %rdx, %r8
> -       lea     (VEC_SIZE + 1)(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> -# else
> -       add     $VEC_SIZE, %rsi
> -       add     $VEC_SIZE, %rdi
> -       jmp     L(CopyVecSizeExit)
> -# endif
>
> -       .p2align 4
> -L(CopyVecSizeUnaligned_32):
> -       bsf     %edx, %edx
> -       VMOVU   %YMM4, (%rdi)
> -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     (VEC_SIZE * 2)(%rdi, %rdx), %rax
> -# endif
> -       VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> -       add     $((VEC_SIZE * 2) - 1), %r8
> -       sub     %rdx, %r8
> -       lea     ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> -# else
> -       add     $(VEC_SIZE * 2), %rsi
> -       add     $(VEC_SIZE * 2), %rdi
> -       jmp     L(CopyVecSizeExit)
> -# endif
> +       test    %edx, %edx
> +       jz      L(set_null_term)
>
> -# ifdef USE_AS_STRNCPY
> -#  ifndef USE_AS_STRCAT
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec6):
> -       VMOVU   %YMM6, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec5):
> -       VMOVU   %YMM5, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec4):
> -       VMOVU   %YMM4, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec3):
> -       VMOVU   %YMM3, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> +       /* NB: make this `vmovw` if support for AVX512-FP16 is added.
> +        */
> +       vmovd   %VMM_128(0), %esi
> +       movw    %si, (%rdi)
> +
> +       .p2align 4,, 1
> +L(set_null_term):
> +       /* No need to copy, we know it's zero.  */
> +       movb    $0, (%END_REG)
> +       ret
>  #  endif
>
> -/* Case2 */
> -
> -       .p2align 4
> -L(CopyVecSizeCase2):
> -       add     $VEC_SIZE, %r8
> -       add     %rcx, %rdi
> -       add     %rcx, %rsi
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyTwoVecSizeCase2):
> -       add     %rcx, %rsi
> -       bsf     %edx, %edx
> -       add     $VEC_SIZE, %edx
> -       sub     %ecx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -L(CopyVecSizeTailCase2):
> -       add     %rcx, %rsi
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -L(CopyVecSizeTail1Case2):
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -/* Case2 or Case3,  Case3 */
> -
> -       .p2align 4
> -L(CopyVecSizeCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyVecSizeCase2)
> -L(CopyVecSizeCase3):
> -       add     $VEC_SIZE, %r8
> -       add     %rcx, %rdi
> -       add     %rcx, %rsi
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyTwoVecSizeCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyTwoVecSizeCase2)
> -       add     %rcx, %rsi
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyVecSizeTailCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyVecSizeTailCase2)
> -       add     %rcx, %rsi
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyTwoVecSize1Case2OrCase3):
> -       add     $VEC_SIZE, %rdi
> -       add     $VEC_SIZE, %rsi
> -       sub     $VEC_SIZE, %r8
> -L(CopyVecSizeTail1Case2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyVecSizeTail1Case2)
> -       jmp     L(StrncpyExit)
> +#  if VEC_SIZE == 64
> +       .p2align 4,, 6
> +L(copy_32_63):
> +       VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +       VMOVU   %VMM_256(0), (%rdi)
> +       VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
> +       ret
> +#  endif
> +
> +
> +       .p2align 4,, 6
> +L(copy_16_31):
> +       /* Use xmm1 explicitly here as it won't require a `vzeroupper`
> +          and will save code size.  */
> +       vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +       VMOVU   %VMM_128(0), (%rdi)
> +       vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> +       ret
> +
> +       .p2align 4,, 8
> +L(copy_8_15):
> +#  ifdef USE_AS_WCSCPY
> +       movl    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> +#  else
> +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
> +#  endif
> +       vmovq   %VMM_128(0), (%rdi)
> +       movq    %rcx, -(8 - CHAR_SIZE)(%END_REG)
> +       ret
>  # endif
>
> -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
>
> -       .p2align 4
> -L(Exit1):
> -       movzwl  (%rsi), %edx
> -       mov     %dx, (%rdi)
> -# ifdef USE_AS_STPCPY
> -       lea     1(%rdi), %rax
> +# ifndef USE_AS_WCSCPY
> +       .p2align 4,, 12
> +L(copy_4_7):
> +       movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> +       vmovd   %VMM_128(0), (%rdi)
> +       movl    %ecx, -(4 - CHAR_SIZE)(%END_REG)
> +       ret
>  # endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $2, %r8
> -       lea     2(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +
> +
> +       .p2align 4,, 8
> +L(more_1x_vec):
> +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> +       VMOVU   %VMM(0), (%rdi)
>  # endif
> -       ret
> +       subq    %rsi, %rdi
> +       andq    $-(VEC_SIZE), %rsi
> +       addq    %rsi, %rdi
> +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
>
> -       .p2align 4
> -L(Exit2):
> -       movzwl  (%rsi), %ecx
> -       mov     %cx, (%rdi)
> -       movb    $0, 2(%rdi)
> +       /* Ideally we store after moves to minimize impact of potential
> +          false-dependencies.  */
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +       VMOVU   %VMM(0), (%rax)
> +# endif
> +
> +       VPTESTN %VMM(1), %VMM(1), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x1)
> +
> +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), VEC_SIZE(%rdi)
> +
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x2)
> +
> +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +
> +       VPTESTN %VMM(3), %VMM(3), %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(ret_vec_x3)
> +
> +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       VPTESTN %VMM(4), %VMM(4), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x4)
> +
> +       VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> +
> +
> +       /* Align for 4x loop.  */
> +       subq    %rsi, %rdi
> +
> +       /* + VEC_SIZE * 5 because we never added the original VEC_SIZE
> +          we covered before aligning.  */
> +       subq    $-(VEC_SIZE * 5), %rsi
> +       andq    $-(VEC_SIZE * 4), %rsi
> +
> +
> +       /* Load first half of the loop before entry.  */
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       VPTESTN %VMM(6), %VMM(6), %k4
> +       KORTEST %k2, %k4
> +       jnz     L(loop_4x_done)
> +
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +
> +       subq    $(VEC_SIZE * -4), %rsi
> +
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       VPTESTN %VMM(6), %VMM(6), %k4
> +       KORTEST %k2, %k4
> +       jz      L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       /* Restore rdi (%rdi).  */
> +       addq    %rsi, %rdi
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x0_end)
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +
> +       KMOV    %k2, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x1)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x2)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +       /* Place L(ret_vec_x4) here to save code size.  We get a
> +          meaningful benefit doing this for stpcpy.  */
> +       KMOV    %k4, %VRDX
> +L(ret_vec_x3):
> +       bsf     %VRDX, %VRDX
> +       VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -       lea     2(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $3, %r8
> -       lea     3(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
>  # endif
> +L(return_end):
>         ret
>
> -       .p2align 4
> -L(Exit3):
> -       mov     (%rsi), %edx
> -       mov     %edx, (%rdi)
> +       .p2align 4,, 6
> +L(ret_vec_x0_end):
> +       bsf     %VRCX, %VRCX
>  # ifdef USE_AS_STPCPY
> -       lea     3(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $4, %r8
> -       lea     4(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
> +       inc     %VRCX
> +       VMOVU   (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
>         ret
>
> -       .p2align 4
> -L(Exit4_7):
> -       mov     (%rsi), %ecx
> -       mov     %ecx, (%rdi)
> -       mov     -3(%rsi, %rdx), %ecx
> -       mov     %ecx, -3(%rdi, %rdx)
> +       .p2align 4,, 8
> +L(ret_vec_x1):
> +       bsf     %VRCX, %VRCX
> +       VMOVU   (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     %rdx, %r8
> -       sub     $1, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>         ret
>
> -       .p2align 4
> -L(Exit8_15):
> -       mov     (%rsi), %rcx
> -       mov     -7(%rsi, %rdx), %r9
> -       mov     %rcx, (%rdi)
> -       mov     %r9, -7(%rdi, %rdx)
> +       .p2align 4,, 4
> +L(ret_vec_x2):
> +       bsf     %VRCX, %VRCX
> +       VMOVU   ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     %rdx, %r8
> -       sub     $1, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>         ret
>
> -       .p2align 4
> -L(Exit16_31):
> -       VMOVU   (%rsi), %XMM2
> -       VMOVU   -15(%rsi, %rdx), %XMM3
> -       VMOVU   %XMM2, (%rdi)
> -       VMOVU   %XMM3, -15(%rdi, %rdx)
> +       /* ret_vec_x3 reuses return code after the loop.  */
> +       .p2align 4,, 6
> +L(ret_vec_x4):
> +       bsf     %VRCX, %VRCX
> +       VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub %rdx, %r8
> -       sub $1, %r8
> -       lea 1(%rdi, %rdx), %rdi
> -       jnz L(StrncpyFillTailWithZero)
> +       leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>         ret
>
> -       .p2align 4
> -L(Exit32_63):
> -       VMOVU   (%rsi), %YMM2
> -       VMOVU   -31(%rsi, %rdx), %YMM3
> -       VMOVU   %YMM2, (%rdi)
> -       VMOVU   %YMM3, -31(%rdi, %rdx)
> -# ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> +
> +       .p2align 4,, 4
> +L(page_cross):
> +# ifndef USE_AS_STRCAT
> +       vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
>  # endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     %rdx, %r8
> -       sub     $1, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       movq    %rsi, %rcx
> +       andq    $(VEC_SIZE * -1), %rcx
> +
> +       VPCMPEQ (%rcx), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +# ifdef USE_AS_WCSCPY
> +       andl    $(VEC_SIZE - 1), %PAGE_ALIGN_REG
> +       shrl    $2, %PAGE_ALIGN_REG
>  # endif
> -       ret
> +       shrx    %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
>
> -# ifdef USE_AS_STRNCPY
> +# if USE_MOVSB_IN_PAGE_CROSS
> +       /* Optimizing more aggressively for space as this is very cold
> +          code. This saves 2x cache lines.  */
>
> -       .p2align 4
> -L(StrncpyExit1):
> -       movzbl  (%rsi), %edx
> -       mov     %dl, (%rdi)
> -#  ifdef USE_AS_STPCPY
> -       lea     1(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, 1(%rdi)
> +       /* This adds one to the later result, which gives the correct
> +          copy bounds.  NB: this can never zero out a non-zero RCX
> +          since, to be in the page cross case, rsi cannot be aligned
> +          and we already right-shift rcx by the misalignment.  */
> +       shl     %VRCX
> +       jz      L(page_cross_continue)
> +#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +       movq    %rdi, %rax
>  #  endif
> -       ret
> +       bsf     %VRCX, %VRCX
> +       REP_MOVS
>
> -       .p2align 4
> -L(StrncpyExit2):
> -       movzwl  (%rsi), %edx
> -       mov     %dx, (%rdi)
>  #  ifdef USE_AS_STPCPY
> -       lea     2(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, 2(%rdi)
> +       leaq    -CHAR_SIZE(%rdi), %rax
>  #  endif
>         ret
>
> -       .p2align 4
> -L(StrncpyExit3_4):
> -       movzwl  (%rsi), %ecx
> -       movzwl  -2(%rsi, %r8), %edx
> -       mov     %cx, (%rdi)
> -       mov     %dx, -2(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       ret
>
> -       .p2align 4
> -L(StrncpyExit5_8):
> -       mov     (%rsi), %ecx
> -       mov     -4(%rsi, %r8), %edx
> -       mov     %ecx, (%rdi)
> -       mov     %edx, -4(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       ret
> +# else
> +       /* Check if we found zero-char before end of page.  */
> +       test    %VRCX, %VRCX
> +       jz      L(page_cross_continue)
>
> -       .p2align 4
> -L(StrncpyExit9_16):
> -       mov     (%rsi), %rcx
> -       mov     -8(%rsi, %r8), %rdx
> -       mov     %rcx, (%rdi)
> -       mov     %rdx, -8(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       ret
> +       /* Traditional copy case, essentially same as used in non-page-
> +          cross case but since we can't reuse VMM(0) we need twice as
> +          many loads from rsi.  */
>
> -       .p2align 4
> -L(StrncpyExit17_32):
> -       VMOVU   (%rsi), %XMM2
> -       VMOVU   -16(%rsi, %r8), %XMM3
> -       VMOVU   %XMM2, (%rdi)
> -       VMOVU   %XMM3, -16(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> +#  ifndef USE_AS_STRCAT
> +       xorl    %edx, %edx
>  #  endif
> -       ret
> -
> -       .p2align 4
> -L(StrncpyExit33_64):
> -       /*  0/32, 31/16 */
> -       VMOVU   (%rsi), %YMM2
> -       VMOVU   -VEC_SIZE(%rsi, %r8), %YMM3
> -       VMOVU   %YMM2, (%rdi)
> -       VMOVU   %YMM3, -VEC_SIZE(%rdi, %r8)
> +       /* Dependency on rdi must already have been satisfied.  */
> +       bsf     %VRCX, %VRDX
>  #  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#  elif !defined USE_AS_STRCAT
> +       movq    %rdi, %rax
>  #  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       ret
>
> -       .p2align 4
> -L(StrncpyExit65):
> -       /* 0/32, 32/32, 64/1 */
> -       VMOVU   (%rsi), %YMM2
> -       VMOVU   32(%rsi), %YMM3
> -       mov     64(%rsi), %cl
> -       VMOVU   %YMM2, (%rdi)
> -       VMOVU   %YMM3, 32(%rdi)
> -       mov     %cl, 64(%rdi)
> -#  ifdef USE_AS_STPCPY
> -       lea     65(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, 65(%rdi)
> +#  if VEC_SIZE == 64
> +#   ifdef USE_AS_WCSCPY
> +       testb   %cl, %cl
> +#   else
> +       test    %ecx, %ecx
> +#   endif
> +       jz      L(page_cross_copy_32_63)
>  #  endif
> -       ret
> -
> -#  ifndef USE_AS_STRCAT
>
> -       .p2align 4
> -L(Fill1):
> -       mov     %dl, (%rdi)
> -       ret
> +#  ifdef USE_AS_WCSCPY
> +       testb   $0xf, %cl
> +#  else
> +       testw   %cx, %cx
> +#  endif
> +       jz      L(page_cross_copy_16_31)
>
> -       .p2align 4
> -L(Fill2):
> -       mov     %dx, (%rdi)
> -       ret
> +#  ifdef USE_AS_WCSCPY
> +       testb   $0x3, %cl
> +#  else
> +       testb   %cl, %cl
> +#  endif
> +       jz      L(page_cross_copy_8_15)
>
> -       .p2align 4
> -L(Fill3_4):
> -       mov     %dx, (%rdi)
> -       mov     %dx, -2(%rdi, %r8)
> +#  ifdef USE_AS_WCSCPY
> +       movl    (%rsi), %esi
> +       movl    %esi, (%rdi)
> +       movl    $0, (%END_REG)
>         ret
> +#  else
>
> -       .p2align 4
> -L(Fill5_8):
> -       mov     %edx, (%rdi)
> -       mov     %edx, -4(%rdi, %r8)
> -       ret
> +       testb   $0x7, %cl
> +       jz      L(page_cross_copy_4_7)
>
> -       .p2align 4
> -L(Fill9_16):
> -       mov     %rdx, (%rdi)
> -       mov     %rdx, -8(%rdi, %r8)
> +       test    %edx, %edx
> +       jz      L(page_cross_set_null_term)
> +       movzwl  (%rsi), %ecx
> +       movw    %cx, (%rdi)
> +L(page_cross_set_null_term):
> +       movb    $0, (%END_REG)
>         ret
>
> -       .p2align 4
> -L(Fill17_32):
> -       VMOVU   %XMMZERO, (%rdi)
> -       VMOVU   %XMMZERO, -16(%rdi, %r8)
> -       ret
>
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec2):
> -       VMOVU   %YMM2, (%rdi, %rcx)
> -
> -       .p2align 4
> -L(CopyVecSizeVecExit):
> -       bsf     %edx, %edx
> -       add     $(VEC_SIZE - 1), %r8
> -       add     %rcx, %rdi
> -#   ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -#   endif
> -       sub     %rdx, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -
> -       .p2align 4
> -L(StrncpyFillTailWithZero):
> -       xor     %edx, %edx
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(StrncpyFillExit)
> -
> -       VMOVU   %YMMZERO, (%rdi)
> -       add     $VEC_SIZE, %rdi
> -
> -       mov     %rdi, %rsi
> -       and     $(VEC_SIZE - 1), %esi
> -       sub     %rsi, %rdi
> -       add     %rsi, %r8
> -       sub     $(VEC_SIZE * 4), %r8
> -       jb      L(StrncpyFillLessFourVecSize)
> -
> -L(StrncpyFillLoopVmovdqa):
> -       VMOVA   %YMMZERO, (%rdi)
> -       VMOVA   %YMMZERO, VEC_SIZE(%rdi)
> -       VMOVA   %YMMZERO, (VEC_SIZE * 2)(%rdi)
> -       VMOVA   %YMMZERO, (VEC_SIZE * 3)(%rdi)
> -       add     $(VEC_SIZE * 4), %rdi
> -       sub     $(VEC_SIZE * 4), %r8
> -       jae     L(StrncpyFillLoopVmovdqa)
> -
> -L(StrncpyFillLessFourVecSize):
> -       add     $(VEC_SIZE * 2), %r8
> -       jl      L(StrncpyFillLessTwoVecSize)
> -       VMOVA   %YMMZERO, (%rdi)
> -       VMOVA   %YMMZERO, VEC_SIZE(%rdi)
> -       add     $(VEC_SIZE * 2), %rdi
> -       sub     $VEC_SIZE, %r8
> -       jl      L(StrncpyFillExit)
> -       VMOVA   %YMMZERO, (%rdi)
> -       add     $VEC_SIZE, %rdi
> -       jmp     L(Fill)
> -
> -       .p2align 4
> -L(StrncpyFillLessTwoVecSize):
> -       add     $VEC_SIZE, %r8
> -       jl      L(StrncpyFillExit)
> -       VMOVA   %YMMZERO, (%rdi)
> -       add     $VEC_SIZE, %rdi
> -       jmp     L(Fill)
> -
> -       .p2align 4
> -L(StrncpyFillExit):
> -       add     $VEC_SIZE, %r8
> -L(Fill):
> -       cmp     $17, %r8d
> -       jae     L(Fill17_32)
> -       cmp     $9, %r8d
> -       jae     L(Fill9_16)
> -       cmp     $5, %r8d
> -       jae     L(Fill5_8)
> -       cmp     $3, %r8d
> -       jae     L(Fill3_4)
> -       cmp     $1, %r8d
> -       ja      L(Fill2)
> -       je      L(Fill1)
> +       .p2align 4,, 4
> +L(page_cross_copy_4_7):
> +       movl    (%rsi), %ecx
> +       movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
> +       movl    %ecx, (%rdi)
> +       movl    %esi, -(4 - CHAR_SIZE)(%END_REG)
>         ret
> -
> -/* end of ifndef USE_AS_STRCAT */
>  #  endif
>
> -       .p2align 4
> -L(UnalignedLeaveCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(UnalignedFourVecSizeLeaveCase2)
> -L(UnalignedFourVecSizeLeaveCase3):
> -       lea     (VEC_SIZE * 4)(%r8), %rcx
> -       and     $-VEC_SIZE, %rcx
> -       add     $(VEC_SIZE * 3), %r8
> -       jl      L(CopyVecSizeCase3)
> -       VMOVU   %YMM4, (%rdi)
> -       sub     $VEC_SIZE, %r8
> -       jb      L(CopyVecSizeCase3)
> -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> -       sub     $VEC_SIZE, %r8
> -       jb      L(CopyVecSizeCase3)
> -       VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> -       sub     $VEC_SIZE, %r8
> -       jb      L(CopyVecSizeCase3)
> -       VMOVU   %YMM7, (VEC_SIZE * 3)(%rdi)
> -#  ifdef USE_AS_STPCPY
> -       lea     (VEC_SIZE * 4)(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (VEC_SIZE * 4)(%rdi)
> -#  endif
> +#  if VEC_SIZE == 64
> +       .p2align 4,, 4
> +L(page_cross_copy_32_63):
> +       VMOVU   (%rsi), %VMM_256(0)
> +       VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +       VMOVU   %VMM_256(0), (%rdi)
> +       VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
>         ret
> -
> -       .p2align 4
> -L(UnalignedFourVecSizeLeaveCase2):
> -       xor     %ecx, %ecx
> -       vpcmpb  $0, %YMM4, %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       add     $(VEC_SIZE * 3), %r8
> -       jle     L(CopyVecSizeCase2OrCase3)
> -       test    %edx, %edx
> -#  ifndef USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec4)
> -#  else
> -       jnz     L(CopyVecSize)
> -#  endif
> -       vpcmpb  $0, %YMM5, %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       VMOVU   %YMM4, (%rdi)
> -       add     $VEC_SIZE, %rcx
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -       test    %edx, %edx
> -#  ifndef USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec5)
> -#  else
> -       jnz     L(CopyVecSize)
>  #  endif
>
> -       vpcmpb  $0, %YMM6, %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> -       add     $VEC_SIZE, %rcx
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -       test    %edx, %edx
> -#  ifndef USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec6)
> -#  else
> -       jnz     L(CopyVecSize)
> -#  endif
> -
> -       vpcmpb  $0, %YMM7, %YMMZERO, %k4
> -       kmovd   %k4, %edx
> -       VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> -       lea     VEC_SIZE(%rdi, %rcx), %rdi
> -       lea     VEC_SIZE(%rsi, %rcx), %rsi
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -L(StrncpyExit):
> -       cmp     $65, %r8d
> -       je      L(StrncpyExit65)
> -       cmp     $33, %r8d
> -       jae     L(StrncpyExit33_64)
> -       cmp     $17, %r8d
> -       jae     L(StrncpyExit17_32)
> -       cmp     $9, %r8d
> -       jae     L(StrncpyExit9_16)
> -       cmp     $5, %r8d
> -       jae     L(StrncpyExit5_8)
> -       cmp     $3, %r8d
> -       jae     L(StrncpyExit3_4)
> -       cmp     $1, %r8d
> -       ja      L(StrncpyExit2)
> -       je      L(StrncpyExit1)
> -#  ifdef USE_AS_STPCPY
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi)
> -#  endif
> +       .p2align 4,, 4
> +L(page_cross_copy_16_31):
> +       vmovdqu (%rsi), %xmm0
> +       vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +       vmovdqu %xmm0, (%rdi)
> +       vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
>         ret
>
> -       .p2align 4
> -L(ExitZero):
> -#  ifndef USE_AS_STRCAT
> -       mov     %rdi, %rax
> -#  endif
> +       .p2align 4,, 4
> +L(page_cross_copy_8_15):
> +       movq    (%rsi), %rcx
> +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> +       movq    %rcx, (%rdi)
> +       movq    %rsi, -(8 - CHAR_SIZE)(%END_REG)
>         ret
> -
> -# endif
> -
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# else
> -END (STRCAT)
>  # endif
> +END(STRCPY)
>  #endif
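The main 4x loop in the rewritten strcpy-evex.S leans on the VPMIN trick to halve the number of zero tests: the unsigned minimum of two vectors has a zero element exactly where either input does, so four vectors per iteration need only two VPTESTN masks, which KORTEST combines into a single branch.  A rough intrinsics sketch of just that check, illustrative only (assumes AVX512BW, VEC_SIZE == 64 and a 64-byte aligned pointer):

#include <immintrin.h>
#include <stdbool.h>

/* True if any of the next four 64-byte vectors at p contains '\0'.  */
static bool
has_null_in_4_vecs (const char *p)
{
  __m512i v0 = _mm512_load_si512 ((const void *) (p + 0));
  __m512i v1 = _mm512_load_si512 ((const void *) (p + 64));
  __m512i v2 = _mm512_load_si512 ((const void *) (p + 128));
  __m512i v3 = _mm512_load_si512 ((const void *) (p + 192));
  __m512i min01 = _mm512_min_epu8 (v0, v1);             /* VPMIN */
  __m512i min23 = _mm512_min_epu8 (v2, v3);
  __mmask64 k2 = _mm512_testn_epi8_mask (min01, min01); /* VPTESTN */
  __mmask64 k4 = _mm512_testn_epi8_mask (min23, min23);
  return (k2 | k4) != 0;                                /* KORTEST */
}

Once the branch is taken, the code re-tests the individual vectors to locate which one held the terminator, exactly as L(loop_4x_done) does above.
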
> diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
> index 203a19bf21..38dcbfa0ec 100644
> --- a/sysdeps/x86_64/multiarch/strncat-evex.S
> +++ b/sysdeps/x86_64/multiarch/strncat-evex.S
> @@ -1,7 +1,512 @@
> -#ifndef STRNCAT
> -# define STRNCAT       __strncat_evex
> -#endif
> +/* {wcs|str}ncat  with 256/512-bit EVEX.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +       /* Use evex-masked stores for small sizes. Turned off at the
> +          moment.  */
> +# define USE_EVEX_MASKED_STORE 0
> +
> +# include <sysdep.h>
> +
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +# ifndef STRNCAT
> +#  define STRNCAT      __strncat_evex
> +# endif
> +
> +
> +# ifdef USE_AS_WCSCPY
> +#  define VMOVU_MASK   vmovdqu32
> +#  define VPMIN        vpminud
> +#  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
> +#  define VPCMPEQ      vpcmpeqd
> +#  define CHAR_SIZE    4
> +
> +#  define REP_MOVS     rep movsd
> +
> +#  define VMASK_REG    VR10
> +#  define FIND_FIRST_ONE(src, dst)     movl $CHAR_PER_VEC, %dst; bsf %src, %dst
> +
> +#  define USE_WIDE_CHAR
> +# else
> +#  define VMOVU_MASK   vmovdqu8
> +#  define VPMIN        vpminub
> +#  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
> +#  define VPCMPEQ      vpcmpeqb
> +#  define CHAR_SIZE    1
> +
> +#  define REP_MOVS     rep movsb
> +
> +#  define VMASK_REG    VRCX
> +#  define FIND_FIRST_ONE(src, dst)     tzcnt %src, %dst
> +
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# include "reg-macros.h"
> +
> +
> +# define VZERO VMM(7)
> +# define VZERO_128     VMM_128(7)
> +
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCAT)
> +       movq    %rdi, %rax
> +# ifdef USE_AS_WCSCPY
> +       decq    %rdx
> +       movq    %rdx, %rcx
> +       shrq    $56, %rcx
> +       jnz     L(zero_len)
> +# else
> +       decq    %rdx
> +       jl      L(zero_len)
> +# endif
> +
> +# include "strcat-strlen-evex.S"
> +
> +       movl    %esi, %ecx
> +       andl    $(PAGE_SIZE - 1), %ecx
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> +       ja      L(page_cross)
> +L(page_cross_continue):
> +       VMOVU   (%rsi), %VMM(0)
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +
> +       /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> +          <= CHAR_PER_VEC with masked instructions (which have
> +          potential for dramatically bad perf if dst splits a page and
> +          is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +       KMOV    %k0, %VRCX
> +       cmpq    $CHAR_PER_VEC, %rdx
> +       jae     L(more_1x_vec)
> +       bts     %VRDX, %VRCX
> +L(less_1x_vec_masked):
> +       blsmsk  %VRCX, %VRCX
> +
> +       KMOV    %VRCX, %k1
> +       VMOVU_MASK %VMM(0), (%rdi){%k1}
> +       ret
> +# else
> +       KMOV    %k0, %VMASK_REG
> +       /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
> +          %VMASK_REG, %VRCX` for wcsncat.  */
> +       FIND_FIRST_ONE (VMASK_REG, VRCX)
> +       cmpq    %rcx, %rdx
> +       jb      L(less_1x_vec)
> +
> +       /* If there were no zero-CHARs (rcx was zero before
> +          FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +       cmpl    $CHAR_PER_VEC, %ecx
> +       je      L(more_1x_vec)
> +
> +       movl    %ecx, %edx
> +
> +L(less_1x_vec):
> +#  if VEC_SIZE == 64
> +       cmpl    $(32 / CHAR_SIZE - 1), %edx
> +       jae     L(copy_32_63)
> +#  endif
> +
> +       cmpl    $(16 / CHAR_SIZE - 1), %edx
> +       jae     L(copy_16_31)
> +
> +
> +       cmpl    $(8 / CHAR_SIZE - 1), %edx
> +       jae     L(copy_8_15)
> +
> +#  ifdef USE_AS_WCSCPY
> +       movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> +       vmovd   %VMM_128(0), (%rdi)
> +       movl    %ecx, -(4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       ret
> +#  else
> +
> +       cmpl    $3, %edx
> +       jae     L(copy_4_7)
> +
> +       movzbl  (%rsi, %rdx), %ecx
> +       test    %edx, %edx
> +       je      L(set_null_term)
> +
> +       /* NB: make this `vmovw` if support for AVX512-FP16 is added.
> +        */
> +       movzwl  (%rsi), %esi
> +       movw    %si, (%rdi)
> +
> +
> +       .p2align 4,, 1
> +L(set_null_term):
> +       movb    %cl, (%rdi, %rdx)
> +       ret
> +#  endif
> +
> +#  if VEC_SIZE == 64
> +       .p2align 4,, 6
> +L(copy_32_63):
> +       VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +       VMOVU   %VMM_256(0), (%rdi)
> +       VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       ret
> +#  endif
> +       .p2align 4,, 6
> +L(copy_16_31):
> +       /* Use xmm1 explicitly here as it won't require a `vzeroupper`
> +          and will save code size.  */
> +       vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +       VMOVU   %VMM_128(0), (%rdi)
> +       vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       ret
> +
> +       .p2align 4,, 2
> +L(copy_8_15):
> +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
> +       vmovq   %VMM_128(0), (%rdi)
> +       movq    %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       ret
> +
> +#  ifndef USE_AS_WCSCPY
> +       .p2align 4,, 12
> +L(copy_4_7):
> +       movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> +       vmovd   %VMM_128(0), (%rdi)
> +       movl    %ecx, -(4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       ret
> +#  endif
> +
> +# endif
> +
> +       .p2align 4,, 8
> +L(more_1x_vec):
> +# if USE_EVEX_MASKED_STORE
> +       test    %VRCX, %VRCX
> +       jnz     L(less_1x_vec_masked)
> +# endif
> +
> +
> +       VMOVU   %VMM(0), (%rdi)
>
> -#define USE_AS_STRNCAT
> -#define STRCAT STRNCAT
> -#include "strcat-evex.S"
> +       /* We are going to align rsi here so we will need to be able to
> +          re-adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> +          so rsi + rdx * CHAR_SIZE cannot overflow.  */
> +
> +       leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> +       subq    %rsi, %rdi
> +       andq    $-(VEC_SIZE), %rsi
> +L(loop_last_4x_vec):
> +       addq    %rsi, %rdi
> +       subq    %rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +       shrq    $2, %rdx
> +# endif
> +
> +       /* Will need this regardless.  */
> +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
> +       KMOV    %k0, %VMASK_REG
> +
> +       cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
> +       ja      L(more_2x_vec)
> +
> +L(last_2x_vec):
> +       FIND_FIRST_ONE (VMASK_REG, VRCX)
> +       cmpl    %ecx, %edx
> +       jb      L(ret_vec_x1_len)
> +
> +       /* If there were no zero-CHARs (rcx was zero before
> +          FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +       cmpl    $CHAR_PER_VEC, %ecx
> +       jne     L(ret_vec_x1)
> +
> +
> +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +
> +       /* wcsncat needs to mask edx (length) before `bts`. strncat does
> +          not, as `bts` naturally masks the bit-position to the operand
> +          size.  */
> +# ifdef USE_AS_WCSCPY
> +       andl    $(CHAR_PER_VEC - 1), %edx
> +# endif
> +       bts     %VRDX, %VRCX
> +L(ret_vec_x2):
> +       bsf     %VRCX, %VRCX
> +       VMOVU   (VEC_SIZE * 2 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +       ret
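
The `bts`/`bsf` pair here folds the length bound into the zero-CHAR mask so a
single bit-scan yields min(position of first zero-CHAR, remaining length).  A
minimal C sketch of the idea (illustrative only, assuming a 32-bit mask and
len < 32 as in the VEC_SIZE == 32 build):

#include <stdint.h>

/* Given a bitmask of zero-CHAR positions within one vector and the
   number of CHARs still allowed to be copied, return how many CHARs to
   copy from this vector.  Setting bit `len` before the scan plays the
   role of `bts %VRDX, %VRCX`; the scan itself is `bsf`.  */
static inline unsigned int
copy_bound (uint32_t zero_mask, unsigned int len)
{
  zero_mask |= (uint32_t) 1 << len;	/* bts */
  return __builtin_ctz (zero_mask);	/* bsf (mask is non-zero now) */
}
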
> +
> +       .p2align 4,, 8
> +L(ret_vec_x1_len):
> +       movl    %edx, %ecx
> +L(ret_vec_x1):
> +       VMOVU   (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), (VEC_SIZE-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +       ret
> +
> +       .p2align 4,, 4
> +L(zero_len):
> +       incq    %rdx
> +       jne     OVERFLOW_STRCAT
> +       ret
> +
> +       .p2align 4,, 8
> +L(last_4x_vec):
> +       /* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
> +          $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
> +          using `movzbl`.  */
> +# if CHAR_PER_VEC == 64
> +       movzbl  %dl, %edx
> +# else
> +       andl    $(CHAR_PER_VEC * 4 - 1), %edx
> +# endif
> +       VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
> +       KMOV    %k0, %VMASK_REG
> +       subq    $-(VEC_SIZE * 4), %rsi
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       cmpl    $(CHAR_PER_VEC * 2 - 1), %edx
> +       jbe     L(last_2x_vec)
> +       .p2align 4,, 8
> +L(more_2x_vec):
> +       /* L(ret_vec_x1) expects the position already to be in rcx, so
> +          use `bsf` to test for zero.  */
> +       bsf     %VMASK_REG, %VRCX
> +       jnz     L(ret_vec_x1)
> +
> +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x2)
> +
> +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +       VPTESTN %VMM(3), %VMM(3), %k0
> +       KMOV    %k0, %VMASK_REG
> +
> +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rdx
> +       ja      L(more_4x_vec)
> +
> +       /* Adjust length before going to L(ret_vec_x3_len) or
> +          L(ret_vec_x3).  */
> +       addl    $(CHAR_PER_VEC * -2), %edx
> +
> +       FIND_FIRST_ONE (VMASK_REG, VRCX)
> +       cmpl    %ecx, %edx
> +       jb      L(ret_vec_x3_len)
> +
> +       /* If there were no zero-CHARs (rcx was zero before
> +          FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +       cmpl    $CHAR_PER_VEC, %ecx
> +       jne     L(ret_vec_x3)
> +
> +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       VPTESTN %VMM(4), %VMM(4), %k0
> +       KMOV    %k0, %VRCX
> +# ifdef USE_AS_WCSCPY
> +       andl    $(CHAR_PER_VEC - 1), %edx
> +# endif
> +       bts     %VRDX, %VRCX
> +       .p2align 4,, 6
> +L(ret_vec_x4):
> +       bsf     %VRCX, %VRCX
> +       VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +       ret
> +
> +
> +       .p2align 4,, 8
> +L(ret_vec_x3_len):
> +       movl    %edx, %ecx
> +L(ret_vec_x3):
> +       VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +       ret
> +
> +
> +       .p2align 4,, 8
> +L(more_4x_vec):
> +       bsf     %VMASK_REG, %VRCX
> +       jnz     L(ret_vec_x3)
> +
> +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       VPTESTN %VMM(4), %VMM(4), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x4)
> +
> +       VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> +
> +       /* Check if we are near the end before aligning.  */
> +       cmpq    $(CHAR_PER_VEC * 8 - 1), %rdx
> +       jbe     L(last_4x_vec)
> +
> +
> +       /* Add rsi to rdx (length) before aligning rsi. NB: Since we
> +          filtered out huge lengths this cannot overflow.  */
> +# ifdef USE_AS_WCSCPY
> +       leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
> +# else
> +       addq    %rsi, %rdx
> +# endif
> +
> +       /* Subtract rsi from rdi before aligning (add back will have
> +          correct rdi for aligned rsi).  */
> +       subq    %rsi, %rdi
> +       subq    $-(VEC_SIZE * 5), %rsi
> +       andq    $(VEC_SIZE * -4), %rsi
> +
> +       /* Load first half of the loop before entry.  */
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       VPTESTN %VMM(6), %VMM(6), %k4
> +
> +       /* Offset rsi by VEC_SIZE so that we can jump to
> +          L(loop_last_4x_vec).  */
> +       addq    $-(VEC_SIZE), %rsi
> +       KORTEST %k2, %k4
> +       jnz     L(loop_4x_done)
> +
> +       /* Store loop end in r9.  */
> +       leaq    -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
> +
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +       VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> +
> +       subq    $(VEC_SIZE * -4), %rsi
> +       cmpq    %rsi, %r9
> +       jbe     L(loop_last_4x_vec)
> +
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       VPTESTN %VMM(6), %VMM(6), %k4
> +       KORTEST %k2, %k4
> +       jz      L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       /* Restore rdi (dst).  */
> +       addq    %rsi, %rdi
> +
> +       /* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
> +          test with bsf.  */
> +       bsf     %VRCX, %VRCX
> +       jnz     L(ret_vec_x1)
> +       VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
> +
> +       KMOV    %k2, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x2)
> +       VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> +
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       bsf     %VRCX, %VRCX
> +       jnz     L(ret_vec_x3)
> +       VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> +
> +       KMOV    %k4, %VRDX
> +       bsf     %VRDX, %VRDX
> +       VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +L(return_end):
> +       ret
> +
> +       .p2align 4,, 4
> +       .p2align 6,, 8
> +L(page_cross):
> +       movq    %rsi, %r8
> +       andq    $(VEC_SIZE * -1), %r8
> +       VPCMPEQ (%r8), %VZERO, %k0
> +
> +# ifdef USE_AS_WCSCPY
> +       KMOV    %k0, %VR9
> +       shrl    $2, %ecx
> +       andl    $(CHAR_PER_VEC - 1), %ecx
> +       shrx    %VRCX, %VR9, %VRCX
> +# else
> +       KMOV    %k0, %VRCX
> +       shrx    %VRSI, %VRCX, %VRCX
> +# endif
> +
> +       subl    %esi, %r8d
> +       andl    $(VEC_SIZE - 1), %r8d
> +# ifdef USE_AS_WCSCPY
> +       shrl    $2, %r8d
> +# endif
> +       cmpq    %r8, %rdx
> +       jb      L(page_cross_small)
> +       /* Optimizing more for space as this is very cold code. This
> +          saves 2x cache lines.  */
> +
> +       /* This adds one to the later result, which gives the correct
> +          copy bounds. NB: this can never zero out a non-zero RCX
> +          because, to be in the page-cross case, rsi cannot be aligned
> +          and we have already right-shifted rcx by the misalignment.  */
> +       shl     %VRCX
> +       jz      L(page_cross_continue)
> +       bsf     %VRCX, %VRCX
> +       REP_MOVS
> +       ret
> +
> +L(page_cross_small):
> +       tzcnt   %VRCX, %VRCX
> +       cmpq    %rdx, %rcx
> +       cmova   %edx, %ecx
> +       incl    %ecx
> +# ifdef USE_AS_WCSCPY
> +       rep     movsd
> +# else
> +       rep     movsb
> +# endif
> +       ret
> +END(STRNCAT)
> +#endif
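
For reference, the contract the routine above implements, as a plain C sketch
(illustrative only, not the generic glibc fallback): append at most n CHARs
from src to the end of dst and always write a null terminator.

#include <stddef.h>

/* Plain C sketch of the strncat contract implemented above.  */
char *
strncat_ref (char *dst, const char *src, size_t n)
{
  char *end = dst;
  size_t i = 0;

  while (*end != '\0')			/* the strcat-strlen-evex.S part */
    ++end;
  for (; i < n && src[i] != '\0'; i++)	/* the bounded copy part */
    end[i] = src[i];
  end[i] = '\0';			/* null terminator is always written */
  return dst;				/* rax was saved from rdi on entry */
}
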
> diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
> index 1b3426d511..49eaf4cbd9 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
> @@ -1,7 +1,990 @@
> -#ifndef STRNCPY
> -# define STRNCPY       __strncpy_evex
> -#endif
> +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +       /* Use evex-masked stores for small sizes. Turned off at the
> +          moment.  */
> +# define USE_EVEX_MASKED_STORE 0
> +
> +
> +# include <sysdep.h>
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +
> +# ifndef STRNCPY
> +#  define STRNCPY      __strncpy_evex
> +# endif
> +
> +# ifdef USE_AS_WCSCPY
> +#  define VMOVU_MASK   vmovdqu32
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPMIN        vpminud
> +#  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
> +#  define CHAR_SIZE    4
> +
> +#  define REP_MOVS     rep movsd
> +#  define REP_STOS     rep stosl
> +
> +#  define USE_WIDE_CHAR
> +
> +# else
> +#  define VMOVU_MASK   vmovdqu8
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPMIN        vpminub
> +#  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
> +#  define CHAR_SIZE    1
> +
> +#  define REP_MOVS     rep movsb
> +#  define REP_STOS     rep stosb
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +# include "reg-macros.h"
> +
> +
> +# define VZERO VMM(7)
> +# define VZERO_256     VMM_256(7)
> +# define VZERO_128     VMM_128(7)
> +
> +# if VEC_SIZE == 64
> +#  define VZERO_HALF   VZERO_256
> +# else
> +#  define VZERO_HALF   VZERO_128
> +# endif
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCPY)
> +       /* Filter zero-length strings and very long strings.  Zero-
> +          length strings just return; very long strings are handled by
> +          running rep stos{b|l} to zero out the destination (which will
> +          almost certainly segfault) and, if that succeeds, by calling
> +          OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
> +# ifdef USE_AS_WCSCPY
> +       decq    %rdx
> +       movq    %rdx, %rax
> +       /* 56 is end of max supported address space.  */
> +       shr     $56, %rax
> +       jnz     L(zero_len)
> +# else
> +       decq    %rdx
> +       /* If the flag needs to become `jb` replace `dec` with `sub`.
> +        */
> +       jl      L(zero_len)
> +# endif
> +
> +       vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> +       movl    %esi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(page_cross)
> +
> +L(page_cross_continue):
> +       VMOVU   (%rsi), %VMM(0)
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +
> +       /* If not STPCPY, the return value is just dst; save it ahead
> +          of time.  */
> +# ifndef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +
> +
> +       cmpq    $(CHAR_PER_VEC), %rdx
> +
> +       /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> +          <= CHAR_PER_VEC with masked instructions (which have
> +          potential for dramatically bad perf if dst splits a page and
> +          is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +       /* `jae` because length rdx is now length - 1.  */
> +       jae     L(more_1x_vec)
> +
> +       /* If there were multiple zero-CHAR matches in the first VEC,
> +          VRCX will be overset, but that's fine since any oversets were
> +          at zero positions anyway.  */
> +
> +#  ifdef USE_AS_STPCPY
> +       tzcnt   %VRCX, %VRAX
> +       cmpl    %eax, %edx
> +       cmovb   %edx, %eax
> +#   ifdef USE_AS_WCSCPY
> +       adcl    $0, %eax
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#   else
> +       adcq    %rdi, %rax
> +#   endif
> +#  endif
> +       dec     %VRCX
> +
> +       /* Zero out all non-zero CHARs after the first zero match.  */
> +       KMOV    %VRCX, %k1
> +
> +       /* Use VZERO as the destination so this can be reused for
> +          L(zfill_less_vec) (which, if jumped to by subsequent logic,
> +          will have zeroed out VZERO).  */
> +       VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
> +L(zfill_less_vec):
> +       /* Get mask for what we need to set.  */
> +       incl    %edx
> +       mov     $-1, %VRCX
> +       bzhi    %VRDX, %VRCX, %VRCX
> +       KMOV    %VRCX, %k1
> +       VMOVU_MASK %VZERO, (%rdi){%k1}
> +       ret
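
When USE_EVEX_MASKED_STORE is enabled the whole n <= CHAR_PER_VEC case above
collapses into a couple of mask manipulations and a single masked store.  A
rough intrinsics rendering of that idea for the 32-byte vector case (a sketch
under the same no-page-cross assumption, not the code that is actually built;
assumes AVX512VL, AVX512BW and BMI2):

#include <immintrin.h>
#include <stdint.h>

/* Copy exactly n (1 <= n <= 32) bytes of strncpy output: source bytes
   up to the first null, zeros afterwards.  Reading a full 32 bytes
   from src is only safe because, as in the assembly, src is known not
   to cross a page here.  */
static void
strncpy_small_masked (char *dst, const char *src, unsigned int n)
{
  __m256i v = _mm256_loadu_si256 ((const __m256i *) src);
  uint32_t zeros = _mm256_cmpeq_epi8_mask (v, _mm256_setzero_si256 ());
  /* Keep only bytes below the first null (the `dec %VRCX` step);
     later set bits land on null bytes anyway.  */
  __m256i src_z = _mm256_maskz_mov_epi8 (zeros - 1, v);
  /* Store exactly n bytes (the bzhi + masked-store step).  */
  _mm256_mask_storeu_epi8 (dst, _bzhi_u32 (~0u, n), src_z);
}
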
> +
> +       .p2align 4,, 4
> +L(zero_len):
> +       cmpq    $-1, %rdx
> +       jne     L(best_effort_strncpy)
> +       movq    %rdi, %rax
> +       ret
> +
> +       .p2align 4,, 8
> +L(more_1x_vec):
> +# else
> +       /* `jb` because length rdx is now length - 1.  */
> +       jb      L(less_1x_vec)
> +# endif
> +
> +
> +       /* This may overset, but that's fine because we still need to
> +          zero-fill.  */
> +       VMOVU   %VMM(0), (%rdi)
> +
> +
> +       /* Length must be >= CHAR_PER_VEC so a match here means we must
> +          zero-fill.  */
> +       test    %VRCX, %VRCX
> +       jnz     L(zfill)
> +
> +
> +       /* We are going to align rsi here so we will need to be able to
> +          re-adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> +          so rsi + rdx * CHAR_SIZE cannot overflow.  */
> +       leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> +       subq    %rsi, %rdi
> +       andq    $-(VEC_SIZE), %rsi
> +
> +L(loop_last_4x_vec):
> +       addq    %rsi, %rdi
> +       subq    %rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +       shrq    $2, %rdx
> +# endif
> +
> +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
> +       KMOV    %k0, %VRCX
> +
> +       /* -1 because of the `dec %rdx` earlier.  */
> +       cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
> +       ja      L(more_2x_vec)
> +
> +L(last_2x_vec):
> +       /* This will need to be computed no matter what. We do it
> +          ahead of time for CHAR_PER_VEC == 64 because we can't adjust
> +          the value of `tzcnt` with a shift.  */
> +# if CHAR_PER_VEC == 64
> +       tzcntq  %rcx, %rcx
> +# endif
> +
> +       cmpl    $(CHAR_PER_VEC), %edx
> +       jb      L(ret_vec_x1_len)
> +
> +       /* Separate logic for CHAR_PER_VEC == 64 because we already did
> +          `tzcnt` on VRCX.  */
> +# if CHAR_PER_VEC == 64
> +       /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
> +       cmpb    $CHAR_PER_VEC, %cl
> +       jnz     L(ret_vec_x1_no_bsf)
> +# else
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x1)
> +# endif
> +
> +
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
> +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> +       KMOV    %k0, %VRCX
> +
> +# if CHAR_PER_VEC < 64
> +       /* This essentially adds CHAR_PER_VEC to the computed result.  */
> +       shlq    $CHAR_PER_VEC, %rcx
> +# else
> +       tzcntq  %rcx, %rcx
> +       addl    $CHAR_PER_VEC, %ecx
> +# endif
> +
> +       .p2align 4,, 4
> +L(ret_vec_x1_len):
> +       /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
> +          already been done.  */
> +# if CHAR_PER_VEC < 64
> +       tzcntq  %rcx, %rcx
> +# endif
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x1_len_no_zfill)
> +       /* Fall through (expectation) is copy len < buffer len.  */
> +       VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +L(ret_vec_x1_len_no_zfill_mov):
> +       movl    %ecx, %edx
> +# ifdef USE_AS_STPCPY
> +       /* clear flags.  */
> +       xorl    %ecx, %ecx
> +# endif
> +L(ret_vec_x1_len_no_zfill):
> +       VMOVU   ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       adcq    $0, %rdx
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +       leal    (VEC_SIZE)(%rdx), %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +       ret
> +
> +
> +       .p2align 4,, 10
> +L(ret_vec_x1):
> +       bsf     %VRCX, %VRCX
> +L(ret_vec_x1_no_bsf):
> +       VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +       subl    %ecx, %edx
> +       cmpl    $CHAR_PER_VEC, %edx
> +       jb      L(ret_vec_x1_len_no_zfill_mov)
> +       /* Fall through (expectation) is copy len < buffer len.  */
> +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> +       VMOVU   %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +       leaq    (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
> +# endif
> +       ret
> +
> +       .p2align 4,, 8
> +L(last_4x_vec):
> +       /* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
> +          $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
> +          using `movzbl`.  */
> +# if CHAR_PER_VEC == 64
> +       movzbl  %dl, %edx
> +# else
> +       andl    $(CHAR_PER_VEC * 4 - 1), %edx
> +# endif
> +       VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
> +       KMOV    %k0, %VRCX
> +       subq    $-(VEC_SIZE * 4), %rsi
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       cmpl    $(CHAR_PER_VEC * 2 - 1), %edx
> +       jbe     L(last_2x_vec)
> +       .p2align 4,, 8
> +L(more_2x_vec):
> +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> +       test    %VRCX, %VRCX
> +       /* Must fill at least 2x VEC.  */
> +       jnz     L(zfill_vec1)
> +
> +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       /* Must fill at least 1x VEC.  */
> +       jnz     L(zfill_vec2)
> +
> +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> +       VPTESTN %VMM(3), %VMM(3), %k0
> +       KMOV    %k0, %VRCX
> +
> +       /* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
> +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rdx
> +       ja      L(more_4x_vec)
> +
> +       subl    $(CHAR_PER_VEC * 3), %edx
> +       jb      L(ret_vec_x3_len)
> +
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x3)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       KMOV    %k0, %VRCX
> +       tzcnt   %VRCX, %VRCX
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x4_len_no_zfill)
> +       /* Fall through (expectation) is copy len < buffer len.  */
> +       VMOVU   %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +       movl    %ecx, %edx
> +L(ret_vec_x4_len_no_zfill):
> +       VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       adcq    $0, %rdx
> +       leaq    (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +       leal    (VEC_SIZE * 4 + 0)(%rdx), %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +       ret
> +
> +
> +L(ret_vec_x3_len):
> +       addl    $(CHAR_PER_VEC * 1), %edx
> +       tzcnt   %VRCX, %VRCX
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x3_len_no_zfill)
> +       /* Fall through (expectation) is copy len < buffer len.  */
> +       VMOVU   %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +L(ret_vec_x3_len_no_zfill_mov):
> +       movl    %ecx, %edx
> +# ifdef USE_AS_STPCPY
> +       /* clear flags.  */
> +       xorl    %ecx, %ecx
> +# endif
> +       .p2align 4,, 4
> +L(ret_vec_x3_len_no_zfill):
> +       VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       adcq    $0, %rdx
> +       leaq    (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +       leal    (VEC_SIZE * 3 + 0)(%rdx), %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +       ret
> +
> +
> +       .p2align 4,, 8
> +L(ret_vec_x3):
> +       bsf     %VRCX, %VRCX
> +       VMOVU   %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
> +       subl    %ecx, %edx
> +       jl      L(ret_vec_x3_len_no_zfill_mov)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +       leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> +# endif
> +       ret
> +
> +       .p2align 4,, 8
> +L(more_4x_vec):
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       test    %VRCX, %VRCX
> +       jnz     L(zfill_vec3)
> +
> +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> +       VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> +       VPTESTN %VMM(4), %VMM(4), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(zfill_vec4)
>
> -#define USE_AS_STRNCPY
> -#define STRCPY STRNCPY
> -#include "strcpy-evex.S"
> +       /* Recheck length before aligning.  */
> +       cmpq    $(CHAR_PER_VEC * 8 - 1), %rdx
> +       jbe     L(last_4x_vec)
> +
> +       /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
> +# ifdef USE_AS_WCSCPY
> +       leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
> +# else
> +       addq    %rsi, %rdx
> +# endif
> +       subq    %rsi, %rdi
> +       subq    $-(VEC_SIZE * 5), %rsi
> +       andq    $(VEC_SIZE * -4), %rsi
> +
> +
> +       /* Load first half of the loop before entry.  */
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       VPTESTN %VMM(6), %VMM(6), %k4
> +
> +
> +       /* Offset rsi by VEC_SIZE so that we can jump to
> +          L(loop_last_4x_vec).  */
> +       addq    $-(VEC_SIZE), %rsi
> +       KORTEST %k2, %k4
> +       jnz     L(loop_4x_done)
> +
> +       /* Store loop end in r9.  */
> +       leaq    -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
> +
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +       VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> +
> +       subq    $(VEC_SIZE * -4), %rsi
> +       cmpq    %rsi, %r9
> +       jbe     L(loop_last_4x_vec)
> +
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       VPTESTN %VMM(6), %VMM(6), %k4
> +       KORTEST %k2, %k4
> +       jz      L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +       /* Restore rdx (length).  */
> +       subq    %rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +       shrq    $2, %rdx
> +# endif
> +       VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +       /* Restore rdi (dst).  */
> +       addq    %rsi, %rdi
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(zfill_vec1)
> +
> +       VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> +       KMOV    %k2, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(zfill_vec2)
> +
> +       VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(zfill_vec3)
> +
> +       VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
> +       KMOV    %k4, %VRCX
> +       /* Fall through to zfill the rest.  */
> +
> +       .p2align 4,, 4
> +L(zfill_vec4):
> +       subq    $(VEC_SIZE * -2), %rdi
> +       addq    $(CHAR_PER_VEC * -2), %rdx
> +L(zfill_vec2):
> +       subq    $(VEC_SIZE * -2), %rdi
> +       addq    $(CHAR_PER_VEC * -1), %rdx
> +L(zfill):
> +       /* VRCX must be non-zero.  */
> +       bsf     %VRCX, %VRCX
> +
> +       /* Adjust length / dst for zfill.  */
> +       subq    %rcx, %rdx
> +# ifdef USE_AS_WCSCPY
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +# else
> +       addq    %rcx, %rdi
> +# endif
> +# ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +L(zfill_from_page_cross):
> +
> +       /* From here on out it's just memset(rdi, 0, rdx).  */
> +       cmpq    $CHAR_PER_VEC, %rdx
> +       jb      L(zfill_less_vec)
> +
> +L(zfill_more_1x_vec):
> +       VMOVU   %VZERO, (%rdi)
> +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
> +       ja      L(zfill_more_2x_vec)
> +L(zfill_done0):
> +       ret
> +
> +       /* Coming from vec1/vec2 we must be able to zfill at least 2x
> +          VEC.  */
> +       .p2align 4,, 8
> +L(zfill_vec3):
> +       subq    $(VEC_SIZE * -2), %rdi
> +       addq    $(CHAR_PER_VEC * -2), %rdx
> +       .p2align 4,, 2
> +L(zfill_vec1):
> +       bsfq    %rcx, %rcx
> +       /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
> +        */
> +       leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> +       subq    %rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +
> +
> +       VMOVU   %VZERO, (%rdi)
> +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       cmpq    $(CHAR_PER_VEC * 2), %rdx
> +       jb      L(zfill_done0)
> +L(zfill_more_2x_vec):
> +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> +       VMOVU   %VZERO, (VEC_SIZE)(%rdi)
> +       subq    $(CHAR_PER_VEC * 4 - 1), %rdx
> +       jbe     L(zfill_done)
> +
> +# ifdef USE_AS_WCSCPY
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rdx
> +# else
> +       addq    %rdi, %rdx
> +# endif
> +
> +       VMOVU   %VZERO, (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi)
> +
> +
> +       VMOVU   %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> +       VMOVU   %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> +
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       cmpq    %rdi, %rdx
> +       jbe     L(zfill_done)
> +
> +       /* Align rdi and zfill loop.  */
> +       andq    $-(VEC_SIZE), %rdi
> +       .p2align 4,, 12
> +L(zfill_loop_4x_vec):
> +       VMOVA   %VZERO, (VEC_SIZE * 0)(%rdi)
> +       VMOVA   %VZERO, (VEC_SIZE * 1)(%rdi)
> +       VMOVA   %VZERO, (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VZERO, (VEC_SIZE * 3)(%rdi)
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       cmpq    %rdi, %rdx
> +       ja      L(zfill_loop_4x_vec)
> +L(zfill_done):
> +       ret
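
The zfill blocks above use the usual overlapping-store trick: write the first
and last vector of the range unconditionally (they may overlap), then clear
the aligned interior in a loop.  A small C sketch of that strategy for the
32-byte vector case (one store per iteration here; the assembly unrolls by
four and pre-stores more of the head and tail):

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

/* Zero len bytes at dst, assuming len >= 32; smaller lengths take the
   half-vector / scalar paths in the assembly.  */
static void
zfill_sketch (char *dst, size_t len)
{
  __m256i z = _mm256_setzero_si256 ();
  char *end = dst + len;
  char *p = (char *) (((uintptr_t) dst + 32) & ~(uintptr_t) 31);

  _mm256_storeu_si256 ((__m256i *) dst, z);	  /* head, unaligned */
  _mm256_storeu_si256 ((__m256i *) (end - 32), z); /* tail, unaligned */
  for (; p < end - 32; p += 32)			  /* aligned interior */
    _mm256_store_si256 ((__m256i *) p, z);
}
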
> +
> +
> +       /* Less than 1x VEC case if we are not using evex masked stores.  */
> +# if !USE_EVEX_MASKED_STORE
> +       .p2align 4,, 8
> +L(copy_1x):
> +       /* Special case for copy 1x. It can be handled quickly and many
> +          buffer sizes have convenient alignment.  */
> +       VMOVU   %VMM(0), (%rdi)
> +       /* If no zeros then we are done.  */
> +       testl   %ecx, %ecx
> +       jz      L(ret_1x_1x)
> +
> +       /* Need to zfill; now we know that length <= CHAR_PER_VEC so we
> +          only handle the small case here.  */
> +       bsf     %VRCX, %VRCX
> +L(zfill_less_vec_no_bsf):
> +       /* Adjust length / dst then just zfill less_vec.  */
> +       subq    %rcx, %rdx
> +#  ifdef USE_AS_WCSCPY
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +#  else
> +       addq    %rcx, %rdi
> +#  endif
> +#  ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#  endif
> +
> +L(zfill_less_vec):
> +       cmpl    $((VEC_SIZE / 2) / CHAR_SIZE), %edx
> +       jb      L(zfill_less_half)
> +
> +       VMOVU   %VZERO_HALF, (%rdi)
> +       VMOVU   %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       ret
> +#  ifdef USE_AS_STPCPY
> +L(ret_1x_1x):
> +       leaq    CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
> +       ret
> +#  endif
> +
> +
> +#  if VEC_SIZE == 64
> +       .p2align 4,, 4
> +L(copy_32_63):
> +       /* Overfill to avoid branches.  */
> +       VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +       VMOVU   %VMM_256(0), (%rdi)
> +       VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +
> +       /* We are taking advantage of the fact that to be here we must
> +          be writing the null-term at (%rdi, %rcx), so we have a byte
> +          of leeway for overwriting.  */
> +       cmpl    %ecx, %edx
> +       ja      L(zfill_less_vec_no_bsf)
> +#   ifndef USE_AS_STPCPY
> +L(ret_1x_1x):
> +#   else
> +#    ifdef USE_AS_WCSCPY
> +       adcq    $0, %rdx
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#    endif
> +#   endif
> +       ret
> +#  endif
> +
> +       .p2align 4,, 4
> +L(copy_16_31):
> +       /* Overfill to avoid branches.  */
> +       vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +       VMOVU   %VMM_128(0), (%rdi)
> +       vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       cmpl    %ecx, %edx
> +
> +       /* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
> +          we have a larger copy block for 32-63 so this just falls
> +          through to zfill 16-31. If VEC_SIZE == 32 then we check for
> +          full zfill of less than 1x VEC.  */
> +#  if VEC_SIZE == 64
> +       jbe     L(ret_16_31)
> +       subl    %ecx, %edx
> +#   ifdef USE_AS_WCSCPY
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +#   else
> +       addq    %rcx, %rdi
> +#   endif
> +#   ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#   endif
> +L(zfill_less_half):
> +L(zfill_less_32):
> +       cmpl    $(16 / CHAR_SIZE), %edx
> +       jb      L(zfill_less_16)
> +       VMOVU   %VZERO_128, (%rdi)
> +       VMOVU   %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +#   ifdef USE_AS_STPCPY
> +       ret
> +#   endif
> +L(ret_16_31):
> +#   ifdef USE_AS_STPCPY
> +#    ifdef USE_AS_WCSCPY
> +       adcq    $0, %rdx
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#    endif
> +#   endif
> +       ret
> +#  else
> +       /* VEC_SIZE == 32 begins.  */
> +       ja      L(zfill_less_vec_no_bsf)
> +#   ifndef USE_AS_STPCPY
> +L(ret_1x_1x):
> +#   else
> +#    ifdef USE_AS_WCSCPY
> +       adcq    $0, %rdx
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#    endif
> +#   endif
> +       ret
> +#  endif
> +
> +
> +       .p2align 4,, 4
> +L(copy_8_15):
> +       /* Overfill to avoid branches.  */
> +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> +       vmovq   %VMM_128(0), (%rdi)
> +       movq    %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_8_15)
> +       subl    %ecx, %edx
> +#  ifdef USE_AS_WCSCPY
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +#  else
> +       addq    %rcx, %rdi
> +#  endif
> +#  ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#  endif
> +       .p2align 4,, 8
> +#  if VEC_SIZE == 32
> +L(zfill_less_half):
> +#  endif
> +L(zfill_less_16):
> +       xorl    %ecx, %ecx
> +       cmpl    $(8 / CHAR_SIZE), %edx
> +       jb      L(zfill_less_8)
> +       movq    %rcx, (%rdi)
> +       movq    %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +#  ifndef USE_AS_STPCPY
> +L(ret_8_15):
> +#  endif
> +       ret
> +
> +       .p2align 4,, 8
> +L(less_1x_vec):
> +       je      L(copy_1x)
> +
> +       /* We will need `tzcnt` result for all other copy sizes.  */
> +       tzcnt   %VRCX, %VRCX
> +#  if VEC_SIZE == 64
> +       cmpl    $(32 / CHAR_SIZE), %edx
> +       jae     L(copy_32_63)
> +#  endif
> +
> +       cmpl    $(16 / CHAR_SIZE), %edx
> +       jae     L(copy_16_31)
> +
> +       cmpl    $(8 / CHAR_SIZE), %edx
> +       jae     L(copy_8_15)
> +#  ifdef USE_AS_WCSCPY
> +       testl   %ecx, %ecx
> +       jz      L(zfill_less_8_set_ret)
> +
> +       movl    (%rsi, %rdx, CHAR_SIZE), %esi
> +       vmovd   %VMM_128(0), (%rdi)
> +       movl    %esi, (%rdi, %rdx, CHAR_SIZE)
> +#   ifdef USE_AS_STPCPY
> +       cmpl    %ecx, %edx
> +L(ret_8_15):
> +       adcq    $0, %rdx
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#   endif
> +       ret
> +L(zfill_less_8_set_ret):
> +       xorl    %ecx, %ecx
> +#   ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#   endif
> +L(zfill_less_8):
> +       movl    %ecx, (%rdi)
> +       movl    %ecx, (%rdi, %rdx, CHAR_SIZE)
> +       ret
> +#  else
> +       cmpl    $3, %edx
> +       jb      L(copy_0_3)
> +       /* Overfill to avoid branches.  */
> +       movl    -3(%rsi, %rdx), %esi
> +       vmovd   %VMM_128(0), (%rdi)
> +       movl    %esi, -3(%rdi, %rdx)
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_4_7)
> +       subq    %rcx, %rdx
> +       addq    %rcx, %rdi
> +#   ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#   endif
> +       xorl    %ecx, %ecx
> +       .p2align 4,, 8
> +L(zfill_less_8):
> +       cmpl    $3, %edx
> +       jb      L(zfill_less_3)
> +       movl    %ecx, (%rdi)
> +       movl    %ecx, -3(%rdi, %rdx)
> +#   ifdef USE_AS_STPCPY
> +       ret
> +#   endif
> +
> +L(ret_4_7):
> +#   ifdef USE_AS_STPCPY
> +L(ret_8_15):
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#   endif
> +       ret
> +
> +       .p2align 4,, 4
> +L(zfill_less_3):
> +       testl   %edx, %edx
> +       jz      L(zfill_1)
> +       movw    %cx, (%rdi)
> +L(zfill_1):
> +       movb    %cl, (%rdi, %rdx)
> +       ret
> +
> +       .p2align 4,, 8
> +L(copy_0_3):
> +       vmovd   %VMM_128(0), %r8d
> +       testl   %edx, %edx
> +       jz      L(copy_1)
> +       movw    %r8w, (%rdi)
> +       cmpl    %ecx, %edx
> +       ja      L(zfill_from_1)
> +       movzbl  (%rsi, %rdx), %r8d
> +#   ifdef USE_AS_STPCPY
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +       movb    %r8b, (%rdi, %rdx)
> +       ret
> +#   endif
> +
> +L(copy_1):
> +#   ifdef USE_AS_STPCPY
> +       movl    %edx, %eax
> +       cmpl    %ecx, %edx
> +       adcq    %rdi, %rax
> +#   endif
> +#   ifdef USE_AS_WCSCPY
> +       vmovd   %VMM_128(0), (%rdi)
> +#   else
> +       movb    %r8b, (%rdi, %rdx)
> +#   endif
> +       ret
> +#  endif
> +
> +
> +#  ifndef USE_AS_WCSCPY
> +       .p2align 4,, 8
> +L(zfill_from_1):
> +#   ifdef USE_AS_STPCPY
> +       leaq    (%rdi, %rcx), %rax
> +#   endif
> +       movw    $0, -1(%rdi, %rdx)
> +       ret
> +#  endif
> +
> +       .p2align 4,, 4
> +L(zero_len):
> +       incq    %rdx
> +       jne     L(best_effort_strncpy)
> +       movq    %rdi, %rax
> +       ret
> +# endif
> +
> +
> +       .p2align 4,, 4
> +       .p2align 6,, 8
> +L(page_cross):
> +       movq    %rsi, %rax
> +       andq    $(VEC_SIZE * -1), %rax
> +       VPCMPEQ (%rax), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +# ifdef USE_AS_WCSCPY
> +       movl    %esi, %r8d
> +       shrl    $2, %r8d
> +       andl    $(CHAR_PER_VEC - 1), %r8d
> +       shrx    %VR8, %VRCX, %VRCX
> +# else
> +       shrx    %VRSI, %VRCX, %VRCX
> +# endif
> +
> +       /* Compute the number of bytes we checked.  */
> +       subl    %esi, %eax
> +       andl    $(VEC_SIZE - 1), %eax
> +# ifdef USE_AS_WCSCPY
> +       shrl    $2, %eax
> +# endif
> +
> +       /* If rax > rdx then we are finishing the copy at the end of the
> +          page.  */
> +       cmpq    %rax, %rdx
> +       jb      L(page_cross_small)
> +
> +
> +       /* If no zero-CHAR was found (rcx is zero) then continue on
> +          the normal path.  */
> +       test    %VRCX, %VRCX
> +       jz      L(page_cross_continue)
> +
> +       /* We found a zero-CHAR so we need to copy then zfill (we know
> +          we didn't cover all of the length here).  */
> +       bsf     %VRCX, %VRCX
> +L(movsb_and_zfill):
> +       incl    %ecx
> +       subq    %rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +       leaq    -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
> +# else
> +       movq    %rdi, %rax
> +# endif
> +
> +       REP_MOVS
> +# ifdef USE_AS_WCSCPY
> +       movl    $0, (%rdi)
> +# else
> +       movb    $0, (%rdi)
> +# endif
> +       jmp     L(zfill_from_page_cross)
> +
> +L(page_cross_small):
> +       tzcnt   %VRCX, %VRCX
> +       cmpl    %ecx, %edx
> +       jbe     L(page_cross_copy_only)
> +
> +       /* Do a zfill of the tail before copying.  */
> +       movq    %rdi, %r9
> +       xorl    %eax, %eax
> +
> +       movl    %ecx, %r8d
> +
> +       subl    %ecx, %edx
> +       leaq    CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> +       movl    %edx, %ecx
> +       REP_STOS
> +       movq    %r9, %rdi
> +       movl    %r8d, %edx
> +L(page_cross_copy_only):
> +       leal    1(%rdx), %ecx
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       adcl    $0, %edx
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# else
> +       movq    %rdi, %rax
> +# endif
> +       REP_MOVS
> +       ret
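
This cold path exists because the entry sequence speculatively loads a full
vector from src before the string end or length is known, which is only safe
if the load cannot touch the next page; the handler then re-reads from the
aligned base and shifts the zero-mask right by the misalignment.  A sketch of
the entry guard itself (assuming the usual 4096-byte page, as in the file):

#include <stdint.h>

#define PAGE_SIZE 4096

/* Mirrors `andl $(PAGE_SIZE - 1)` / `cmpl $(PAGE_SIZE - VEC_SIZE)`:
   true when a vec_size-byte load at src would touch the next page.  */
static inline int
load_would_cross_page (const void *src, unsigned int vec_size)
{
  return ((uintptr_t) src & (PAGE_SIZE - 1)) > PAGE_SIZE - vec_size;
}
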
> +
> +
> +L(best_effort_strncpy):
> +       movq    %rdx, %rcx
> +       xorl    %eax, %eax
> +       movq    %rdi, %r8
> +       /* The length is >= 2^63. We very much expect to segfault at
> +          rep stos. If that doesn't happen then just strcpy to finish.
> +        */
> +       REP_STOS
> +       movq    %r8, %rdi
> +       jmp     OVERFLOW_STRCPY
> +END(STRNCPY)
> +#endif
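
For reference, the contract all of the paths above implement, as a plain C
sketch (the semantics only, not the generic fallback implementation): copy at
most n CHARs from src, zero-fill the rest of the n-CHAR destination, and for
the stpncpy/wcpncpy variants return dst + min(strlen(src), n), which is what
the USE_AS_STPCPY adjustments compute.

#include <stddef.h>

/* Plain C sketch of the strncpy contract implemented above.  */
char *
strncpy_ref (char *dst, const char *src, size_t n)
{
  size_t i = 0;

  for (; i < n && src[i] != '\0'; i++)	/* the copy paths */
    dst[i] = src[i];
  for (; i < n; i++)			/* the zfill paths */
    dst[i] = '\0';
  return dst;
}
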
> diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> new file mode 100644
> index 0000000000..d5ff4cbe50
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> @@ -0,0 +1,65 @@
> +#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
> +#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
> +
> +#if defined USE_MULTIARCH && IS_IN(libc)
> +#  define UNDERSCORES __
> +#  ifdef USE_WITH_SSE2
> +#    define ISA_EXT _sse2
> +#  elif defined USE_WITH_AVX
> +#    ifdef USE_WITH_RTM
> +#      define ISA_EXT _avx_rtm
> +#    else
> +#      define ISA_EXT _avx
> +#    endif
> +#  elif defined USE_WITH_AVX2
> +#    ifdef USE_WITH_RTM
> +#      define ISA_EXT _avx2_rtm
> +#    else
> +#      define ISA_EXT _avx2
> +#    endif
> +
> +#  elif defined USE_WITH_EVEX256
> +#    define ISA_EXT _evex
> +#  elif defined USE_WITH_EVEX512
> +#    define ISA_EXT _evex512
> +#  endif
> +#else
> +#  define UNDERSCORES
> +#  define ISA_EXT
> +#endif
> +
> +#ifdef USE_AS_WCSCPY
> +#  define STRCPY_PREFIX wc
> +#  define STRCAT_PREFIX wcs
> +#  ifdef USE_AS_STPCPY
> +#    define STRCPY_POSTFIX pcpy
> +#  else
> +#    define STRCPY_POSTFIX scpy
> +#  endif
> +#else
> +#  define STRCPY_PREFIX st
> +#  define STRCAT_PREFIX str
> +#  ifdef USE_AS_STPCPY
> +#    define STRCPY_POSTFIX pcpy
> +#  else
> +#    define STRCPY_POSTFIX rcpy
> +#  endif
> +#endif
> +#define STRCAT_POSTFIX cat
> +
> +#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
> +  underscores##prefix##postfix##ext
> +
> +#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
> +
> +#ifndef OVERFLOW_STRCPY
> +#  define OVERFLOW_STRCPY                                                     \
> +    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
> +#endif
> +
> +#ifndef OVERFLOW_STRCAT
> +#  define OVERFLOW_STRCAT                                                     \
> +    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
> +#endif
> +
> +#endif
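
The extra expansion through OF_NAMER is the standard two-level token-paste
idiom: it forces UNDERSCORES / ISA_EXT (and the prefix/postfix macros) to be
expanded before pasting.  A minimal standalone reproduction of the idiom, with
the names this header produces for the EVEX256 build noted in the comment
(sketch only):

/* For the EVEX256 build the header expands to:
     OVERFLOW_STRCPY -> __strcpy_evex  (or __stpcpy_evex, __wcscpy_evex,
                                        __wcpcpy_evex)
     OVERFLOW_STRCAT -> __strcat_evex  (or __wcscat_evex)  */
#define PASTE4_(a, b, c, d) a##b##c##d
#define PASTE4(...) PASTE4_ (__VA_ARGS__)

#define UNDERSCORES __
#define ISA_EXT _evex
/* Expands to the identifier __strcpy_evex.  */
extern char *PASTE4 (UNDERSCORES, str, cpy, ISA_EXT) (char *, const char *);
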
> --
> 2.34.1
>

[-- Attachment #2: strcpy-evex-results.tar.gz --]
[-- Type: application/gzip, Size: 102214 bytes --]

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v1 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json
  2022-11-03  8:53 [PATCH v1 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json Noah Goldstein
                   ` (2 preceding siblings ...)
  2022-11-03  8:53 ` [PATCH v1 4/4] x86: Add optimized functions for the wide-character strcpy family Noah Goldstein
@ 2022-11-03  9:06 ` Noah Goldstein
  2022-11-04  8:20 ` [PATCH v2 " Noah Goldstein
  2022-11-04 20:13 ` [PATCH v3 1/5] " Noah Goldstein
  5 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-03  9:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: hjl.tools, carlos

On Thu, Nov 3, 2022 at 1:53 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Json output is easier to parse and most other benchmarks already do
> the same.
> ---
>  benchtests/bench-strcat.c  | 106 ++++++++++++++++++++++++++-------
>  benchtests/bench-strcpy.c  | 118 ++++++++++++++++++++++++-------------
>  benchtests/bench-strncat.c |  94 +++++++++++++++++++++--------
>  benchtests/bench-strncpy.c |  94 +++++++++++++++++++++--------
>  4 files changed, 297 insertions(+), 115 deletions(-)
>
> diff --git a/benchtests/bench-strcat.c b/benchtests/bench-strcat.c
> index 749318e37e..890b34b4c1 100644
> --- a/benchtests/bench-strcat.c
> +++ b/benchtests/bench-strcat.c
> @@ -35,6 +35,7 @@
>  # define SMALL_CHAR 1273
>  #endif /* WIDE */
>
> +#include "json-lib.h"
>
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
>
> @@ -49,7 +50,7 @@ IMPL (STRCAT, 1)
>  IMPL (generic_strcat, 0)
>
>  static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src)
>  {
>    size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS_LARGE;
>    timing_t start, stop, cur;
> @@ -64,7 +65,8 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
>
>    if (STRCMP (dst + k, src) != 0)
>      {
> -      error (0, 0, "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
> +      error (0, 0,
> +            "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
>              impl->name, dst, src);
>        ret = 1;
>        return;
> @@ -80,11 +82,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
> +        size_t len2, int max_char)
>  {
>    size_t i;
>    CHAR *s1, *s2;
> @@ -107,53 +110,112 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
>    for (i = 0; i < len2; i++)
>      s2[i] = 32 + 23 * i % (max_char - 32);
>
> -  printf ("Length %4zd/%4zd, alignment %2zd/%2zd:", len1, len2, align1, align2);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "len1", len1);
> +  json_attr_uint (json_ctx, "len2", len2);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
>      {
>        s2[len2] = '\0';
> -      do_one_test (impl, s2, s1);
> +      do_one_test (json_ctx, impl, s2, s1);
>      }
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i;
>
>    test_init ();
>
> -  printf ("%28s", "");
> +  test_init ();
> +
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
>    for (i = 0; i < 16; ++i)
>      {
> -      do_test (0, 0, i, i, SMALL_CHAR);
> -      do_test (0, 0, i, i, BIG_CHAR);
> -      do_test (0, i, i, i, SMALL_CHAR);
> -      do_test (i, 0, i, i, BIG_CHAR);
> +      do_test (&json_ctx, 0, 0, i, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, i, BIG_CHAR);
> +      do_test (&json_ctx, 0, i, i, i, SMALL_CHAR);
> +      do_test (&json_ctx, i, 0, i, i, BIG_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 0, 8 << i, 8 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
> -      do_test (0, 0, 8 << i, 2 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 8 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 8 << i, 2 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (i, 2 * i, 8 << i, 1, SMALL_CHAR);
> -      do_test (2 * i, i, 8 << i, 1, BIG_CHAR);
> -      do_test (i, i, 8 << i, 10, SMALL_CHAR);
> -      do_test (i, i, 8 << i, 10, BIG_CHAR);
> +      do_test (&json_ctx, i, 2 * i, 8 << i, 1, SMALL_CHAR);
> +      do_test (&json_ctx, 2 * i, i, 8 << i, 1, BIG_CHAR);
> +      do_test (&json_ctx, i, i, 8 << i, 10, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, 8 << i, 10, BIG_CHAR);
> +    }
> +
> +  for (i = 32; i < 256; i += 32)
> +    {
> +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
>      }
>
> +  for (; i < 512; i += 64)
> +    {
> +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
> +    }
> +
> +  for (; i < 1024; i += 128)
> +    {
> +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
> +    }
> +
> +  for (; i < 2048; i += 256)
> +    {
> +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
>
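
With these calls each benchmark emits output roughly of the following shape
(a sketch; the actual timing_type string, ifunc names and values depend on
the build and machine):

{
 "timing_type": "...",
 "functions": {
  "strcat": {
   "bench-variant": "",
   "ifuncs": ["__strcat_avx2", "..."],
   "results": [
    {"align1": 0, "align2": 0, "len1": 1, "len2": 1, "max_char": 127,
     "timings": [12.3, 11.9]},
    "..."
   ]
  }
 }
}
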
> diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
> index 29deb8a46a..af8673e137 100644
> --- a/benchtests/bench-strcpy.c
> +++ b/benchtests/bench-strcpy.c
> @@ -26,16 +26,18 @@
>  # define SMALL_CHAR 127
>  #endif
>
> +#include "json-lib.h"
> +
>  #ifndef STRCPY_RESULT
>  # define STRCPY_RESULT(dst, len) dst
>  # define TEST_MAIN
>  # ifndef WIDE
> -#  define TEST_NAME "strcpy"
> +#   define TEST_NAME "strcpy"
>  # else
> -#  define TEST_NAME "wcscpy"
> -#  define generic_strcpy generic_wcscpy
> +#   define TEST_NAME "wcscpy"
> +#   define generic_strcpy generic_wcscpy
>  # endif
> -#include "bench-string.h"
> +# include "bench-string.h"
>
>  CHAR *
>  generic_strcpy (CHAR *dst, const CHAR *src)
> @@ -51,8 +53,8 @@ IMPL (generic_strcpy, 0)
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
>
>  static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
> -            size_t len __attribute__((unused)))
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
> +            size_t len __attribute__ ((unused)))
>  {
>    size_t i, iters = INNER_LOOP_ITERS;
>    timing_t start, stop, cur;
> @@ -77,23 +79,24 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
>    TIMING_NOW (start);
>    for (i = 0; i < iters; ++i)
>      {
> -         CALL (impl, dst, src);
> +      CALL (impl, dst, src);
>      }
>    TIMING_NOW (stop);
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t len, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
> +        int max_char)
>  {
>    size_t i;
>    CHAR *s1, *s2;
> -/* For wcscpy: align1 and align2 here mean alignment not in bytes,
> -   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> -   len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
> +  /* For wcscpy: align1 and align2 here mean alignment not in bytes,
> +     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> +     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
>    align1 &= 7;
>    if ((align1 + len) * sizeof (CHAR) >= page_size)
>      return;
> @@ -109,65 +112,96 @@ do_test (size_t align1, size_t align2, size_t len, int max_char)
>      s1[i] = 32 + 23 * i % (max_char - 32);
>    s1[len] = 0;
>
> -  printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len,
> -         align1 * sizeof (CHAR), align2 * sizeof (CHAR));
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "len", len);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, s2, s1, len);
> +    do_one_test (json_ctx, impl, s2, s1, len);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i;
>
>    test_init ();
>
> -  printf ("%23s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
>    for (i = 0; i < 16; ++i)
>      {
> -      do_test (0, 0, i, SMALL_CHAR);
> -      do_test (0, 0, i, BIG_CHAR);
> -      do_test (0, i, i, SMALL_CHAR);
> -      do_test (i, 0, i, BIG_CHAR);
> +      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, BIG_CHAR);
> +      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
> +      do_test (&json_ctx, i, 0, i, BIG_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 0, 8 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, SMALL_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (i, 2 * i, 8 << i, SMALL_CHAR);
> -      do_test (2 * i, i, 8 << i, BIG_CHAR);
> -      do_test (i, i, 8 << i, SMALL_CHAR);
> -      do_test (i, i, 8 << i, BIG_CHAR);
> +      do_test (&json_ctx, i, 2 * i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 2 * i, i, 8 << i, BIG_CHAR);
> +      do_test (&json_ctx, i, i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, 8 << i, BIG_CHAR);
>      }
>
> -  for (i = 16; i <= 512; i+=4)
> +  for (i = 16; i <= 512; i += 4)
>      {
> -      do_test (0, 4, i, SMALL_CHAR);
> -      do_test (4, 0, i, BIG_CHAR);
> -      do_test (4, 4, i, SMALL_CHAR);
> -      do_test (2, 2, i, BIG_CHAR);
> -      do_test (2, 6, i, SMALL_CHAR);
> -      do_test (6, 2, i, BIG_CHAR);
> -      do_test (1, 7, i, SMALL_CHAR);
> -      do_test (7, 1, i, BIG_CHAR);
> -      do_test (3, 4, i, SMALL_CHAR);
> -      do_test (4, 3, i, BIG_CHAR);
> -      do_test (5, 7, i, SMALL_CHAR);
> -      do_test (7, 5, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 4, i, SMALL_CHAR);
> +      do_test (&json_ctx, 4, 0, i, BIG_CHAR);
> +      do_test (&json_ctx, 4, 4, i, SMALL_CHAR);
> +      do_test (&json_ctx, 2, 2, i, BIG_CHAR);
> +      do_test (&json_ctx, 2, 6, i, SMALL_CHAR);
> +      do_test (&json_ctx, 6, 2, i, BIG_CHAR);
> +      do_test (&json_ctx, 1, 7, i, SMALL_CHAR);
> +      do_test (&json_ctx, 7, 1, i, BIG_CHAR);
> +      do_test (&json_ctx, 3, 4, i, SMALL_CHAR);
> +      do_test (&json_ctx, 4, 3, i, BIG_CHAR);
> +      do_test (&json_ctx, 5, 7, i, SMALL_CHAR);
> +      do_test (&json_ctx, 7, 5, i, SMALL_CHAR);
>      }
>
> +  for (i = 1; i < 2048; i += i)
> +    {
> +      do_test (&json_ctx, 1, 0, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, SMALL_CHAR);
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
>
> diff --git a/benchtests/bench-strncat.c b/benchtests/bench-strncat.c
> index b148c55279..5ccc09a4f8 100644
> --- a/benchtests/bench-strncat.c
> +++ b/benchtests/bench-strncat.c
> @@ -33,6 +33,8 @@
>  # define SMALL_CHAR 1273
>  #endif /* WIDE */
>
> +#include "json-lib.h"
> +
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
>
>  CHAR *
> @@ -49,7 +51,8 @@ IMPL (STRNCAT, 2)
>  IMPL (generic_strncat, 0)
>
>  static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
> +            size_t n)
>  {
>    size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS8;
>    timing_t start, stop, cur;
> @@ -65,8 +68,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
>    size_t len = STRLEN (src);
>    if (MEMCMP (dst + k, src, len + 1 > n ? n : len + 1) != 0)
>      {
> -      error (0, 0, "Incorrect concatenation in function %s",
> -            impl->name);
> +      error (0, 0, "Incorrect concatenation in function %s", impl->name);
>        ret = 1;
>        return;
>      }
> @@ -88,12 +90,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t len1, size_t len2,
> -        size_t n, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
> +        size_t len2, size_t n, int max_char)
>  {
>    size_t i;
>    CHAR *s1, *s2;
> @@ -118,53 +120,93 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
>    for (i = 0; i < len2; i++)
>      s2[i] = 32 + 23 * i % (max_char - 32);
>
> -  printf ("Length %4zd/%4zd, alignment %2zd/%2zd, N %4zd:",
> -         len1, len2, align1, align2, n);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "len1", len1);
> +  json_attr_uint (json_ctx, "len2", len2);
> +  json_attr_uint (json_ctx, "n", n);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
>      {
>        s2[len2] = '\0';
> -      do_one_test (impl, s2, s1, n);
> +      do_one_test (json_ctx, impl, s2, s1, n);
>      }
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i, n;
>
>    test_init ();
>
> -  printf ("%28s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
> -  for (n = 2; n <= 2048; n*=4)
> +  for (n = 2; n <= 2048; n *= 4)
>      {
> -      do_test (0, 2, 2, 2, n, SMALL_CHAR);
> -      do_test (0, 0, 4, 4, n, SMALL_CHAR);
> -      do_test (4, 0, 4, 4, n, BIG_CHAR);
> -      do_test (0, 0, 8, 8, n, SMALL_CHAR);
> -      do_test (0, 8, 8, 8, n, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 2, 2, 2, n, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 4, 4, n, SMALL_CHAR);
> +      do_test (&json_ctx, 4, 0, 4, 4, n, BIG_CHAR);
> +      do_test (&json_ctx, 0, 0, 8, 8, n, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 8, 8, 8, n, SMALL_CHAR);
>
>        for (i = 1; i < 8; ++i)
>         {
> -         do_test (0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
> -         do_test (8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
> -         do_test (0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
> -         do_test (8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
>         }
>
>        for (i = 1; i < 8; ++i)
>         {
> -         do_test (i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
> -         do_test (2 * i, i, 8 << i, 1, n, BIG_CHAR);
> -         do_test (i, i, 8 << i, 10, n, SMALL_CHAR);
> +         do_test (&json_ctx, i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
> +         do_test (&json_ctx, 2 * i, i, 8 << i, 1, n, BIG_CHAR);
> +         do_test (&json_ctx, i, i, 8 << i, 10, n, SMALL_CHAR);
>         }
>      }
>
> +  for (i = 128; i < 2048; i += i)
> +    {
> +      for (n = i - 64; n <= i + 64; n += 32)
> +       {
> +         do_test (&json_ctx, 1, 0, i, i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 0, i, i, i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, i, i, n, SMALL_CHAR);
> +         do_test (&json_ctx, i, i, i, i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 1, 0, i, n, i, SMALL_CHAR);
> +         do_test (&json_ctx, 0, i, i, n, i, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, i, n, i, SMALL_CHAR);
> +         do_test (&json_ctx, i, i, i, n, i, SMALL_CHAR);
> +       }
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
> diff --git a/benchtests/bench-strncpy.c b/benchtests/bench-strncpy.c
> index 8207d99f4d..f621cbfe09 100644
> --- a/benchtests/bench-strncpy.c
> +++ b/benchtests/bench-strncpy.c
> @@ -24,6 +24,8 @@
>  # define SMALL_CHAR 127
>  #endif /* !WIDE */
>
> +#include "json-lib.h"
> +
>  #ifndef STRNCPY_RESULT
>  # define STRNCPY_RESULT(dst, len, n) dst
>  # define TEST_MAIN
> @@ -52,7 +54,8 @@ IMPL (generic_strncpy, 0)
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
>
>  static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
> +            size_t len, size_t n)
>  {
>    size_t i, iters = INNER_LOOP_ITERS_LARGE * (4 / CHARBYTES);
>    timing_t start, stop, cur;
> @@ -77,7 +80,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
>        size_t i;
>
>        for (i = len; i < n; ++i)
> -       if (dst [i] != '\0')
> +       if (dst[i] != '\0')
>           {
>             error (0, 0, "Wrong result in function %s", impl->name);
>             ret = 1;
> @@ -94,17 +97,18 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
> +        size_t n, int max_char)
>  {
>    size_t i;
>    CHAR *s1, *s2;
>
> -/* For wcsncpy: align1 and align2 here mean alignment not in bytes,
> -   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
> +  /* For wcsncpy: align1 and align2 here mean alignment not in bytes,
> +     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
>    align1 &= 7;
>    if ((align1 + len) * sizeof (CHAR) >= page_size)
>      return;
> @@ -123,46 +127,86 @@ do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
>         ++i)
>      s1[i] = 32 + 32 * i % (max_char - 32);
>
> -  printf ("Length %4zd, n %4zd, alignment %2zd/%2zd:", len, n, align1, align2);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "len", len);
> +  json_attr_uint (json_ctx, "n", n);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, s2, s1, len, n);
> +    do_one_test (json_ctx, impl, s2, s1, len, n);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  static int
>  test_main (void)
>  {
> -  size_t i;
> +  json_ctx_t json_ctx;
> +  size_t i, j;
>
>    test_init ();
>
> -  printf ("%28s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (i, i, 16, 16, SMALL_CHAR);
> -      do_test (i, i, 16, 16, BIG_CHAR);
> -      do_test (i, 2 * i, 16, 16, SMALL_CHAR);
> -      do_test (2 * i, i, 16, 16, BIG_CHAR);
> -      do_test (8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
> -      do_test (2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
> -      do_test (2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
> +      do_test (&json_ctx, i, i, 16, 16, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, 16, 16, BIG_CHAR);
> +      do_test (&json_ctx, i, 2 * i, 16, 16, SMALL_CHAR);
> +      do_test (&json_ctx, 2 * i, i, 16, 16, BIG_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
> +      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 0, 4 << i, 8 << i, SMALL_CHAR);
> -      do_test (0, 0, 16 << i, 8 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 4 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 16 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
>      }
>
> +  for (i = 128; i < 2048; i += i)
> +    {
> +      for (j = i - 64; j <= i + 64; j += 32)
> +       {
> +         do_test (&json_ctx, 1, 0, i, j, SMALL_CHAR);
> +         do_test (&json_ctx, 0, i, i, j, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, i, j, SMALL_CHAR);
> +         do_test (&json_ctx, i, i, i, j, SMALL_CHAR);
> +         do_test (&json_ctx, 1, 0, j, i, SMALL_CHAR);
> +         do_test (&json_ctx, 0, i, j, i, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, j, i, SMALL_CHAR);
> +         do_test (&json_ctx, i, i, j, i, SMALL_CHAR);
> +       }
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
>
> --
> 2.34.1
>

Sorry, ignore this patchset. Committed wrong impl.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v2 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json
  2022-11-03  8:53 [PATCH v1 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json Noah Goldstein
                   ` (3 preceding siblings ...)
  2022-11-03  9:06 ` [PATCH v1 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json Noah Goldstein
@ 2022-11-04  8:20 ` Noah Goldstein
  2022-11-04  8:20   ` [PATCH v2 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
                     ` (3 more replies)
  2022-11-04 20:13 ` [PATCH v3 1/5] " Noah Goldstein
  5 siblings, 4 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04  8:20 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

JSON output is easier to parse, and most other benchmarks already do
the same.
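
For reference, the emitted document has roughly the following shape
(illustrative only: the key names follow the json_attr_*/json_array_*
calls added below, the values are placeholders, and the ifunc list
depends on the machine the benchmark is run on):

  {
    "timing_type": "...",
    "functions": {
      "strcpy": {
        "bench-variant": "",
        "ifuncs": ["generic_strcpy", "__strcpy_evex", ...],
        "results": [
          { "align1": 0, "align2": 0, "len": 16, "max_char": 127,
            "timings": [ ..., ... ] },
          ...
        ]
      }
    }
  }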
---
 benchtests/bench-strcat.c  | 106 ++++++++++++++++++++++++++-------
 benchtests/bench-strcpy.c  | 118 ++++++++++++++++++++++++-------------
 benchtests/bench-strncat.c |  94 +++++++++++++++++++++--------
 benchtests/bench-strncpy.c |  94 +++++++++++++++++++++--------
 4 files changed, 297 insertions(+), 115 deletions(-)

diff --git a/benchtests/bench-strcat.c b/benchtests/bench-strcat.c
index 749318e37e..890b34b4c1 100644
--- a/benchtests/bench-strcat.c
+++ b/benchtests/bench-strcat.c
@@ -35,6 +35,7 @@
 # define SMALL_CHAR 1273
 #endif /* WIDE */
 
+#include "json-lib.h"
 
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
 
@@ -49,7 +50,7 @@ IMPL (STRCAT, 1)
 IMPL (generic_strcat, 0)
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src)
 {
   size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS_LARGE;
   timing_t start, stop, cur;
@@ -64,7 +65,8 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
 
   if (STRCMP (dst + k, src) != 0)
     {
-      error (0, 0, "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
+      error (0, 0,
+	     "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
 	     impl->name, dst, src);
       ret = 1;
       return;
@@ -80,11 +82,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
+	 size_t len2, int max_char)
 {
   size_t i;
   CHAR *s1, *s2;
@@ -107,53 +110,112 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
   for (i = 0; i < len2; i++)
     s2[i] = 32 + 23 * i % (max_char - 32);
 
-  printf ("Length %4zd/%4zd, alignment %2zd/%2zd:", len1, len2, align1, align2);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "len1", len1);
+  json_attr_uint (json_ctx, "len2", len2);
+  json_attr_uint (json_ctx, "max_char", max_char);
+
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
     {
       s2[len2] = '\0';
-      do_one_test (impl, s2, s1);
+      do_one_test (json_ctx, impl, s2, s1);
     }
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
+  json_ctx_t json_ctx;
   size_t i;
 
   test_init ();
 
-  printf ("%28s", "");
+  test_init ();
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
   for (i = 0; i < 16; ++i)
     {
-      do_test (0, 0, i, i, SMALL_CHAR);
-      do_test (0, 0, i, i, BIG_CHAR);
-      do_test (0, i, i, i, SMALL_CHAR);
-      do_test (i, 0, i, i, BIG_CHAR);
+      do_test (&json_ctx, 0, 0, i, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, i, BIG_CHAR);
+      do_test (&json_ctx, 0, i, i, i, SMALL_CHAR);
+      do_test (&json_ctx, i, 0, i, i, BIG_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 0, 8 << i, 8 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
-      do_test (0, 0, 8 << i, 2 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 8 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 8 << i, 2 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (i, 2 * i, 8 << i, 1, SMALL_CHAR);
-      do_test (2 * i, i, 8 << i, 1, BIG_CHAR);
-      do_test (i, i, 8 << i, 10, SMALL_CHAR);
-      do_test (i, i, 8 << i, 10, BIG_CHAR);
+      do_test (&json_ctx, i, 2 * i, 8 << i, 1, SMALL_CHAR);
+      do_test (&json_ctx, 2 * i, i, 8 << i, 1, BIG_CHAR);
+      do_test (&json_ctx, i, i, 8 << i, 10, SMALL_CHAR);
+      do_test (&json_ctx, i, i, 8 << i, 10, BIG_CHAR);
+    }
+
+  for (i = 32; i < 256; i += 32)
+    {
+      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
     }
 
+  for (; i < 512; i += 64)
+    {
+      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
+    }
+
+  for (; i < 1024; i += 128)
+    {
+      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
+    }
+
+  for (; i < 2048; i += 256)
+    {
+      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
 
diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
index 29deb8a46a..af8673e137 100644
--- a/benchtests/bench-strcpy.c
+++ b/benchtests/bench-strcpy.c
@@ -26,16 +26,18 @@
 # define SMALL_CHAR 127
 #endif
 
+#include "json-lib.h"
+
 #ifndef STRCPY_RESULT
 # define STRCPY_RESULT(dst, len) dst
 # define TEST_MAIN
 # ifndef WIDE
-#  define TEST_NAME "strcpy"
+#   define TEST_NAME "strcpy"
 # else
-#  define TEST_NAME "wcscpy"
-#  define generic_strcpy generic_wcscpy
+#   define TEST_NAME "wcscpy"
+#   define generic_strcpy generic_wcscpy
 # endif
-#include "bench-string.h"
+# include "bench-string.h"
 
 CHAR *
 generic_strcpy (CHAR *dst, const CHAR *src)
@@ -51,8 +53,8 @@ IMPL (generic_strcpy, 0)
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
-	     size_t len __attribute__((unused)))
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
+	     size_t len __attribute__ ((unused)))
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
@@ -77,23 +79,24 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
-	  CALL (impl, dst, src);
+      CALL (impl, dst, src);
     }
   TIMING_NOW (stop);
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t len, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
+	 int max_char)
 {
   size_t i;
   CHAR *s1, *s2;
-/* For wcscpy: align1 and align2 here mean alignment not in bytes,
-   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
-   len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
+  /* For wcscpy: align1 and align2 here mean alignment not in bytes,
+     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
+     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
   align1 &= 7;
   if ((align1 + len) * sizeof (CHAR) >= page_size)
     return;
@@ -109,65 +112,96 @@ do_test (size_t align1, size_t align2, size_t len, int max_char)
     s1[i] = 32 + 23 * i % (max_char - 32);
   s1[len] = 0;
 
-  printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len,
-	  align1 * sizeof (CHAR), align2 * sizeof (CHAR));
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "len", len);
+  json_attr_uint (json_ctx, "max_char", max_char);
+
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s2, s1, len);
+    do_one_test (json_ctx, impl, s2, s1, len);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
+  json_ctx_t json_ctx;
   size_t i;
 
   test_init ();
 
-  printf ("%23s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
   for (i = 0; i < 16; ++i)
     {
-      do_test (0, 0, i, SMALL_CHAR);
-      do_test (0, 0, i, BIG_CHAR);
-      do_test (0, i, i, SMALL_CHAR);
-      do_test (i, 0, i, BIG_CHAR);
+      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, BIG_CHAR);
+      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
+      do_test (&json_ctx, i, 0, i, BIG_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 0, 8 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, SMALL_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (i, 2 * i, 8 << i, SMALL_CHAR);
-      do_test (2 * i, i, 8 << i, BIG_CHAR);
-      do_test (i, i, 8 << i, SMALL_CHAR);
-      do_test (i, i, 8 << i, BIG_CHAR);
+      do_test (&json_ctx, i, 2 * i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 2 * i, i, 8 << i, BIG_CHAR);
+      do_test (&json_ctx, i, i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, i, i, 8 << i, BIG_CHAR);
     }
 
-  for (i = 16; i <= 512; i+=4)
+  for (i = 16; i <= 512; i += 4)
     {
-      do_test (0, 4, i, SMALL_CHAR);
-      do_test (4, 0, i, BIG_CHAR);
-      do_test (4, 4, i, SMALL_CHAR);
-      do_test (2, 2, i, BIG_CHAR);
-      do_test (2, 6, i, SMALL_CHAR);
-      do_test (6, 2, i, BIG_CHAR);
-      do_test (1, 7, i, SMALL_CHAR);
-      do_test (7, 1, i, BIG_CHAR);
-      do_test (3, 4, i, SMALL_CHAR);
-      do_test (4, 3, i, BIG_CHAR);
-      do_test (5, 7, i, SMALL_CHAR);
-      do_test (7, 5, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 4, i, SMALL_CHAR);
+      do_test (&json_ctx, 4, 0, i, BIG_CHAR);
+      do_test (&json_ctx, 4, 4, i, SMALL_CHAR);
+      do_test (&json_ctx, 2, 2, i, BIG_CHAR);
+      do_test (&json_ctx, 2, 6, i, SMALL_CHAR);
+      do_test (&json_ctx, 6, 2, i, BIG_CHAR);
+      do_test (&json_ctx, 1, 7, i, SMALL_CHAR);
+      do_test (&json_ctx, 7, 1, i, BIG_CHAR);
+      do_test (&json_ctx, 3, 4, i, SMALL_CHAR);
+      do_test (&json_ctx, 4, 3, i, BIG_CHAR);
+      do_test (&json_ctx, 5, 7, i, SMALL_CHAR);
+      do_test (&json_ctx, 7, 5, i, SMALL_CHAR);
     }
 
+  for (i = 1; i < 2048; i += i)
+    {
+      do_test (&json_ctx, 1, 0, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, SMALL_CHAR);
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
 
diff --git a/benchtests/bench-strncat.c b/benchtests/bench-strncat.c
index b148c55279..5ccc09a4f8 100644
--- a/benchtests/bench-strncat.c
+++ b/benchtests/bench-strncat.c
@@ -33,6 +33,8 @@
 # define SMALL_CHAR 1273
 #endif /* WIDE */
 
+#include "json-lib.h"
+
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
 
 CHAR *
@@ -49,7 +51,8 @@ IMPL (STRNCAT, 2)
 IMPL (generic_strncat, 0)
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
+	     size_t n)
 {
   size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS8;
   timing_t start, stop, cur;
@@ -65,8 +68,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
   size_t len = STRLEN (src);
   if (MEMCMP (dst + k, src, len + 1 > n ? n : len + 1) != 0)
     {
-      error (0, 0, "Incorrect concatenation in function %s",
-	     impl->name);
+      error (0, 0, "Incorrect concatenation in function %s", impl->name);
       ret = 1;
       return;
     }
@@ -88,12 +90,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t len1, size_t len2,
-	 size_t n, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
+	 size_t len2, size_t n, int max_char)
 {
   size_t i;
   CHAR *s1, *s2;
@@ -118,53 +120,93 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
   for (i = 0; i < len2; i++)
     s2[i] = 32 + 23 * i % (max_char - 32);
 
-  printf ("Length %4zd/%4zd, alignment %2zd/%2zd, N %4zd:",
-	  len1, len2, align1, align2, n);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "len1", len1);
+  json_attr_uint (json_ctx, "len2", len2);
+  json_attr_uint (json_ctx, "n", n);
+  json_attr_uint (json_ctx, "max_char", max_char);
+
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
     {
       s2[len2] = '\0';
-      do_one_test (impl, s2, s1, n);
+      do_one_test (json_ctx, impl, s2, s1, n);
     }
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 main (void)
 {
+  json_ctx_t json_ctx;
   size_t i, n;
 
   test_init ();
 
-  printf ("%28s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
-  for (n = 2; n <= 2048; n*=4)
+  for (n = 2; n <= 2048; n *= 4)
     {
-      do_test (0, 2, 2, 2, n, SMALL_CHAR);
-      do_test (0, 0, 4, 4, n, SMALL_CHAR);
-      do_test (4, 0, 4, 4, n, BIG_CHAR);
-      do_test (0, 0, 8, 8, n, SMALL_CHAR);
-      do_test (0, 8, 8, 8, n, SMALL_CHAR);
+      do_test (&json_ctx, 0, 2, 2, 2, n, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 4, 4, n, SMALL_CHAR);
+      do_test (&json_ctx, 4, 0, 4, 4, n, BIG_CHAR);
+      do_test (&json_ctx, 0, 0, 8, 8, n, SMALL_CHAR);
+      do_test (&json_ctx, 0, 8, 8, 8, n, SMALL_CHAR);
 
       for (i = 1; i < 8; ++i)
 	{
-	  do_test (0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
-	  do_test (8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
-	  do_test (0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
-	  do_test (8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
 	}
 
       for (i = 1; i < 8; ++i)
 	{
-	  do_test (i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
-	  do_test (2 * i, i, 8 << i, 1, n, BIG_CHAR);
-	  do_test (i, i, 8 << i, 10, n, SMALL_CHAR);
+	  do_test (&json_ctx, i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
+	  do_test (&json_ctx, 2 * i, i, 8 << i, 1, n, BIG_CHAR);
+	  do_test (&json_ctx, i, i, 8 << i, 10, n, SMALL_CHAR);
 	}
     }
 
+  for (i = 128; i < 2048; i += i)
+    {
+      for (n = i - 64; n <= i + 64; n += 32)
+	{
+	  do_test (&json_ctx, 1, 0, i, i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 0, i, i, i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, i, i, n, SMALL_CHAR);
+	  do_test (&json_ctx, i, i, i, i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 1, 0, i, n, i, SMALL_CHAR);
+	  do_test (&json_ctx, 0, i, i, n, i, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, i, n, i, SMALL_CHAR);
+	  do_test (&json_ctx, i, i, i, n, i, SMALL_CHAR);
+	}
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
diff --git a/benchtests/bench-strncpy.c b/benchtests/bench-strncpy.c
index 8207d99f4d..f621cbfe09 100644
--- a/benchtests/bench-strncpy.c
+++ b/benchtests/bench-strncpy.c
@@ -24,6 +24,8 @@
 # define SMALL_CHAR 127
 #endif /* !WIDE */
 
+#include "json-lib.h"
+
 #ifndef STRNCPY_RESULT
 # define STRNCPY_RESULT(dst, len, n) dst
 # define TEST_MAIN
@@ -52,7 +54,8 @@ IMPL (generic_strncpy, 0)
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
+	     size_t len, size_t n)
 {
   size_t i, iters = INNER_LOOP_ITERS_LARGE * (4 / CHARBYTES);
   timing_t start, stop, cur;
@@ -77,7 +80,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
       size_t i;
 
       for (i = len; i < n; ++i)
-	if (dst [i] != '\0')
+	if (dst[i] != '\0')
 	  {
 	    error (0, 0, "Wrong result in function %s", impl->name);
 	    ret = 1;
@@ -94,17 +97,18 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
+	 size_t n, int max_char)
 {
   size_t i;
   CHAR *s1, *s2;
 
-/* For wcsncpy: align1 and align2 here mean alignment not in bytes,
-   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
+  /* For wcsncpy: align1 and align2 here mean alignment not in bytes,
+     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
   align1 &= 7;
   if ((align1 + len) * sizeof (CHAR) >= page_size)
     return;
@@ -123,46 +127,86 @@ do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
        ++i)
     s1[i] = 32 + 32 * i % (max_char - 32);
 
-  printf ("Length %4zd, n %4zd, alignment %2zd/%2zd:", len, n, align1, align2);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "len", len);
+  json_attr_uint (json_ctx, "n", n);
+  json_attr_uint (json_ctx, "max_char", max_char);
+
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s2, s1, len, n);
+    do_one_test (json_ctx, impl, s2, s1, len, n);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 static int
 test_main (void)
 {
-  size_t i;
+  json_ctx_t json_ctx;
+  size_t i, j;
 
   test_init ();
 
-  printf ("%28s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (i, i, 16, 16, SMALL_CHAR);
-      do_test (i, i, 16, 16, BIG_CHAR);
-      do_test (i, 2 * i, 16, 16, SMALL_CHAR);
-      do_test (2 * i, i, 16, 16, BIG_CHAR);
-      do_test (8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
-      do_test (2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
-      do_test (2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
+      do_test (&json_ctx, i, i, 16, 16, SMALL_CHAR);
+      do_test (&json_ctx, i, i, 16, 16, BIG_CHAR);
+      do_test (&json_ctx, i, 2 * i, 16, 16, SMALL_CHAR);
+      do_test (&json_ctx, 2 * i, i, 16, 16, BIG_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
+      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
+      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 0, 4 << i, 8 << i, SMALL_CHAR);
-      do_test (0, 0, 16 << i, 8 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 4 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 16 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
     }
 
+  for (i = 128; i < 2048; i += i)
+    {
+      for (j = i - 64; j <= i + 64; j += 32)
+	{
+	  do_test (&json_ctx, 1, 0, i, j, SMALL_CHAR);
+	  do_test (&json_ctx, 0, i, i, j, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, i, j, SMALL_CHAR);
+	  do_test (&json_ctx, i, i, i, j, SMALL_CHAR);
+	  do_test (&json_ctx, 1, 0, j, i, SMALL_CHAR);
+	  do_test (&json_ctx, 0, i, j, i, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, j, i, SMALL_CHAR);
+	  do_test (&json_ctx, i, i, j, i, SMALL_CHAR);
+	}
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v2 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-04  8:20 ` [PATCH v2 " Noah Goldstein
@ 2022-11-04  8:20   ` Noah Goldstein
  2022-11-04 16:33     ` H.J. Lu
  2022-11-04  8:20   ` [PATCH v2 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04  8:20 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
    1. Use more overlapping stores to avoid branches.
    2. Reduce how unrolled the aligning copies are (this is more of a
       code-size save; it's a negative for some sizes in terms of
       perf).
    3. Improve the loop a bit (similar to what we do in strlen with
       2x vpminu + kortest instead of 3x vpminu + kmov + test; see
       the sketch after this list).
    4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
       number that are taken.
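
    As a rough illustration of (3), the zero-check in the new loops
    boils down to the following (taken from strcat-strlen-evex.S
    further down in the patch; VMOVA/VPMIN/VPTESTN/KORTEST and VMM()
    are the macro names used there):

	L(loop_2x_vec):
		VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(0)
		VPMIN	(VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
		VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(2)
		VPMIN	(VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
		VPTESTN	%VMM(1), %VMM(1), %k1	/* bits where min(v0,v1) == 0, i.e. a null */
		VPTESTN	%VMM(3), %VMM(3), %k3	/* bits where min(v2,v3) == 0 */
		subq	$(VEC_SIZE * -4), %rdi
		KORTEST	%k1, %k3		/* one OR + flags test of both masks */
		jz	L(loop_2x_vec)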

Performance Changes:

    Times are from N = 10 runs of the benchmark suite and are
    reported as the geometric mean of all ratios of
    New Implementation / Old Implementation.
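
    (Concretely, over the K measured ratios r_i =
     time_new_i / time_old_i the reported value is
     (r_1 * r_2 * ... * r_K)^(1/K), so a number below 1.0 means the
     new implementation is faster on average.)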

    stpcpy-evex      -> 0.922
    strcat-evex      -> 0.985
    strcpy-evex      -> 0.880

    strncpy-evex     -> 0.831
    stpncpy-evex     -> 0.780

    strncat-evex     -> 0.958

Code Size Changes:
    function         -> Bytes New / Bytes Old -> Ratio

    strcat-evex      -> 819  / 1874 -> 0.437
    strcpy-evex      -> 700  / 1074 -> 0.652
    stpcpy-evex      -> 735  / 1094 -> 0.672

    strncpy-evex     -> 1397 / 2611 -> 0.535
    stpncpy-evex     -> 1489 / 2691 -> 0.553

    strncat-evex     -> 1184 / 2832 -> 0.418

Notes:
    1. Because of the significant difference between the
       implementations, they are split into three files.

           strcpy-evex.S    -> strcpy, stpcpy, strcat
           strncpy-evex.S   -> strncpy
           strncat-evex.S   -> strncat

       I couldn't find a way to merge them without making the
       ifdefs incredibly difficult to follow.

    2. All implementations can be made evex512 by including
       "x86-evex512-vecs.h" at the top.

    3. All implementations have an optional define:
        `USE_EVEX_MASKED_STORE`
       Setting to one uses evex-masked stores for handling short
       strings.  This saves code size and branches.  It's disabled
       for all implementations at the moment, as there are some
       serious drawbacks to masked stores in certain cases, but
       that may be fixed on future architectures.
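
    For example, regarding (2), a 512-bit variant could be produced
    by a small wrapper along these lines (hypothetical file, not part
    of this patch; the symbol name is made up):

	#include "x86-evex512-vecs.h"
	#define STRCPY	__strcpy_evex512
	#include "strcpy-evex.S"

    And regarding (3), with USE_EVEX_MASKED_STORE set to 1 the
    short-string path in the byte variants is essentially (sketch of
    the code in strcpy-evex.S below):

	VPTEST	%VMM(0), %VMM(0), %k0	/* k0: one bit per non-null char */
	KMOV	%k0, %VRCX
	inc	%VRCX			/* all-ones -> 0: no null in this vector */
	jz	L(more_1x_vec)
	KMOV	%VRCX, %k1
	KXOR	%k0, %k1, %k1		/* k1: chars up to and including the null */
	VMOVU_MASK %VMM(0), (%rdi){%k1}	/* one masked store, no length branches */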

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
 sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
 sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
 sysdeps/x86_64/multiarch/strcat-strlen-evex.S |   85 ++
 sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
 sysdeps/x86_64/multiarch/strncat-evex.S       |  525 ++++++-
 sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
 .../multiarch/strncpy-or-cat-overflow-def.h   |   65 +
 7 files changed, 2075 insertions(+), 1173 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h

diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
index 99ea76a372..3693491baa 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
@@ -3,6 +3,5 @@
 #endif
 
 #define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY	STPNCPY
-#include "strcpy-evex.S"
+#define STRNCPY	STPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
index 0e2df947e9..b4207b7889 100644
--- a/sysdeps/x86_64/multiarch/strcat-evex.S
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -1,286 +1,7 @@
-/* strcat with 256-bit EVEX instructions.
-   Copyright (C) 2021-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-#  define STRCAT  __strcat_evex
-# endif
-
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
-/* zero register */
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMM0		ymm17
-# define YMM1		ymm18
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE	32
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRCAT)
-	mov	%rdi, %r9
-# ifdef USE_AS_STRNCAT
-	mov	%rdx, %r8
-# endif
-
-	xor	%eax, %eax
-	mov	%edi, %ecx
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-	cmp	$(VEC_SIZE * 3), %ecx
-	ja	L(fourth_vector_boundary)
-	vpcmpb	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_first_vector)
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	jmp	L(align_vec_size_start)
-L(fourth_vector_boundary):
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	vpcmpb	$0, (%rax), %YMMZERO, %k0
-	mov	$-1, %r10d
-	sub	%rax, %rcx
-	shl	%cl, %r10d
-	kmovd	%k0, %edx
-	and	%r10d, %edx
-	jnz	L(exit)
-
-L(align_vec_size_start):
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 4), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	kmovd	%k4, %edx
-	add	$(VEC_SIZE * 4), %rax
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 4), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 5), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	add	$VEC_SIZE, %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	add	$VEC_SIZE, %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
-	add	$VEC_SIZE, %rax
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	add	$VEC_SIZE, %rax
-
-	.p2align 4
-L(align_four_vec_loop):
-	VMOVA	(%rax), %YMM0
-	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
-	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
-	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
-	vpminub	%YMM0, %YMM1, %YMM0
-	/* If K0 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM0, %YMMZERO, %k0
-	add	$(VEC_SIZE * 4), %rax
-	ktestd	%k0, %k0
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
-	sub	$(VEC_SIZE * 5), %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit):
-	sub	%rdi, %rax
-L(exit_null_on_first_vector):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_second_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$VEC_SIZE, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_third_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 2), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fourth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 3), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fifth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-
-	.p2align 4
-L(StartStrcpyPart):
-	lea	(%r9, %rax), %rdi
-	mov	%rsi, %rcx
-	mov	%r9, %rax      /* save result */
-
-# ifdef USE_AS_STRNCAT
-	test	%r8, %r8
-	jz	L(ExitZero)
-#  define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-evex.S"
+#ifndef STRCAT
+# define STRCAT	__strcat_evex
 #endif
+
+#define USE_AS_STRCAT
+#define STRCPY	STRCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
new file mode 100644
index 0000000000..9bc777c339
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
@@ -0,0 +1,85 @@
+    /* Simple strlen implementation that ends at L(strcat_strlen_done).  */
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
+	movq	%rdi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+#ifdef USE_AS_WCSCPY
+	subl	%r8d, %edi
+	shrl	$2, %edi
+#endif
+	shrx	%VRDI, %VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+	movq	%rax, %rdi
+#endif
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v0)
+
+
+	VPCMPEQ	VEC_SIZE(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	leaq	(VEC_SIZE)(%r8), %rdi
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v1)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v2)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v3)
+
+	andq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4,, 8
+L(loop_2x_vec):
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(0)
+	VPMIN	(VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(2)
+	VPMIN	(VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	VPTESTN	%VMM(3), %VMM(3), %k3
+	subq	$(VEC_SIZE * -4), %rdi
+	KORTEST	%k1, %k3
+	jz	L(loop_2x_vec)
+
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v0)
+
+	KMOV	%k1, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v1)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v2)
+
+	KMOV	%k3, %VRCX
+L(bsf_and_done_v3):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v2):
+	bsf	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
+	jmp	L(strcat_strlen_done)
+
+	.p2align 4,, 4
+L(bsf_and_done_v1):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v0):
+	bsf	%VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#else
+	addq	%rcx, %rdi
+#endif
+L(strcat_strlen_done):
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
index 82e45ac675..1ba0195ed2 100644
--- a/sysdeps/x86_64/multiarch/strcpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -1,4 +1,4 @@
-/* strcpy with 256-bit EVEX instructions.
+/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
    Copyright (C) 2021-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,990 +17,526 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <isa-level.h>
-
 #if ISA_SHOULD_BUILD (4)
 
 
-# ifndef USE_AS_STRCAT
-#  include <sysdep.h>
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+	/* Use movsb in page cross case to save code size.  */
+# define USE_MOVSB_IN_PAGE_CROSS	1
 
-#  ifndef STRCPY
-#   define STRCPY  __strcpy_evex
-#  endif
+# include <sysdep.h>
 
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
-/* Number of bytes in a vector register */
-# ifndef VEC_SIZE
-#  define VEC_SIZE	32
+# ifndef STRCPY
+#  define STRCPY	__strcpy_evex
 # endif
 
-# define XMM2		xmm18
-# define XMM3		xmm19
 
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-# define YMM7		ymm23
+# ifdef USE_AS_WCSCPY
+#  define VMOVU_MASK	vmovdqu32
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
 
-# ifndef USE_AS_STRCAT
+#  define REP_MOVS	rep movsd
 
-/* zero register */
-#  define XMMZERO	xmm16
-#  define YMMZERO	ymm16
-#  define YMM1		ymm17
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRCPY)
-#  ifdef USE_AS_STRNCPY
-	mov	%RDX_LP, %R8_LP
-	test	%R8_LP, %R8_LP
-	jz	L(ExitZero)
-#  endif
-	mov	%rsi, %rcx
-#  ifndef USE_AS_STPCPY
-	mov	%rdi, %rax      /* save result */
-#  endif
+#  define USE_WIDE_CHAR
+# else
+#  define VMOVU_MASK	vmovdqu8
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
 
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+#  define REP_MOVS	rep movsb
 # endif
 
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	cmp	$(VEC_SIZE * 2), %ecx
-	jbe	L(SourceStringAlignmentLessTwoVecSize)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-
-	vpcmpb	$0, (%rsi), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	shr	%cl, %rdx
+# include "reg-macros.h"
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	mov	$VEC_SIZE, %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  else
-	mov	$(VEC_SIZE + 1), %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  endif
-	jbe	L(CopyVecSizeTailCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail)
-
-	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
-	kmovd	%k1, %edx
 
-# ifdef USE_AS_STRNCPY
-	add	$VEC_SIZE, %r10
-	cmp	%r10, %r8
-	jbe	L(CopyTwoVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize)
-
-	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
-	VMOVU	%YMM2, (%rdi)
-
-/* If source address alignment != destination address alignment */
-	.p2align 4
-L(UnalignVecSizeBoth):
-	sub	%rcx, %rdi
-# ifdef USE_AS_STRNCPY
-	add	%rcx, %r8
-	sbb	%rcx, %rcx
-	or	%rcx, %r8
-# endif
-	mov	$VEC_SIZE, %rcx
-	VMOVA	(%rsi, %rcx), %YMM2
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 3), %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STPCPY
+#  define END_REG	rax
 # else
-	jnz	L(CopyVecSize)
+#  define END_REG	rdi, %rdx, CHAR_SIZE
 # endif
 
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
+# ifdef USE_AS_STRCAT
+#  define PAGE_ALIGN_REG	edx
+#  define PAGE_ALIGN_REG_64	rdx
 # else
-	jnz	L(CopyVecSize)
+#  define PAGE_ALIGN_REG	eax
+#  define PAGE_ALIGN_REG_64	rax
 # endif
 
-	VMOVU	%YMM3, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
-	vpcmpb	$0, %YMM4, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
 
-	VMOVU	%YMM4, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
 
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
-	VMOVU	%YMM2, (%rdi, %rcx)
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
 
-	VMOVU	%YMM3, (%rdi, %rcx)
-	mov	%rsi, %rdx
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	and	$-(VEC_SIZE * 4), %rsi
-	sub	%rsi, %rdx
-	sub	%rdx, %rdi
-# ifdef USE_AS_STRNCPY
-	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
-	VMOVA	(%rsi), %YMM4
-	VMOVA	VEC_SIZE(%rsi), %YMM5
-	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
-	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
-	vpminub	%YMM5, %YMM4, %YMM2
-	vpminub	%YMM7, %YMM6, %YMM3
-	vpminub	%YMM2, %YMM3, %YMM2
-	/* If K7 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM2, %YMMZERO, %k7
-	kmovd	%k7, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+# ifdef USE_AS_STRCAT
+	movq	%rdi, %rax
+#  include "strcat-strlen-evex.S"
 # endif
-	test	%edx, %edx
-	jnz	L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
-	add	$(VEC_SIZE * 4), %rdi
-	add	$(VEC_SIZE * 4), %rsi
-	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
-	VMOVA	(%rsi), %YMM4
-	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
-	VMOVA	VEC_SIZE(%rsi), %YMM5
-	vpminub	%YMM5, %YMM4, %YMM2
-	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
-	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
-	VMOVU	%YMM7, -VEC_SIZE(%rdi)
-	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
-	vpminub	%YMM7, %YMM6, %YMM3
-	vpminub	%YMM2, %YMM3, %YMM2
-	/* If K7 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM2, %YMMZERO, %k7
-	kmovd	%k7, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
+
+	movl	%esi, %PAGE_ALIGN_REG
+	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 # endif
-	test	%edx, %edx
-	jz	L(UnalignedFourVecSizeLoop_start)
 
-L(UnalignedFourVecSizeLeave):
-	vpcmpb	$0, %YMM4, %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_0)
 
-	vpcmpb	$0, %YMM5, %YMMZERO, %k2
-	kmovd	%k2, %ecx
-	test	%ecx, %ecx
-	jnz	L(CopyVecSizeUnaligned_16)
+	/* Two short string implementations. One with traditional
+	   branching approach and one with masked instructions (which
+	   have potential for dramatically bad perf if dst splits a
+	   page and is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	VPTEST	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+#  ifdef USE_AS_WCSCPY
+	subl	$((1 << CHAR_PER_VEC)- 1), %VRCX
+#  else
+	inc	%VRCX
+#  endif
+	jz	L(more_1x_vec)
+	KMOV	%VRCX, %k1
+	KXOR	%k0, %k1, %k1
 
-	vpcmpb	$0, %YMM6, %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_32)
-
-	vpcmpb	$0, %YMM7, %YMMZERO, %k4
-	kmovd	%k4, %ecx
-	bsf	%ecx, %edx
-	VMOVU	%YMM4, (%rdi)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 3), %rsi
-	add	$(VEC_SIZE * 3), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
 
-/* If source address alignment == destination address alignment */
+#  ifdef USE_AS_STPCPY
+	bsf	%VRCX, %VRCX
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
+#  endif
+	ret
 
-L(SourceStringAlignmentLessTwoVecSize):
-	VMOVU	(%rsi), %YMM3
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
+# else
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$VEC_SIZE, %r8
+	xorl	%edx, %edx
+	bsf	%VRCX, %VRDX
+#  ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  endif
+
+	/* Use mask bits in rcx to detect which copy we need. If the low
+	   mask is zero then there must be a bit set in the upper half.
+	   I.e. if rcx != 0 and ecx == 0, then the match must be in the
+	   upper 32 bits, so we use L(copy_32_63).  */
+#  if VEC_SIZE == 64
+#   ifdef USE_AS_WCSCPY
+	testb	%cl, %cl
+#   else
+	testl	%ecx, %ecx
+#   endif
+	jz	L(copy_32_63)
+#  endif
+
+#  ifdef USE_AS_WCSCPY
+	testb	$0xf, %cl
 #  else
-	cmp	$(VEC_SIZE + 1), %r8
+	testw	%cx, %cx
 #  endif
-	jbe	L(CopyVecSizeTail1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail1)
+	jz	L(copy_16_31)
 
-	VMOVU	%YMM3, (%rdi)
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$(VEC_SIZE * 2), %r8
+#  ifdef USE_AS_WCSCPY
+	testb	$0x3, %cl
 #  else
-	cmp	$((VEC_SIZE * 2) + 1), %r8
+	testb	%cl, %cl
 #  endif
-	jbe	L(CopyTwoVecSize1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize1)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-	jmp	L(UnalignVecSizeBoth)
+	jz	L(copy_8_15)
 
-/*------End of main part with loops---------------------*/
 
-/* Case1 */
+#  ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	/* No need to copy, we know it's zero.  */
+	movl	$0, (%END_REG)
 
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
-	.p2align 4
-L(CopyVecSize):
-	add	%rcx, %rdi
-# endif
-L(CopyVecSizeTail):
-	add	%rcx, %rsi
-L(CopyVecSizeTail1):
-	bsf	%edx, %edx
-L(CopyVecSizeExit):
-	cmp	$32, %edx
-	jae	L(Exit32_63)
-	cmp	$16, %edx
-	jae	L(Exit16_31)
-	cmp	$8, %edx
-	jae	L(Exit8_15)
-	cmp	$4, %edx
-	jae	L(Exit4_7)
-	cmp	$3, %edx
-	je	L(Exit3)
-	cmp	$1, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	movb	$0, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$1, %r8
-	lea	1(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
 	ret
+#  else
 
-	.p2align 4
-L(CopyTwoVecSize1):
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$VEC_SIZE, %r8
-# endif
-	jmp	L(CopyVecSizeTail1)
-
-	.p2align 4
-L(CopyTwoVecSize):
-	bsf	%edx, %edx
-	add	%rcx, %rsi
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	jmp	L(CopyVecSizeExit)
-
-	.p2align 4
-L(CopyVecSizeUnaligned_0):
-	bsf	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM4, (%rdi)
-	add	$((VEC_SIZE * 4) - 1), %r8
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	jmp	L(CopyVecSizeExit)
-# endif
+	testb	$0x7, %cl
+	jz	L(copy_4_7)
 
-	.p2align 4
-L(CopyVecSizeUnaligned_16):
-	bsf	%ecx, %edx
-	VMOVU	%YMM4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	VEC_SIZE(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	add	$((VEC_SIZE * 3) - 1), %r8
-	sub	%rdx, %r8
-	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
 
-	.p2align 4
-L(CopyVecSizeUnaligned_32):
-	bsf	%edx, %edx
-	VMOVU	%YMM4, (%rdi)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	add	$((VEC_SIZE * 2) - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 2), %rsi
-	add	$(VEC_SIZE * 2), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
+	test	%edx, %edx
+	jz	L(set_null_term)
 
-# ifdef USE_AS_STRNCPY
-#  ifndef USE_AS_STRCAT
-	.p2align 4
-L(CopyVecSizeUnalignedVec6):
-	VMOVU	%YMM6, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec5):
-	VMOVU	%YMM5, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec4):
-	VMOVU	%YMM4, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec3):
-	VMOVU	%YMM3, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
+	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
+	 */
+	vmovd	%VMM_128(0), %esi
+	movw	%si, (%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	/* No need to copy, we know it's zero.  */
+	movb	$0, (%END_REG)
+	ret
 #  endif
 
-/* Case2 */
-
-	.p2align 4
-L(CopyVecSizeCase2):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTailCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTail1Case2):
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-/* Case2 or Case3,  Case3 */
-
-	.p2align 4
-L(CopyVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeCase2)
-L(CopyVecSizeCase3):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyTwoVecSizeCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyVecSizeTailCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTailCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSize1Case2OrCase3):
-	add	$VEC_SIZE, %rdi
-	add	$VEC_SIZE, %rsi
-	sub	$VEC_SIZE, %r8
-L(CopyVecSizeTail1Case2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTail1Case2)
-	jmp	L(StrncpyExit)
+#  if VEC_SIZE == 64
+	.p2align 4,, 6
+L(copy_32_63):
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
+	ret
+#  endif
+
+
+	.p2align 4,, 6
+L(copy_16_31):
+	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
+	   and will save code size.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
+	ret
+
+	.p2align 4,, 8
+L(copy_8_15):
+#  ifdef USE_AS_WCSCPY
+	movl	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+#  else
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
+#  endif
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
+	ret
 # endif
 
-/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
 
-	.p2align 4
-L(Exit1):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
+# ifndef USE_AS_WCSCPY
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%ecx, -(4 - CHAR_SIZE)(%END_REG)
+	ret
 # endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$2, %r8
-	lea	2(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rdi)
 # endif
-	ret
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+	addq	%rsi, %rdi
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
 
-	.p2align 4
-L(Exit2):
-	movzwl	(%rsi), %ecx
-	mov	%cx, (%rdi)
-	movb	$0, 2(%rdi)
+	/* Ideally we store after the moves to minimize the impact of
+	   potential false dependencies.  */
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rax)
+# endif
+
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+
+
+	/* Align for 4x loop.  */
+	subq	%rsi, %rdi
+
+	/* + VEC_SIZE * 5 because we never added the original VEC_SIZE
+	   we covered before aligning.  */
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$-(VEC_SIZE * 4), %rsi
+
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x0_end)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	/* Place L(ret_vec_x4) here to save code size.  We get a
+	   meaningful benefit doing this for stpcpy.  */
+	KMOV	%k4, %VRDX
+L(ret_vec_x3):
+	bsf	%VRDX, %VRDX
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$3, %r8
-	lea	3(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
 # endif
+L(return_end):
 	ret
 
-	.p2align 4
-L(Exit3):
-	mov	(%rsi), %edx
-	mov	%edx, (%rdi)
+	.p2align 4,, 6
+L(ret_vec_x0_end):
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_STPCPY
-	lea	3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$4, %r8
-	lea	4(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
+	inc	%VRCX
+	VMOVU	(-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
 	ret
 
-	.p2align 4
-L(Exit4_7):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	-3(%rsi, %rdx), %ecx
-	mov	%ecx, -3(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX
+	VMOVU	(VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit8_15):
-	mov	(%rsi), %rcx
-	mov	-7(%rsi, %rdx), %r9
-	mov	%rcx, (%rdi)
-	mov	%r9, -7(%rdi, %rdx)
+	.p2align 4,, 4
+L(ret_vec_x2):
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit16_31):
-	VMOVU	(%rsi), %XMM2
-	VMOVU	-15(%rsi, %rdx), %XMM3
-	VMOVU	%XMM2, (%rdi)
-	VMOVU	%XMM3, -15(%rdi, %rdx)
+	/* ret_vec_x3 reuses return code after the loop.  */
+	.p2align 4,, 6
+L(ret_vec_x4):
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub %rdx, %r8
-	sub $1, %r8
-	lea 1(%rdi, %rdx), %rdi
-	jnz L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit32_63):
-	VMOVU	(%rsi), %YMM2
-	VMOVU	-31(%rsi, %rdx), %YMM3
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, -31(%rdi, %rdx)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
+
+	.p2align 4,, 4
+L(page_cross):
+# ifndef USE_AS_STRCAT
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
 # endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	movq	%rsi, %rcx
+	andq	$(VEC_SIZE * -1), %rcx
+
+	VPCMPEQ	(%rcx), %VZERO, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSCPY
+	andl	$(VEC_SIZE - 1), %PAGE_ALIGN_REG
+	shrl	$2, %PAGE_ALIGN_REG
 # endif
-	ret
+	shrx	%VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
 
-# ifdef USE_AS_STRNCPY
+# if USE_MOVSB_IN_PAGE_CROSS
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
 
-	.p2align 4
-L(StrncpyExit1):
-	movzbl	(%rsi), %edx
-	mov	%dl, (%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 1(%rdi)
+	/* This adds one to the later result, which gives the correct
+	   copy bounds. NB: this can never zero out a non-zero RCX as,
+	   to be in the page cross case, rsi cannot be aligned and we
+	   already right-shifted rcx by the misalignment.  */
+	shl	%VRCX
+	jz	L(page_cross_continue)
+#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-	ret
+	bsf	%VRCX, %VRCX
+	REP_MOVS
 
-	.p2align 4
-L(StrncpyExit2):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
 #  ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 2(%rdi)
+	leaq	-CHAR_SIZE(%rdi), %rax
 #  endif
 	ret
 
-	.p2align 4
-L(StrncpyExit3_4):
-	movzwl	(%rsi), %ecx
-	movzwl	-2(%rsi, %r8), %edx
-	mov	%cx, (%rdi)
-	mov	%dx, -2(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
 
-	.p2align 4
-L(StrncpyExit5_8):
-	mov	(%rsi), %ecx
-	mov	-4(%rsi, %r8), %edx
-	mov	%ecx, (%rdi)
-	mov	%edx, -4(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
+# else
+	/* Check if we found zero-char before end of page.  */
+	test	%VRCX, %VRCX
+	jz	L(page_cross_continue)
 
-	.p2align 4
-L(StrncpyExit9_16):
-	mov	(%rsi), %rcx
-	mov	-8(%rsi, %r8), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
+	/* Traditional copy case, essentially the same as used in the
+	   non-page-cross case but since we can't reuse VMM(0) we need
+	   twice as many loads from rsi.  */
 
-	.p2align 4
-L(StrncpyExit17_32):
-	VMOVU	(%rsi), %XMM2
-	VMOVU	-16(%rsi, %r8), %XMM3
-	VMOVU	%XMM2, (%rdi)
-	VMOVU	%XMM3, -16(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
+#  ifndef USE_AS_STRCAT
+	xorl	%edx, %edx
 #  endif
-	ret
-
-	.p2align 4
-L(StrncpyExit33_64):
-	/*  0/32, 31/16 */
-	VMOVU	(%rsi), %YMM2
-	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
+	/* Dependency on rdi must already have been satisfied.  */
+	bsf	%VRCX, %VRDX
 #  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  elif !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
 
-	.p2align 4
-L(StrncpyExit65):
-	/* 0/32, 32/32, 64/1 */
-	VMOVU	(%rsi), %YMM2
-	VMOVU	32(%rsi), %YMM3
-	mov	64(%rsi), %cl
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, 32(%rdi)
-	mov	%cl, 64(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	65(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 65(%rdi)
+#  if VEC_SIZE == 64
+#   ifdef USE_AS_WCSCPY
+	testb	%cl, %cl
+#   else
+	test	%ecx, %ecx
+#   endif
+	jz	L(page_cross_copy_32_63)
 #  endif
-	ret
-
-#  ifndef USE_AS_STRCAT
 
-	.p2align 4
-L(Fill1):
-	mov	%dl, (%rdi)
-	ret
+#  ifdef USE_AS_WCSCPY
+	testb	$0xf, %cl
+#  else
+	testw	%cx, %cx
+#  endif
+	jz	L(page_cross_copy_16_31)
 
-	.p2align 4
-L(Fill2):
-	mov	%dx, (%rdi)
-	ret
+#  ifdef USE_AS_WCSCPY
+	testb	$0x3, %cl
+#  else
+	testb	%cl, %cl
+#  endif
+	jz	L(page_cross_copy_8_15)
 
-	.p2align 4
-L(Fill3_4):
-	mov	%dx, (%rdi)
-	mov     %dx, -2(%rdi, %r8)
+#  ifdef USE_AS_WCSCPY
+	movl	(%rsi), %esi
+	movl	%esi, (%rdi)
+	movl	$0, (%END_REG)
 	ret
+#  else
 
-	.p2align 4
-L(Fill5_8):
-	mov	%edx, (%rdi)
-	mov     %edx, -4(%rdi, %r8)
-	ret
+	testb	$0x7, %cl
+	jz	L(page_cross_copy_4_7)
 
-	.p2align 4
-L(Fill9_16):
-	mov	%rdx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
+	test	%edx, %edx
+	jz	L(page_cross_set_null_term)
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+L(page_cross_set_null_term):
+	movb	$0, (%END_REG)
 	ret
 
-	.p2align 4
-L(Fill17_32):
-	VMOVU	%XMMZERO, (%rdi)
-	VMOVU	%XMMZERO, -16(%rdi, %r8)
-	ret
 
-	.p2align 4
-L(CopyVecSizeUnalignedVec2):
-	VMOVU	%YMM2, (%rdi, %rcx)
-
-	.p2align 4
-L(CopyVecSizeVecExit):
-	bsf	%edx, %edx
-	add	$(VEC_SIZE - 1), %r8
-	add	%rcx, %rdi
-#   ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-#   endif
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-
-	.p2align 4
-L(StrncpyFillTailWithZero):
-	xor	%edx, %edx
-	sub	$VEC_SIZE, %r8
-	jbe	L(StrncpyFillExit)
-
-	VMOVU	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-
-	mov	%rdi, %rsi
-	and	$(VEC_SIZE - 1), %esi
-	sub	%rsi, %rdi
-	add	%rsi, %r8
-	sub	$(VEC_SIZE * 4), %r8
-	jb	L(StrncpyFillLessFourVecSize)
-
-L(StrncpyFillLoopVmovdqa):
-	VMOVA	%YMMZERO, (%rdi)
-	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
-	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
-	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE * 4), %rdi
-	sub	$(VEC_SIZE * 4), %r8
-	jae	L(StrncpyFillLoopVmovdqa)
-
-L(StrncpyFillLessFourVecSize):
-	add	$(VEC_SIZE * 2), %r8
-	jl	L(StrncpyFillLessTwoVecSize)
-	VMOVA	%YMMZERO, (%rdi)
-	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
-	add	$(VEC_SIZE * 2), %rdi
-	sub	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	VMOVA	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillLessTwoVecSize):
-	add	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	VMOVA	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillExit):
-	add	$VEC_SIZE, %r8
-L(Fill):
-	cmp	$17, %r8d
-	jae	L(Fill17_32)
-	cmp	$9, %r8d
-	jae	L(Fill9_16)
-	cmp	$5, %r8d
-	jae	L(Fill5_8)
-	cmp	$3, %r8d
-	jae	L(Fill3_4)
-	cmp	$1, %r8d
-	ja	L(Fill2)
-	je	L(Fill1)
+	.p2align 4,, 4
+L(page_cross_copy_4_7):
+	movl	(%rsi), %ecx
+	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, -(4 - CHAR_SIZE)(%END_REG)
 	ret
-
-/* end of ifndef USE_AS_STRCAT */
 #  endif
 
-	.p2align 4
-L(UnalignedLeaveCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(UnalignedFourVecSizeLeaveCase2)
-L(UnalignedFourVecSizeLeaveCase3):
-	lea	(VEC_SIZE * 4)(%r8), %rcx
-	and	$-VEC_SIZE, %rcx
-	add	$(VEC_SIZE * 3), %r8
-	jl	L(CopyVecSizeCase3)
-	VMOVU	%YMM4, (%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 4)(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (VEC_SIZE * 4)(%rdi)
-#  endif
+#  if VEC_SIZE == 64
+	.p2align 4,, 4
+L(page_cross_copy_32_63):
+	VMOVU	(%rsi), %VMM_256(0)
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
 	ret
-
-	.p2align 4
-L(UnalignedFourVecSizeLeaveCase2):
-	xor	%ecx, %ecx
-	vpcmpb	$0, %YMM4, %YMMZERO, %k1
-	kmovd	%k1, %edx
-	add	$(VEC_SIZE * 3), %r8
-	jle	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-	vpcmpb	$0, %YMM5, %YMMZERO, %k2
-	kmovd	%k2, %edx
-	VMOVU	%YMM4, (%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec5)
-#  else
-	jnz	L(CopyVecSize)
 #  endif
 
-	vpcmpb	$0, %YMM6, %YMMZERO, %k3
-	kmovd	%k3, %edx
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec6)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-
-	vpcmpb	$0, %YMM7, %YMMZERO, %k4
-	kmovd	%k4, %edx
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	lea	VEC_SIZE(%rdi, %rcx), %rdi
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-L(StrncpyExit):
-	cmp	$65, %r8d
-	je	L(StrncpyExit65)
-	cmp	$33, %r8d
-	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8d
-	jae	L(StrncpyExit17_32)
-	cmp	$9, %r8d
-	jae	L(StrncpyExit9_16)
-	cmp	$5, %r8d
-	jae	L(StrncpyExit5_8)
-	cmp	$3, %r8d
-	jae	L(StrncpyExit3_4)
-	cmp	$1, %r8d
-	ja	L(StrncpyExit2)
-	je	L(StrncpyExit1)
-#  ifdef USE_AS_STPCPY
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi)
-#  endif
+	.p2align 4,, 4
+L(page_cross_copy_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
 	ret
 
-	.p2align 4
-L(ExitZero):
-#  ifndef USE_AS_STRCAT
-	mov	%rdi, %rax
-#  endif
+	.p2align 4,, 4
+L(page_cross_copy_8_15):
+	movq	(%rsi), %rcx
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+	movq	%rcx, (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%END_REG)
 	ret
-
-# endif
-
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
 # endif
+END(STRCPY)
 #endif
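
For reference, the short-string paths above (L(copy_4_7), L(copy_8_15),
L(copy_16_31) and, for VEC_SIZE == 64, L(copy_32_63)) all rely on the same
overlapping head/tail trick.  Below is a minimal C sketch of that trick,
illustrative only: the helper name is made up and memcpy stands in for the
fixed-size loads and stores.

#include <stddef.h>
#include <string.h>

/* Copy a string whose total length LEN (null terminator included) is
   known to satisfy CHUNK < LEN <= 2 * CHUNK.  One fixed-size copy from
   the start plus one fixed-size copy ending at the terminator cover
   the whole string; the two regions are allowed to overlap.  */
static void
copy_head_tail (char *dst, const char *src, size_t len, size_t chunk)
{
  memcpy (dst, src, chunk);				/* bytes [0, chunk)  */
  memcpy (dst + len - chunk, src + len - chunk, chunk);	/* bytes [len - chunk, len)  */
}

In the assembly each memcpy is a single 4/8/16/32-byte load and store, so
every size bin is handled branch-free with two loads and two stores.
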
diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
index 203a19bf21..d648ba5cfe 100644
--- a/sysdeps/x86_64/multiarch/strncat-evex.S
+++ b/sysdeps/x86_64/multiarch/strncat-evex.S
@@ -1,7 +1,520 @@
-#ifndef STRNCAT
-# define STRNCAT	__strncat_evex
-#endif
+/* {wcs|str}ncat  with 256/512-bit EVEX.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+# ifndef STRNCAT
+#  define STRNCAT	__strncat_evex
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+#  define movNULL	movl
+#  define VMOVU_MASK	vmovdqu32
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+
+#  define REP_MOVS	rep movsd
+
+#  define VMASK_REG	VR10
+#  define FIND_FIRST_ONE(src, dst)	movl $CHAR_PER_VEC, %dst; bsf %src, %dst
+
+#  define USE_WIDE_CHAR
+# else
+#  define movNULL	movb
+#  define VMOVU_MASK	vmovdqu8
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+
+#  define REP_MOVS	rep movsb
+
+#  define VMASK_REG	VRCX
+#  define FIND_FIRST_ONE(src, dst)	tzcnt %src, %dst
+
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# include "reg-macros.h"
+
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+	movq	%rdi, %rax
+
+	/* NB: It's safe to filter out zero-length strings WITHOUT
+	   setting null-term. Destination MUST be a null-terminated
+	   string so essentially the work is already done.  */
+# ifdef USE_AS_WCSCPY
+	leaq	-1(%rdx), %rcx
+	shrq	$56, %rcx
+	jnz	L(zero_len)
+# else
+	test	%rdx, %rdx
+	jle	L(zero_len)
+# endif
+
+# include "strcat-strlen-evex.S"
+
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPTESTN	%VMM(0), %VMM(0), %k0
+
+	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
+	   length <= CHAR_PER_VEC with masked instructions (which have
+	   the potential for dramatically bad perf if dst splits a page
+	   and is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	KMOV	%k0, %VRCX
+	FIND_FIRST_ONE (VRCX, VR8)
+	cmpq	%r8, %rdx
+	jbe	L(less_1x_vec)
+
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
+
+	blsmsk	%VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
+	ret
+
+L(less_1x_vec):
+	mov	$-1, %VRCX
+	bzhi	%VRDX, %VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
+
+	ret
+# else
+	KMOV	%k0, %VMASK_REG
+	/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
+	   %VMASK_REG, %VRCX` for wcsncat.  */
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpq	%rcx, %rdx
+	jbe	L(less_1x_vec)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	je	L(more_1x_vec)
+
+	movl	%ecx, %edx
+
+L(less_1x_vec):
+#  if VEC_SIZE == 64
+	cmpl	$(32 / CHAR_SIZE), %edx
+	jae	L(copy_32_63)
+#  endif
+
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(copy_16_31)
+
+
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jae	L(copy_8_15)
+
+#  ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  else
+
+	cmpl	$4, %edx
+	jae	L(copy_4_7)
+
+	movzbl	(%rsi), %ecx
+	cmpl	$1, %edx
+	jbe	L(set_null_term)
+
+	movzwl	1(%rsi), %esi
+	movw	%si, 1(%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	movb	%cl, (%rdi)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+#  endif
+
+#  if VEC_SIZE == 64
+	.p2align 4,, 6
+L(copy_32_63):
+	VMOVU	-(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  endif
+	.p2align 4,, 6
+L(copy_16_31):
+	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
+	   and will save code size.  */
+	vmovdqu	-(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 2
+L(copy_8_15):
+	movq	-(8)(%rsi, %rdx, CHAR_SIZE), %rcx
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+
+#  ifndef USE_AS_WCSCPY
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-(4)(%rsi, %rdx, CHAR_SIZE), %ecx
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  endif
+
+# endif
+	.p2align 4,, 4
+L(zero_len):
+# ifdef USE_AS_WCSCPY
+	test	%rdx, %rdx
+# endif
+	jne	OVERFLOW_STRCAT
+	ret
 
-#define USE_AS_STRNCAT
-#define STRCAT	STRNCAT
-#include "strcat-evex.S"
+	.p2align 4,, 8
+L(more_1x_vec):
+	VMOVU	%VMM(0), (%rdi)
+
+	/* We are going to align rsi here so we will need to be able to
+	   re-adjust rdi/rdx afterwards. NB: We filtered out huge
+	   lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
+
+	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+L(loop_last_4x_vec):
+	addq	%rsi, %rdi
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+
+	/* Will need this regardless.  */
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VMASK_REG
+
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	jne	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	addl	$-CHAR_PER_VEC, %edx
+	bzhi	%VRDX, %VRCX, %VR8
+	jz	L(ret_vec_x2_len)
+L(ret_vec_x2):
+	bsf	%VRCX, %VRDX
+L(ret_vec_x2_len):
+	VMOVU	(VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 4
+L(ret_vec_x1_len):
+	movl	%edx, %ecx
+L(ret_vec_x1):
+	VMOVU	(VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	movNULL	$0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	addl	$-(CHAR_PER_VEC * 4), %edx
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VMASK_REG
+	subq	$-(VEC_SIZE * 4), %rsi
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+# ifdef USE_AS_WCSCPY
+	xorl	%ecx, %ecx
+# endif
+	bsf	%VMASK_REG, %VRCX
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VMASK_REG
+
+	cmpq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(more_4x_vec)
+
+	/* Adjust length before going to L(ret_vec_x3_len) or
+	   L(ret_vec_x3).  */
+	addl	$(CHAR_PER_VEC * -2), %edx
+
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	jne	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	addl	$-CHAR_PER_VEC, %edx
+	bzhi	%VRDX, %VRCX, %VR8
+	jz	L(ret_vec_x4_len)
+L(ret_vec_x4):
+	bsf	%VRCX, %VRDX
+L(ret_vec_x4_len):
+	VMOVU	(VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 4
+L(ret_vec_x3_len):
+	movl	%edx, %ecx
+L(ret_vec_x3):
+	VMOVU	(VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+# ifdef USE_AS_WCSCPY
+	xorl	%ecx, %ecx
+# endif
+	bsf	%VMASK_REG, %VRCX
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+
+	/* Check if we are near the end before aligning.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rdx
+	jbe	L(last_4x_vec)
+
+
+	/* Add rsi to rdx (length) before aligning rsi. NB: Since we
+	   filtered out huge lengths this cannot overflow.  */
+# ifdef USE_AS_WCSCPY
+	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rsi, %rdx
+# endif
+
+	/* Subtract rsi from rdi before aligning (adding it back gives
+	   the correct rdi for the aligned rsi).  */
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+
+	/* Offset rsi by VEC_SIZE so that we can jump to
+	   L(loop_last_4x_vec).  */
+	addq	$-(VEC_SIZE), %rsi
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	/* Store loop end in r9.  */
+	leaq	-(VEC_SIZE * 5)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+
+	/* L(ret_vec_x1) expects rcx to hold the position of the
+	   zero-CHAR so test with bsf.  */
+	bsf	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
+
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	bsf	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+
+	KMOV	%k4, %VRCX
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %k0
+
+# ifdef USE_AS_WCSCPY
+	KMOV	%k0, %VR9
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+	shrx	%VRCX, %VR9, %VRCX
+# else
+	KMOV	%k0, %VRCX
+	shrx	%VRSI, %VRCX, %VRCX
+# endif
+
+	subl	%esi, %r8d
+	andl	$(VEC_SIZE - 1), %r8d
+# ifdef USE_AS_WCSCPY
+	shrl	$2, %r8d
+# endif
+	cmpq	%r8, %rdx
+	jbe	L(page_cross_small)
+	/* Optimizing more for space as this is very cold code. This
+	   saves 2x cache lines.  */
+
+	/* This adds one to the later result, which gives the correct
+	   copy bounds. NB: this can never zero out a non-zero RCX as,
+	   to be in the page cross case, rsi cannot be aligned and we
+	   already right-shifted rcx by the misalignment.  */
+	shl	%VRCX
+	jz	L(page_cross_continue)
+	bsf	%VRCX, %VRCX
+	REP_MOVS
+	ret
+
+L(page_cross_small):
+	tzcnt	%VRCX, %VRCX
+	jz	L(page_cross_setz)
+	cmpl	%edx, %ecx
+	cmova	%edx, %ecx
+
+# ifdef USE_AS_WCSCPY
+	rep	movsd
+# else
+	rep	movsb
+# endif
+L(page_cross_setz):
+	movNULL	$0, (%rdi)
+	ret
+END(STRNCAT)
+#endif
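
As a reference point for the control flow above, here is a rough C model of
what the EVEX strncat computes.  It is illustrative only: the function name
is made up, and strlen/strnlen/memcpy stand in for strcat-strlen-evex.S and
the vectorized scan and copy paths.

#include <stddef.h>
#include <string.h>

static char *
strncat_model (char *dst, const char *src, size_t n)
{
  char *end = dst + strlen (dst);	/* find end of dst (strcat-strlen-evex.S).  */
  size_t len = strnlen (src, n);	/* stop at the null or at n CHARs.  */
  memcpy (end, src, len);		/* bulk copy (vectorized above).  */
  end[len] = '\0';			/* the null terminator is always written.  */
  return dst;
}
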
diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
index 1b3426d511..49eaf4cbd9 100644
--- a/sysdeps/x86_64/multiarch/strncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
@@ -1,7 +1,990 @@
-#ifndef STRNCPY
-# define STRNCPY	__strncpy_evex
-#endif
+/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+
+
+# include <sysdep.h>
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNCPY
+#  define STRNCPY	__strncpy_evex
+# endif
+
+# ifdef USE_AS_WCSCPY
+#  define VMOVU_MASK	vmovdqu32
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define CHAR_SIZE	4
+
+#  define REP_MOVS	rep movsd
+#  define REP_STOS	rep stosl
+
+#  define USE_WIDE_CHAR
+
+# else
+#  define VMOVU_MASK	vmovdqu8
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define CHAR_SIZE	1
+
+#  define REP_MOVS	rep movsb
+#  define REP_STOS	rep stosb
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+
+# define VZERO	VMM(7)
+# define VZERO_256	VMM_256(7)
+# define VZERO_128	VMM_128(7)
+
+# if VEC_SIZE == 64
+#  define VZERO_HALF	VZERO_256
+# else
+#  define VZERO_HALF	VZERO_128
+# endif
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+	/* Filter zero-length strings and very long strings.  Zero-length
+	   strings just return; very long strings are handled by just
+	   running rep stos{b|l} to zero-fill the destination (which will
+	   almost certainly segfault), and if that succeeds then calling
+	   OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
+# ifdef USE_AS_WCSCPY
+	decq	%rdx
+	movq	%rdx, %rax
+	/* 56 is end of max supported address space.  */
+	shr	$56, %rax
+	jnz	L(zero_len)
+# else
+	decq	%rdx
+	/* If the branch below needs to become `jb`, replace `dec` with
+	   `sub` so that CF is set.  */
+	jl	L(zero_len)
+# endif
+
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
+	movl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+
+	/* If no STPCPY just save end ahead of time.  */
+# ifndef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+
+
+	cmpq	$(CHAR_PER_VEC), %rdx
+
+	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
+	   length <= CHAR_PER_VEC with masked instructions (which have
+	   the potential for dramatically bad perf if dst splits a page
+	   and is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	/* `jae` because length rdx is now length - 1.  */
+	jae	L(more_1x_vec)
+
+	/* If there were multiple zero-CHAR matches in the first VEC,
+	   VRCX will be overset but that's fine since any oversets were
+	   at zero positions anyway.  */
+
+#  ifdef USE_AS_STPCPY
+	tzcnt	%VRCX, %VRAX
+	cmpl	%eax, %edx
+	cmovb	%edx, %eax
+#   ifdef USE_AS_WCSCPY
+	adcl	$0, %eax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#   else
+	adcq	%rdi, %rax
+#   endif
+#  endif
+	dec	%VRCX
+
+	/* Zero out all non-zero CHAR's after the first zero match.  */
+	KMOV	%VRCX, %k1
+
+	/* Use VZERO as destination so this can be reused for
+	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
+	   will have zeroed out VZERO).  */
+	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
+L(zfill_less_vec):
+	/* Get mask for what we need to set.  */
+	incl	%edx
+	mov	$-1, %VRCX
+	bzhi	%VRDX, %VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	VMOVU_MASK %VZERO, (%rdi){%k1}
+	ret
+
+	.p2align 4,, 4
+L(zero_len):
+	cmpq	$-1, %rdx
+	jne	L(best_effort_strncpy)
+	movq	%rdi, %rax
+	ret
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# else
+	/* `jb` because length rdx is now length - 1.  */
+	jb	L(less_1x_vec)
+# endif
+
+
+	/* This may overset but that's fine because we still need to
+	   zero-fill.  */
+	VMOVU	%VMM(0), (%rdi)
+
+
+	/* Length must be >= CHAR_PER_VEC so a match here means we must
+	   zero-fill.  */
+	test	%VRCX, %VRCX
+	jnz	L(zfill)
+
+
+	/* We are going to align rsi here so we will need to be able to
+	   re-adjust rdi/rdx afterwards. NB: We filtered out huge
+	   lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
+	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+
+L(loop_last_4x_vec):
+	addq	%rsi, %rdi
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+
+	/* -1 because of the `dec %rdx` earlier.  */
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	/* This will need to be computed no matter what. We do it ahead
+	   of time for CHAR_PER_VEC == 64 because we can't adjust the
+	   value of `tzcnt` with a shift.  */
+# if CHAR_PER_VEC == 64
+	tzcntq	%rcx, %rcx
+# endif
+
+	cmpl	$(CHAR_PER_VEC), %edx
+	jb	L(ret_vec_x1_len)
+
+	/* Separate logic for CHAR_PER_VEC == 64 because we already did
+	   `tzcnt` on VRCX.  */
+# if CHAR_PER_VEC == 64
+	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
+	cmpb	$CHAR_PER_VEC, %cl
+	jnz	L(ret_vec_x1_no_bsf)
+# else
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+# endif
+
+
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %VZERO, %k0
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	KMOV	%k0, %VRCX
+
+# if CHAR_PER_VEC < 64
+	/* This essentially adds CHAR_PER_VEC to the computed result.  */
+	shlq	$CHAR_PER_VEC, %rcx
+# else
+	tzcntq	%rcx, %rcx
+	addl	$CHAR_PER_VEC, %ecx
+# endif
+
+	.p2align 4,, 4
+L(ret_vec_x1_len):
+	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
+	   already been done.  */
+# if CHAR_PER_VEC < 64
+	tzcntq	%rcx, %rcx
+# endif
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len_no_zfill)
+	/* The expected fall-through case is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x1_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+	VMOVU	((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+	.p2align 4,, 10
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX
+L(ret_vec_x1_no_bsf):
+	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	subl	%ecx, %edx
+	cmpl	$CHAR_PER_VEC, %edx
+	jb	L(ret_vec_x1_len_no_zfill_mov)
+	/* The expected fall-through case is copy len < buffer len.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
+	   $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
+	   using `movzbl`.  */
+# if CHAR_PER_VEC == 64
+	movzbl	%dl, %edx
+# else
+	andl	$(CHAR_PER_VEC * 4 - 1), %edx
+# endif
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+	subq	$-(VEC_SIZE * 4), %rsi
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2 - 1), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	test	%VRCX, %VRCX
+	/* Must fill at least 2x VEC.  */
+	jnz	L(zfill_vec1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	/* Must fill at least 1x VEC.  */
+	jnz	L(zfill_vec2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRCX
+
+	/* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rdx
+	ja	L(more_4x_vec)
+
+	subl	$(CHAR_PER_VEC * 3), %edx
+	jb	L(ret_vec_x3_len)
+
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rsi), %VZERO, %k0
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	KMOV	%k0, %VRCX
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x4_len_no_zfill)
+	/* The expected fall-through case is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	movl	%ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 4 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+L(ret_vec_x3_len):
+	addl	$(CHAR_PER_VEC * 1), %edx
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len_no_zfill)
+	/* The expected fall-through case is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x3_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+	.p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 3 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsf	%VRCX, %VRCX
+	VMOVU	%VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
+	subl	%ecx, %edx
+	jl	L(ret_vec_x3_len_no_zfill_mov)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec4)
 
-#define USE_AS_STRNCPY
-#define STRCPY	STRNCPY
-#include "strcpy-evex.S"
+	/* Recheck length before aligning.  */
+	cmpq	$(CHAR_PER_VEC * 8 - 1), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
+# ifdef USE_AS_WCSCPY
+	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rsi, %rdx
+# endif
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+
+
+	/* Offset rsi by VEC_SIZE so that we can jump to
+	   L(loop_last_4x_vec).  */
+	addq	$-(VEC_SIZE), %rsi
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	/* Store loop end in r9.  */
+	leaq	-(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	/* Restore rdx (length).  */
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec1)
+
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec2)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec3)
+
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
+	KMOV	%k4, %VRCX
+	/* Fall through into L(zfill_vec4) to zero-fill the rest.  */
+
+	.p2align 4,, 4
+L(zfill_vec4):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -2), %rdx
+L(zfill_vec2):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -1), %rdx
+L(zfill):
+	/* VRCX must be non-zero.  */
+	bsf	%VRCX, %VRCX
+
+	/* Adjust length / dst for zfill.  */
+	subq	%rcx, %rdx
+# ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+	addq	%rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_from_page_cross):
+
+	/* From here on out it's just memset(rdi, 0, rdx).  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jb	L(zfill_less_vec)
+
+L(zfill_more_1x_vec):
+	VMOVU	%VZERO, (%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
+	ja	L(zfill_more_2x_vec)
+L(zfill_done0):
+	ret
+
+	/* Coming from vec1/vec2 we must be able to zfill at least 2x
+	   VEC.  */
+	.p2align 4,, 8
+L(zfill_vec3):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -2), %rdx
+	.p2align 4,, 2
+L(zfill_vec1):
+	bsfq	%rcx, %rcx
+	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
+	 */
+	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+
+
+	VMOVU	%VZERO, (%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jb	L(zfill_done0)
+L(zfill_more_2x_vec):
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VZERO, (VEC_SIZE)(%rdi)
+	subq	$(CHAR_PER_VEC * 4 - 1), %rdx
+	jbe	L(zfill_done)
+
+# ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rdi, %rdx
+# endif
+
+	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi)
+
+
+	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	jbe	L(zfill_done)
+
+	/* Align rdi for the zfill loop.  */
+	andq	$-(VEC_SIZE), %rdi
+	.p2align 4,, 12
+L(zfill_loop_4x_vec):
+	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	ja	L(zfill_loop_4x_vec)
+L(zfill_done):
+	ret
+
+
+	/* Less than 1x VEC case if we are not using evex masked store.  */
+# if !USE_EVEX_MASKED_STORE
+	.p2align 4,, 8
+L(copy_1x):
+	/* Special case for copy 1x. It can be handled quickly and many
+	   buffer sizes have convenient alignment.  */
+	VMOVU	%VMM(0), (%rdi)
+	/* If no zeros then we are done.  */
+	testl	%ecx, %ecx
+	jz	L(ret_1x_1x)
+
+	/* Need to zfill; note we know that length <= CHAR_PER_VEC so we
+	   only handle the small case here.  */
+	bsf	%VRCX, %VRCX
+L(zfill_less_vec_no_bsf):
+	/* Adjust length / dst then just zfill less_vec.  */
+	subq	%rcx, %rdx
+#  ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+
+L(zfill_less_vec):
+	cmpl	$((VEC_SIZE / 2) / CHAR_SIZE), %edx
+	jb	L(zfill_less_half)
+
+	VMOVU	%VZERO_HALF, (%rdi)
+	VMOVU	%VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	ret
+#  ifdef USE_AS_STPCPY
+L(ret_1x_1x):
+	leaq	CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
+	ret
+#  endif
+
+
+#  if VEC_SIZE == 64
+	.p2align 4,, 4
+L(copy_32_63):
+	/* Overfill to avoid branches.  */
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+
+	/* We are taking advantage of the fact that to be here we must
+	   be writing the null-term at (%rdi, %rcx), so we have a byte
+	   of leeway for overwriting.  */
+	cmpl	%ecx, %edx
+	ja	L(zfill_less_vec_no_bsf)
+#   ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+#   else
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  endif
+
+	.p2align 4,, 4
+L(copy_16_31):
+	/* Overfill to avoid branches.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpl	%ecx, %edx
+
+	/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
+	   we have a larger copy block for 32-63 so this just falls
+	   through to zfill 16-31. If VEC_SIZE == 32 then we check for
+	   a full zfill of less than 1x VEC.  */
+#  if VEC_SIZE == 64
+	jbe	L(ret_16_31)
+	subl	%ecx, %edx
+#   ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#   else
+	addq	%rcx, %rdi
+#   endif
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+L(zfill_less_half):
+L(zfill_less_32):
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jb	L(zfill_less_16)
+	VMOVU	%VZERO_128, (%rdi)
+	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+#   ifdef USE_AS_STPCPY
+	ret
+#   endif
+L(ret_16_31):
+#   ifdef USE_AS_STPCPY
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  else
+	/* VEC_SIZE == 32 begins.  */
+	ja	L(zfill_less_vec_no_bsf)
+#   ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+#   else
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  endif
+
+
+	.p2align 4,, 4
+L(copy_8_15):
+	/* Overfill to avoid branches.  */
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpl	%ecx, %edx
+	jbe	L(ret_8_15)
+	subl	%ecx, %edx
+#  ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+	.p2align 4,, 8
+#  if VEC_SIZE == 32
+L(zfill_less_half):
+#  endif
+L(zfill_less_16):
+	xorl	%ecx, %ecx
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jb	L(zfill_less_8)
+	movq	%rcx, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+#  ifndef USE_AS_STPCPY
+L(ret_8_15):
+#  endif
+	ret
+
+	.p2align 4,, 8
+L(less_1x_vec):
+	je	L(copy_1x)
+
+	/* We will need `tzcnt` result for all other copy sizes.  */
+	tzcnt	%VRCX, %VRCX
+#  if VEC_SIZE == 64
+	cmpl	$(32 / CHAR_SIZE), %edx
+	jae	L(copy_32_63)
+#  endif
+
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(copy_16_31)
+
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jae	L(copy_8_15)
+#  ifdef USE_AS_WCSCPY
+	testl	%ecx, %ecx
+	jz	L(zfill_less_8_set_ret)
+
+	movl	(%rsi, %rdx, CHAR_SIZE), %esi
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%esi, (%rdi, %rdx, CHAR_SIZE)
+#   ifdef USE_AS_STPCPY
+	cmpl	%ecx, %edx
+L(ret_8_15):
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#   endif
+	ret
+L(zfill_less_8_set_ret):
+	xorl	%ecx, %ecx
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+L(zfill_less_8):
+	movl	%ecx, (%rdi)
+	movl	%ecx, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  else
+	cmpl	$3, %edx
+	jb	L(copy_0_3)
+	/* Overfill to avoid branches.  */
+	movl	-3(%rsi, %rdx), %esi
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%esi, -3(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_4_7)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+	xorl	%ecx, %ecx
+	.p2align 4,, 8
+L(zfill_less_8):
+	cmpl	$3, %edx
+	jb	L(zfill_less_3)
+	movl	%ecx, (%rdi)
+	movl	%ecx, -3(%rdi, %rdx)
+#   ifdef USE_AS_STPCPY
+	ret
+#   endif
+
+L(ret_4_7):
+#   ifdef USE_AS_STPCPY
+L(ret_8_15):
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#   endif
+	ret
+
+	.p2align 4,, 4
+L(zfill_less_3):
+	testl	%edx, %edx
+	jz	L(zfill_1)
+	movw	%cx, (%rdi)
+L(zfill_1):
+	movb	%cl, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_0_3):
+	vmovd	%VMM_128(0), %r8d
+	testl	%edx, %edx
+	jz	L(copy_1)
+	movw	%r8w, (%rdi)
+	cmpl	%ecx, %edx
+	ja	L(zfill_from_1)
+	movzbl	(%rsi, %rdx), %r8d
+#   ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+	movb	%r8b, (%rdi, %rdx)
+	ret
+#   endif
+
+L(copy_1):
+#   ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	cmpl	%ecx, %edx
+	adcq	%rdi, %rax
+#   endif
+#   ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+#   else
+	movb	%r8b, (%rdi, %rdx)
+#   endif
+	ret
+#  endif
+
+
+#  ifndef USE_AS_WCSCPY
+	.p2align 4,, 8
+L(zfill_from_1):
+#   ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+#   endif
+	movw	$0, -1(%rdi, %rdx)
+	ret
+#  endif
+
+	.p2align 4,, 4
+L(zero_len):
+	incq	%rdx
+	jne	L(best_effort_strncpy)
+	movq	%rdi, %rax
+	ret
+# endif
+
+
+	.p2align 4,, 4
+	.p2align 6,, 8
+L(page_cross):
+	movq	%rsi, %rax
+	andq	$(VEC_SIZE * -1), %rax
+	VPCMPEQ	(%rax), %VZERO, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSCPY
+	movl	%esi, %r8d
+	shrl	$2, %r8d
+	andl	$(CHAR_PER_VEC - 1), %r8d
+	shrx	%VR8, %VRCX, %VRCX
+# else
+	shrx	%VRSI, %VRCX, %VRCX
+# endif
+
+	/* Compute the number of bytes we checked.  */
+	subl	%esi, %eax
+	andl	$(VEC_SIZE - 1), %eax
+# ifdef USE_AS_WCSCPY
+	shrl	$2, %eax
+# endif
+
+	/* If rax > rdx then we are finishing the copy at the end of the
+	   page.  */
+	cmpq	%rax, %rdx
+	jb	L(page_cross_small)
+
+
+	/* If rcx is non-zero then continue.  */
+	test	%VRCX, %VRCX
+	jz	L(page_cross_continue)
+
+	/* We found a zero CHAR so we need to copy then zfill (we know
+	   we didn't cover all of the length here).  */
+	bsf	%VRCX, %VRCX
+L(movsb_and_zfill):
+	incl	%ecx
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	leaq	-CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
+# else
+	movq	%rdi, %rax
+# endif
+
+	REP_MOVS
+# ifdef USE_AS_WCSCPY
+	movl	$0, (%rdi)
+# else
+	movb	$0, (%rdi)
+# endif
+	jmp	L(zfill_from_page_cross)
+
+L(page_cross_small):
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(page_cross_copy_only)
+
+	/* Do a zfill of the tail before copying.  */
+	movq	%rdi, %r9
+	xorl	%eax, %eax
+
+	movl	%ecx, %r8d
+
+	subl	%ecx, %edx
+	leaq	CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+	movl	%edx, %ecx
+	REP_STOS
+	movq	%r9, %rdi
+	movl	%r8d, %edx
+L(page_cross_copy_only):
+	leal	1(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcl	$0, %edx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# else
+	movq	%rdi, %rax
+# endif
+	REP_MOVS
+	ret
+
+
+L(best_effort_strncpy):
+	movq	%rdx, %rcx
+	xorl	%eax, %eax
+	movq	%rdi, %r8
+	/* The length is >= 2^63.  We very much expect the rep stos to
+	   segfault.  If that doesn't happen then just strcpy to finish.
+	 */
+	REP_STOS
+	movq	%r8, %rdi
+	jmp	OVERFLOW_STRCPY
+END(STRNCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
new file mode 100644
index 0000000000..d5ff4cbe50
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
@@ -0,0 +1,65 @@
+#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
+#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
+
+#if defined USE_MULTIARCH && IS_IN(libc)
+#  define UNDERSCORES __
+#  ifdef USE_WITH_SSE2
+#    define ISA_EXT _sse2
+#  elif defined USE_WITH_AVX
+#    ifdef USE_WITH_RTM
+#      define ISA_EXT _avx_rtm
+#    else
+#      define ISA_EXT _avx
+#    endif
+#  elif defined USE_WITH_AVX2
+#    ifdef USE_WITH_RTM
+#      define ISA_EXT _avx2_rtm
+#    else
+#      define ISA_EXT _avx2
+#    endif
+
+#  elif defined USE_WITH_EVEX256
+#    define ISA_EXT _evex
+#  elif defined USE_WITH_EVEX512
+#    define ISA_EXT _evex512
+#  endif
+#else
+#  define UNDERSCORES
+#  define ISA_EXT
+#endif
+
+#ifdef USE_AS_WCSCPY
+#  define STRCPY_PREFIX wc
+#  define STRCAT_PREFIX wcs
+#  ifdef USE_AS_STPCPY
+#    define STRCPY_POSTFIX pcpy
+#  else
+#    define STRCPY_POSTFIX scpy
+#  endif
+#else
+#  define STRCPY_PREFIX st
+#  define STRCAT_PREFIX str
+#  ifdef USE_AS_STPCPY
+#    define STRCPY_POSTFIX pcpy
+#  else
+#    define STRCPY_POSTFIX rcpy
+#  endif
+#endif
+#define STRCAT_POSTFIX cat
+
+#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
+  underscores##prefix##postfix##ext
+
+#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
+
+#ifndef OVERFLOW_STRCPY
+#  define OVERFLOW_STRCPY                                                     \
+    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
+#endif
+
+#ifndef OVERFLOW_STRCAT
+#  define OVERFLOW_STRCAT                                                     \
+    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
+#endif
+
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v2 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions
  2022-11-04  8:20 ` [PATCH v2 " Noah Goldstein
  2022-11-04  8:20   ` [PATCH v2 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
@ 2022-11-04  8:20   ` Noah Goldstein
  2022-11-04 16:45     ` H.J. Lu
  2022-11-04  8:20   ` [PATCH v2 4/4] x86: Add optimized functions for the wide-character strcpy family Noah Goldstein
  2022-11-04 16:26   ` [PATCH v2 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json H.J. Lu
  3 siblings, 1 reply; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04  8:20 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
    1. Use more overlapping stores to avoid branches (see the sketch
       after this list).
    2. Reduce how unrolled the aligning copies are (this is more of a
       code-size save; it's a negative for some sizes in terms of
       perf).
    3. For st{r|p}n{cat|cpy} re-order the branches to minimize the
       number that are taken.
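
    A minimal C sketch of the overlapping-stores idea in optimization 1
    (not the actual assembly; copy_4_to_8, load_u32 and store_u32 are
    hypothetical helpers used only for illustration).  A head store and
    a tail store that may overlap cover every length in [4, 8] without
    branching on the exact length:

        #include <stdint.h>
        #include <string.h>

        /* Unaligned 4-byte load/store helpers (illustrative).  */
        static inline uint32_t
        load_u32 (const char *p)
        {
          uint32_t v;
          memcpy (&v, p, sizeof v);
          return v;
        }

        static inline void
        store_u32 (char *p, uint32_t v)
        {
          memcpy (p, &v, sizeof v);
        }

        /* Copy len bytes for 4 <= len <= 8.  The two stores overlap
           when len < 8, so no per-length branch is needed.  */
        static void
        copy_4_to_8 (char *dst, const char *src, size_t len)
        {
          uint32_t head = load_u32 (src);
          uint32_t tail = load_u32 (src + len - 4);
          store_u32 (dst, head);
          store_u32 (dst + len - 4, tail);
        }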

Performance Changes:

    Times are from N = 10 runs of the benchmark suite and are
    reported as the geometric mean of all ratios of
    New Implementation / Old Implementation (a sketch of this
    computation follows the numbers below).

    strcat-avx2      -> 0.998
    strcpy-avx2      -> 0.937
    stpcpy-avx2      -> 0.971

    strncpy-avx2     -> 0.793
    stpncpy-avx2     -> 0.775

    strncat-avx2     -> 0.962
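
    A minimal sketch of how the reported metric could be computed (the
    function and its inputs are illustrative, not part of this patch):
    the geometric mean of the per-configuration new/old timing ratios,
    so values below 1.0 mean the new implementation is faster on
    average.

        #include <math.h>
        #include <stddef.h>

        /* Geometric mean of new_times[i] / old_times[i] over n
           benchmark configurations.  */
        static double
        geomean_ratio (const double *new_times, const double *old_times,
                       size_t n)
        {
          double log_sum = 0.0;
          for (size_t i = 0; i < n; i++)
            log_sum += log (new_times[i] / old_times[i]);
          return exp (log_sum / (double) n);
        }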

Code Size Changes:
    function         -> Bytes New / Bytes Old -> Ratio

    strcat-avx2      -> 685  / 1639 -> 0.418
    strcpy-avx2      -> 560  / 903  -> 0.620
    stpcpy-avx2      -> 592  / 939  -> 0.630

    strncpy-avx2     -> 1176 / 2390 -> 0.492
    stpncpy-avx2     -> 1268 / 2438 -> 0.520

    strncat-avx2     -> 1042 / 2563 -> 0.407

Notes:
    1. Because of the significant difference between the
       implementations, they are split into three files.

           strcpy-avx2.S    -> strcpy, stpcpy, strcat
           strncpy-avx2.S   -> strncpy
           strncat-avx2.S   -> strncat

       I couldn't find a way to merge them without making the
       ifdefs incredibly difficult to follow.

    2. All implementations can be made evex512 by including
       "x86-evex512-vecs.h" at the top.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.

---
 sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S    |    6 +-
 sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S   |    7 +-
 sysdeps/x86_64/multiarch/stpncpy-avx2.S       |    5 +-
 sysdeps/x86_64/multiarch/strcat-avx2-rtm.S    |   13 +-
 sysdeps/x86_64/multiarch/strcat-avx2.S        |  268 +---
 sysdeps/x86_64/multiarch/strcat-strlen-avx2.S |   76 +
 sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S    |   13 +-
 sysdeps/x86_64/multiarch/strcpy-avx2.S        | 1236 +++++------------
 sysdeps/x86_64/multiarch/strncat-avx2-rtm.S   |    6 +-
 sysdeps/x86_64/multiarch/strncat-avx2.S       |  424 +++++-
 sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S   |    6 +-
 sysdeps/x86_64/multiarch/strncpy-avx2.S       |  740 +++++++++-
 sysdeps/x86_64/multiarch/x86-avx-vecs.h       |    5 +-
 sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h  |   26 +
 sysdeps/x86_64/multiarch/x86-avx2-vecs.h      |   27 +
 15 files changed, 1624 insertions(+), 1234 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-vecs.h

diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
index 2b9c07a59f..189a288053 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPCPY	__stpcpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "stpcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
index 60a2ccfe53..1b252985e7 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
@@ -1,4 +1,3 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPNCPY	__stpncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "stpncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
index b2f8c19143..a46a8edbe2 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
@@ -3,6 +3,5 @@
 #endif
 
 #define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY	STPNCPY
-#include "strcpy-avx2.S"
+#define STRNCPY	STPNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
index 637fb557c4..94d51d10bd 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
@@ -1,12 +1,3 @@
-#ifndef STRCAT
-# define STRCAT __strcat_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCAT	__strcat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
 #include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
index d9b7fb2a43..3f914fa342 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
@@ -16,266 +16,10 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (3)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-#  define STRCAT  __strcat_avx2
-# endif
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE	32
-
-# ifndef SECTION
-#  define SECTION(p)	p##.avx
-# endif
-
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCAT)
-	mov	%rdi, %r9
-# ifdef USE_AS_STRNCAT
-	mov	%rdx, %r8
-# endif
-
-	xor	%eax, %eax
-	mov	%edi, %ecx
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	vpxor	%xmm6, %xmm6, %xmm6
-	cmp	$(VEC_SIZE * 3), %ecx
-	ja	L(fourth_vector_boundary)
-	vpcmpeqb (%rdi), %ymm6, %ymm0
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_first_vector)
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	jmp	L(align_vec_size_start)
-L(fourth_vector_boundary):
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	vpcmpeqb	(%rax), %ymm6, %ymm0
-	mov	$-1, %r10d
-	sub	%rax, %rcx
-	shl	%cl, %r10d
-	vpmovmskb %ymm0, %edx
-	and	%r10d, %edx
-	jnz	L(exit)
-
-L(align_vec_size_start):
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 5), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	add	$VEC_SIZE, %rax
-
-	.p2align 4
-L(align_four_vec_loop):
-	vmovaps	(%rax),	%ymm4
-	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
-	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
-	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
-	add	$(VEC_SIZE * 4),	%rax
-	vpminub	%ymm4,	%ymm5, %ymm5
-	vpcmpeqb %ymm5,	%ymm6, %ymm5
-	vpmovmskb %ymm5,	%edx
-	test	%edx,	%edx
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
-	sub	$(VEC_SIZE * 5),	%rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit):
-	sub	%rdi, %rax
-L(exit_null_on_first_vector):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_second_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$VEC_SIZE, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_third_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 2), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fourth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 3), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fifth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-
-	.p2align 4
-L(StartStrcpyPart):
-	lea	(%r9, %rax), %rdi
-	mov	%rsi, %rcx
-	mov	%r9, %rax      /* save result */
-
-# ifdef USE_AS_STRNCAT
-	test	%r8, %r8
-	jz	L(ExitZero)
-#  define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-avx2.S"
+#ifndef STRCAT
+# define STRCAT	__strcat_avx2
 #endif
+
+#define USE_AS_STRCAT
+#define STRCPY	STRCAT
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
new file mode 100644
index 0000000000..128a45b6ff
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
@@ -0,0 +1,76 @@
+	/* Simple strlen implementation that ends at L(strcat_strlen_done).  */
+	movq	%rdi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	VEC_SIZE(%r8), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	leaq	(VEC_SIZE)(%r8), %rdi
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v1)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v2)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v3)
+
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	.p2align 4,, 8
+L(loop_2x_vec):
+	VMOVA	(VEC_SIZE * 0 + 1)(%rdi), %VMM(0)
+	VPMIN	(VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rdi), %VMM(2)
+	VPMIN	(VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3)
+	VPMIN	%VMM(1), %VMM(3), %VMM(3)
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(3)
+	vpmovmskb %VMM(3), %r8d
+	subq	$(VEC_SIZE * -4), %rdi
+	testl	%r8d, %r8d
+	jz	L(loop_2x_vec)
+
+	addq	$(VEC_SIZE * -4 + 1), %rdi
+
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(1)
+	vpmovmskb %VMM(1), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v1)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(2)
+	vpmovmskb %VMM(2), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v2)
+
+	movl	%r8d, %ecx
+L(bsf_and_done_v3):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v2):
+	bsfl	%ecx, %ecx
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rdi
+	jmp	L(strcat_strlen_done)
+
+	.p2align 4,, 4
+L(bsf_and_done_v1):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v0):
+	bsfl	%ecx, %ecx
+	addq	%rcx, %rdi
+L(strcat_strlen_done):
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
index c2c581ecf7..fe80ffd265 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
@@ -1,12 +1,3 @@
-#ifndef STRCPY
-# define STRCPY __strcpy_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCPY	__strcpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
 #include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index c725834929..b87a1722d5 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -20,984 +20,378 @@
 
 #if ISA_SHOULD_BUILD (3)
 
+# include <sysdep.h>
 
-# ifndef USE_AS_STRCAT
-#  include <sysdep.h>
-
-#  ifndef STRCPY
-#   define STRCPY  __strcpy_avx2
-#  endif
-
-# endif
-
-/* Number of bytes in a vector register */
 # ifndef VEC_SIZE
-#  define VEC_SIZE	32
-# endif
-
-# ifndef VZEROUPPER
-#  define VZEROUPPER	vzeroupper
-# endif
-
-# ifndef SECTION
-#  define SECTION(p)	p##.avx
-# endif
-
-/* zero register */
-#define xmmZ	xmm0
-#define ymmZ	ymm0
-
-/* mask register */
-#define ymmM	ymm1
-
-# ifndef USE_AS_STRCAT
-
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCPY)
-#  ifdef USE_AS_STRNCPY
-	mov	%RDX_LP, %R8_LP
-	test	%R8_LP, %R8_LP
-	jz	L(ExitZero)
-#  endif
-	mov	%rsi, %rcx
-#  ifndef USE_AS_STPCPY
-	mov	%rdi, %rax      /* save result */
-#  endif
-
+#  include "x86-avx2-vecs.h"
 # endif
 
-	vpxor	%xmmZ, %xmmZ, %xmmZ
-
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	cmp	$(VEC_SIZE * 2), %ecx
-	jbe	L(SourceStringAlignmentLessTwoVecSize)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-
-	vpcmpeqb (%rsi), %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	shr	%cl, %rdx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	mov	$VEC_SIZE, %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  else
-	mov	$(VEC_SIZE + 1), %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  endif
-	jbe	L(CopyVecSizeTailCase2OrCase3)
+# ifndef STRCPY
+#  define STRCPY	__strcpy_avx2
 # endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail)
 
-	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
-	vpmovmskb %ymm2, %edx
+	/* Use movsb in page cross case to save code size.  */
+# define USE_MOVSB_IN_PAGE_CROSS	1
 
-# ifdef USE_AS_STRNCPY
-	add	$VEC_SIZE, %r10
-	cmp	%r10, %r8
-	jbe	L(CopyTwoVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize)
-
-	vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
-	vmovdqu %ymm2, (%rdi)
-
-/* If source address alignment != destination address alignment */
-	.p2align 4
-L(UnalignVecSizeBoth):
-	sub	%rcx, %rdi
-# ifdef USE_AS_STRNCPY
-	add	%rcx, %r8
-	sbb	%rcx, %rcx
-	or	%rcx, %r8
-# endif
-	mov	$VEC_SIZE, %rcx
-	vmovdqa (%rsi, %rcx), %ymm2
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 3), %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_WCSCPY
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
-	jnz	L(CopyVecSize)
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define PAGE_SIZE	4096
 
-	vmovdqu %ymm3, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
+# ifdef USE_AS_STPCPY
+#  define END_REG	rax
 # else
-	jnz	L(CopyVecSize)
+#  define END_REG	rdi, %rdx
 # endif
 
-	vmovdqu %ymm4, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STRCAT
+#  define PAGE_ALIGN_REG	ecx
 # else
-	jnz	L(CopyVecSize)
+#  define PAGE_ALIGN_REG	eax
 # endif
 
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
 
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
 
-	vmovdqu %ymm3, (%rdi, %rcx)
-	mov	%rsi, %rdx
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	and	$-(VEC_SIZE * 4), %rsi
-	sub	%rsi, %rdx
-	sub	%rdx, %rdi
-# ifdef USE_AS_STRNCPY
-	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
-	vmovdqa (%rsi), %ymm4
-	vmovdqa VEC_SIZE(%rsi), %ymm5
-	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
-	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
-	vpminub %ymm5, %ymm4, %ymm2
-	vpminub %ymm7, %ymm6, %ymm3
-	vpminub %ymm2, %ymm3, %ymm3
-	vpcmpeqb %ymmM, %ymm3, %ymm3
-	vpmovmskb %ymm3, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
-	add	$(VEC_SIZE * 4), %rdi
-	add	$(VEC_SIZE * 4), %rsi
-	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
-	vmovdqa (%rsi), %ymm4
-	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
-	vmovdqa VEC_SIZE(%rsi), %ymm5
-	vpminub %ymm5, %ymm4, %ymm2
-	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
-	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
-	vmovdqu %ymm7, -VEC_SIZE(%rdi)
-	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
-	vpminub %ymm7, %ymm6, %ymm3
-	vpminub %ymm2, %ymm3, %ymm3
-	vpcmpeqb %ymmM, %ymm3, %ymm3
-	vpmovmskb %ymm3, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jz	L(UnalignedFourVecSizeLoop_start)
-
-L(UnalignedFourVecSizeLeave):
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_0)
-
-	vpcmpeqb %ymm5, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %ecx
-	test	%ecx, %ecx
-	jnz	L(CopyVecSizeUnaligned_16)
-
-	vpcmpeqb %ymm6, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_32)
-
-	vpcmpeqb %ymm7, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %ecx
-	bsf	%ecx, %edx
-	vmovdqu %ymm4, (%rdi)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 3), %rsi
-	add	$(VEC_SIZE * 3), %rdi
-	jmp	L(CopyVecSizeExit)
+# ifdef USE_AS_STRCAT
+	movq	%rdi, %rax
+#  include "strcat-strlen-avx2.S"
 # endif
 
-/* If source address alignment == destination address alignment */
-
-L(SourceStringAlignmentLessTwoVecSize):
-	vmovdqu (%rsi), %ymm3
-	vmovdqu VEC_SIZE(%rsi), %ymm2
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$VEC_SIZE, %r8
-#  else
-	cmp	$(VEC_SIZE + 1), %r8
-#  endif
-	jbe	L(CopyVecSizeTail1Case2OrCase3)
+	movl	%esi, %PAGE_ALIGN_REG
+	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
+	ja	L(page_cross)
+L(page_cross_continue):
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 # endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail1)
-
-	vmovdqu %ymm3, (%rdi)
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$(VEC_SIZE * 2), %r8
-#  else
-	cmp	$((VEC_SIZE * 2) + 1), %r8
-#  endif
-	jbe	L(CopyTwoVecSize1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize1)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-	jmp	L(UnalignVecSizeBoth)
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
 
-/*------End of main part with loops---------------------*/
+	testl	%ecx, %ecx
+	jz	L(more_1x_vec)
 
-/* Case1 */
+	/* The ymm registers are no longer needed, so vzeroupper here to
+	   avoid duplicating it at each return statement.  */
+	COND_VZEROUPPER
 
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
-	.p2align 4
-L(CopyVecSize):
-	add	%rcx, %rdi
-# endif
-L(CopyVecSizeTail):
-	add	%rcx, %rsi
-L(CopyVecSizeTail1):
-	bsf	%edx, %edx
-L(CopyVecSizeExit):
-	cmp	$32, %edx
-	jae	L(Exit32_63)
-	cmp	$16, %edx
-	jae	L(Exit16_31)
-	cmp	$8, %edx
-	jae	L(Exit8_15)
-	cmp	$4, %edx
-	jae	L(Exit4_7)
-	cmp	$3, %edx
-	je	L(Exit3)
-	cmp	$1, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	movb	$0, (%rdi)
+	xorl	%edx, %edx
+	bsfl	%ecx, %edx
 # ifdef USE_AS_STPCPY
-	lea	(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$1, %r8
-	lea	1(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
-
-	.p2align 4
-L(CopyTwoVecSize1):
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$VEC_SIZE, %r8
-# endif
-	jmp	L(CopyVecSizeTail1)
-
-	.p2align 4
-L(CopyTwoVecSize):
-	bsf	%edx, %edx
-	add	%rcx, %rsi
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	jmp	L(CopyVecSizeExit)
-
-	.p2align 4
-L(CopyVecSizeUnaligned_0):
-	bsf	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm4, (%rdi)
-	add	$((VEC_SIZE * 4) - 1), %r8
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	jmp	L(CopyVecSizeExit)
-# endif
-
-	.p2align 4
-L(CopyVecSizeUnaligned_16):
-	bsf	%ecx, %edx
-	vmovdqu %ymm4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	VEC_SIZE(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	add	$((VEC_SIZE * 3) - 1), %r8
-	sub	%rdx, %r8
-	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
+	leaq	(%rdi, %rdx), %rax
+# endif
+
+	/* Use mask bits in rcx to detect which copy we need. If the low
+	   mask is zero then there must be a bit set in the upper half.
+	   I.e. if ecx != 0 and cx == 0, then the match must be in the
+	   upper 16 bits so we use L(copy_16_31).  */
+	testw	%cx, %cx
+	jz	L(copy_16_31)
+
+	testb	%cl, %cl
+	jz	L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+	vmovd	%xmm0, (%rdi)
+	movl	$0, (%END_REG)
+	ret
 # else
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
-
-	.p2align 4
-L(CopyVecSizeUnaligned_32):
-	bsf	%edx, %edx
-	vmovdqu %ymm4, (%rdi)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	add	$((VEC_SIZE * 2) - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
+	testb	$0x7, %cl
+	jz	L(copy_4_7)
+
+	testl	%edx, %edx
+	jz	L(set_null_term)
+	vmovd	%xmm0, %ecx
+	movw	%cx, (%rdi)
+
+	.p2align 4,, 2
+L(set_null_term):
+	movb	$0, (%END_REG)
+	ret
+
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-3(%rsi, %rdx), %ecx
+	vmovd	%xmm0, (%rdi)
+	movl	%ecx, -3(%END_REG)
+	ret
+# endif
+
+	.p2align 4,, 10
+L(copy_16_31):
+	VMOVU	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
+	ret
+
+	.p2align 4,, 10
+L(copy_8_15):
+# ifdef USE_AS_WCSCPY
+	movl	-(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
 # else
-	add	$(VEC_SIZE * 2), %rsi
-	add	$(VEC_SIZE * 2), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
-
-# ifdef USE_AS_STRNCPY
-#  ifndef USE_AS_STRCAT
-	.p2align 4
-L(CopyVecSizeUnalignedVec6):
-	vmovdqu %ymm6, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec5):
-	vmovdqu %ymm5, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec4):
-	vmovdqu %ymm4, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec3):
-	vmovdqu %ymm3, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-#  endif
-
-/* Case2 */
-
-	.p2align 4
-L(CopyVecSizeCase2):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTailCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTail1Case2):
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-/* Case2 or Case3,  Case3 */
-
-	.p2align 4
-L(CopyVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeCase2)
-L(CopyVecSizeCase3):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyTwoVecSizeCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyVecSizeTailCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTailCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSize1Case2OrCase3):
-	add	$VEC_SIZE, %rdi
-	add	$VEC_SIZE, %rsi
-	sub	$VEC_SIZE, %r8
-L(CopyVecSizeTail1Case2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTail1Case2)
-	jmp	L(StrncpyExit)
-# endif
-
-/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
-
-	.p2align 4
-L(Exit1):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$2, %r8
-	lea	2(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit2):
-	movzwl	(%rsi), %ecx
-	mov	%cx, (%rdi)
-	movb	$0, 2(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$3, %r8
-	lea	3(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit3):
-	mov	(%rsi), %edx
-	mov	%edx, (%rdi)
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
+# endif
+	vmovq	%xmm0, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
+	ret
+
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rdi)
+# endif
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	addq	%rsi, %rdi
+	VMOVA	1(%rsi), %VMM(1)
+
+	/* Try to order stores after as many loads as is reasonable to
+	   avoid potential false dependencies.  */
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rax)
+# endif
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE + 1)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), 1(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE + 1)(%rdi)
+
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %edx
+	testl	%edx, %edx
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 3 + 1)(%rdi)
+
+	/* Subtract rsi from rdi before aligning. Adding back rsi will
+	   give the proper rdi (dst) for the new src.  */
+	subq	%rsi, %rdi
+	incq	%rsi
+	orq	$(VEC_SIZE * 4 - 1), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %edx
+	addq	%rsi, %rdi
+
+	testl	%edx, %edx
+	jnz	L(loop_4x_done)
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 1)(%rdi)
+
+
+	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %edx
+	subq	$(VEC_SIZE * -4), %rdi
+	testl	%edx, %edx
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
+L(ret_vec_x4):
+	bsfl	%edx, %edx
+	VMOVU	((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
 # ifdef USE_AS_STPCPY
-	lea	3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$4, %r8
-	lea	4(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
 # endif
+L(return_end):
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(Exit4_7):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	-3(%rsi, %rdx), %ecx
-	mov	%ecx, -3(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x1):
+	bsfl	%ecx, %ecx
+	VMOVU	(1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	1(%rcx, %rdi), %rax
 # endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit8_15):
-	mov	(%rsi), %rcx
-	mov	-7(%rsi, %rdx), %r9
-	mov	%rcx, (%rdi)
-	mov	%r9, -7(%rdi, %rdx)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(Exit16_31):
-	vmovdqu (%rsi), %xmm2
-	vmovdqu -15(%rsi, %rdx), %xmm3
-	vmovdqu %xmm2, (%rdi)
-	vmovdqu %xmm3, -15(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x2):
+	bsfl	%ecx, %ecx
+	VMOVU	((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub %rdx, %r8
-	sub $1, %r8
-	lea 1(%rdi, %rdx), %rdi
-	jnz L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
 # endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(Exit32_63):
-	vmovdqu (%rsi), %ymm2
-	vmovdqu -31(%rsi, %rdx), %ymm3
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, -31(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsfl	%ecx, %ecx
+	VMOVU	((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
 # endif
 	VZEROUPPER_RETURN
 
-# ifdef USE_AS_STRNCPY
 
-	.p2align 4
-L(StrncpyExit1):
-	movzbl	(%rsi), %edx
-	mov	%dl, (%rdi)
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %rcx
+	andq	$(VEC_SIZE * -1), %rcx
+
+	VPCMPEQ	(%rcx), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+# if USE_MOVSB_IN_PAGE_CROSS
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* This adds one to the later result which will give the correct
+	   copy bounds. NB: this can never zero out a non-zero RCX
+	   because to be in the page cross case rsi cannot be aligned
+	   and we already right-shift rcx by the misalignment.  */
+	shll	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsfl	%ecx, %ecx
+#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
+#  endif
+	rep	movsb
 #  ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 1(%rdi)
+	leaq	-CHAR_SIZE(%rdi), %rax
 #  endif
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(StrncpyExit2):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 2(%rdi)
-#  endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(StrncpyExit3_4):
-	movzwl	(%rsi), %ecx
-	movzwl	-2(%rsi, %r8), %edx
-	mov	%cx, (%rdi)
-	mov	%dx, -2(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit5_8):
-	mov	(%rsi), %ecx
-	mov	-4(%rsi, %r8), %edx
-	mov	%ecx, (%rdi)
-	mov	%edx, -4(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit9_16):
-	mov	(%rsi), %rcx
-	mov	-8(%rsi, %r8), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit17_32):
-	vmovdqu (%rsi), %xmm2
-	vmovdqu -16(%rsi, %r8), %xmm3
-	vmovdqu %xmm2, (%rdi)
-	vmovdqu %xmm3, -16(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit33_64):
-	/*  0/32, 31/16 */
-	vmovdqu (%rsi), %ymm2
-	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit65):
-	/* 0/32, 32/32, 64/1 */
-	vmovdqu (%rsi), %ymm2
-	vmovdqu 32(%rsi), %ymm3
-	mov	64(%rsi), %cl
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, 32(%rdi)
-	mov	%cl, 64(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	65(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 65(%rdi)
-#  endif
-	VZEROUPPER_RETURN
+# else
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
 
+	/* Traditional copy case, essentially the same as used in the
+	   non-page-cross case, but since we can't reuse VMM(0) we need
+	   twice as many loads from rsi.  */
 #  ifndef USE_AS_STRCAT
-
-	.p2align 4
-L(Fill1):
-	mov	%dl, (%rdi)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill2):
-	mov	%dx, (%rdi)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill3_4):
-	mov	%dx, (%rdi)
-	mov     %dx, -2(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill5_8):
-	mov	%edx, (%rdi)
-	mov     %edx, -4(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill9_16):
-	mov	%rdx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill17_32):
-	vmovdqu %xmmZ, (%rdi)
-	vmovdqu %xmmZ, -16(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec2):
-	vmovdqu %ymm2, (%rdi, %rcx)
-
-	.p2align 4
-L(CopyVecSizeVecExit):
-	bsf	%edx, %edx
-	add	$(VEC_SIZE - 1), %r8
-	add	%rcx, %rdi
-#   ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-#   endif
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-
-	.p2align 4
-L(StrncpyFillTailWithZero):
-	xor	%edx, %edx
-	sub	$VEC_SIZE, %r8
-	jbe	L(StrncpyFillExit)
-
-	vmovdqu %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-
-	mov	%rdi, %rsi
-	and	$(VEC_SIZE - 1), %esi
-	sub	%rsi, %rdi
-	add	%rsi, %r8
-	sub	$(VEC_SIZE * 4), %r8
-	jb	L(StrncpyFillLessFourVecSize)
-
-L(StrncpyFillLoopVmovdqa):
-	vmovdqa %ymmZ, (%rdi)
-	vmovdqa %ymmZ, VEC_SIZE(%rdi)
-	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
-	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE * 4), %rdi
-	sub	$(VEC_SIZE * 4), %r8
-	jae	L(StrncpyFillLoopVmovdqa)
-
-L(StrncpyFillLessFourVecSize):
-	add	$(VEC_SIZE * 2), %r8
-	jl	L(StrncpyFillLessTwoVecSize)
-	vmovdqa %ymmZ, (%rdi)
-	vmovdqa %ymmZ, VEC_SIZE(%rdi)
-	add	$(VEC_SIZE * 2), %rdi
-	sub	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	vmovdqa %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillLessTwoVecSize):
-	add	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	vmovdqa %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillExit):
-	add	$VEC_SIZE, %r8
-L(Fill):
-	cmp	$17, %r8d
-	jae	L(Fill17_32)
-	cmp	$9, %r8d
-	jae	L(Fill9_16)
-	cmp	$5, %r8d
-	jae	L(Fill5_8)
-	cmp	$3, %r8d
-	jae	L(Fill3_4)
-	cmp	$1, %r8d
-	ja	L(Fill2)
-	je	L(Fill1)
-	VZEROUPPER_RETURN
-
-/* end of ifndef USE_AS_STRCAT */
+	xorl	%edx, %edx
 #  endif
-
-	.p2align 4
-L(UnalignedLeaveCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(UnalignedFourVecSizeLeaveCase2)
-L(UnalignedFourVecSizeLeaveCase3):
-	lea	(VEC_SIZE * 4)(%r8), %rcx
-	and	$-VEC_SIZE, %rcx
-	add	$(VEC_SIZE * 3), %r8
-	jl	L(CopyVecSizeCase3)
-	vmovdqu %ymm4, (%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+	bsfl	%ecx, %edx
 #  ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 4)(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (VEC_SIZE * 4)(%rdi)
+	leaq	(%rdi, %rdx), %rax
+#  elif !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(UnalignedFourVecSizeLeaveCase2):
-	xor	%ecx, %ecx
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$(VEC_SIZE * 3), %r8
-	jle	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-	vpcmpeqb %ymm5, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm4, (%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec5)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
+	/* vzeroupper early to avoid duplicating at each return.  */
+	COND_VZEROUPPER
 
-	vpcmpeqb %ymm6, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec6)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
+	testw	%cx, %cx
+	jz	L(page_cross_copy_16_31)
 
-	vpcmpeqb %ymm7, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	lea	VEC_SIZE(%rdi, %rcx), %rdi
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-L(StrncpyExit):
-	cmp	$65, %r8d
-	je	L(StrncpyExit65)
-	cmp	$33, %r8d
-	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8d
-	jae	L(StrncpyExit17_32)
-	cmp	$9, %r8d
-	jae	L(StrncpyExit9_16)
-	cmp	$5, %r8d
-	jae	L(StrncpyExit5_8)
-	cmp	$3, %r8d
-	jae	L(StrncpyExit3_4)
-	cmp	$1, %r8d
-	ja	L(StrncpyExit2)
-	je	L(StrncpyExit1)
-#  ifdef USE_AS_STPCPY
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(ExitZero):
-#  ifndef USE_AS_STRCAT
-	mov	%rdi, %rax
-#  endif
-	VZEROUPPER_RETURN
+	testb	%cl, %cl
+	jz	L(page_cross_copy_8_15)
 
-# endif
+	testl	$0x7, %cl
+	jz	L(page_cross_copy_4_7)
 
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
-# endif
+	testl	%edx, %edx
+	jz	L(page_cross_set_null_term)
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+L(page_cross_set_null_term):
+	movb	$0, (%END_REG)
+	ret
+
+	.p2align 4,, 4
+L(page_cross_copy_4_7):
+	movl	(%rsi), %ecx
+	movl	-3(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, -3(%END_REG)
+	ret
+
+	.p2align 4,, 4
+L(page_cross_copy_8_15):
+	movq	(%rsi), %rcx
+	movq	-7(%rsi, %rdx), %rsi
+	movq	%rcx, (%rdi)
+	movq	%rsi, -7(%END_REG)
+	ret
+
+
+	.p2align 4,, 3
+L(page_cross_copy_16_31):
+	VMOVU	(%rsi), %xmm0
+	VMOVU	-15(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -15(%END_REG)
+	ret
+# endif
+
+END(STRCPY)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
index 0dcea18dbb..2bbdbb91ab 100644
--- a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_avx2_rtm
-#include "strcat-avx2-rtm.S"
+#define STRNCAT	__strncat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "strncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
index 52ecbca943..547cef9486 100644
--- a/sysdeps/x86_64/multiarch/strncat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
@@ -1,7 +1,419 @@
-#ifndef STRNCAT
-# define STRNCAT	__strncat_avx2
-#endif
+/* strncat with AVX2
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-avx2-vecs.h"
+# endif
+
+# ifndef STRNCAT
+#  define STRNCAT	__strncat_avx2
+# endif
+
+# ifdef USE_AS_WCSCPY
+#  define movNULL	movl
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define movNULL	movb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+	/* Filter zero length strings and very long strings.  Zero
+	   length strings just return; very long strings are handled by
+	   using the non-length variant {wcs|str}cat.  */
+	movq	%rdi, %rax
+# ifdef USE_AS_WCSCPY
+	leaq	-1(%rdx), %rcx
+	shr	$56, %rcx
+	jnz	L(zero_len)
+	salq	$2, %rdx
+# else
+	test	%rdx, %rdx
+	jl	L(zero_len)
+# endif
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
+
+# include "strcat-strlen-avx2.S"
+
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	tzcnt	%ecx, %r8d
+	cmpq	%r8, %rdx
+	jbe	L(less_1x_vec)
+
+	testl	%ecx, %ecx
+	jz	L(more_1x_vec)
+
+	/* Hoist this to save code size.  */
+
+	movl	%r8d, %edx
+
+L(less_1x_vec):
+	COND_VZEROUPPER
+
+	cmpl	$16, %edx
+	jae	L(copy_16_31)
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
+
+
+# ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+# else
+	cmpl	$4, %edx
+	jae	L(copy_4_7)
+
+	movzbl	(%rsi), %ecx
+	cmpl	$1, %edx
+	jbe	L(set_null_term)
+
+	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
+	 */
+	movzwl	1(%rsi), %esi
+	movw	%si, 1(%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	movb	%cl, (%rdi)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 11
+L(copy_4_7):
+	movl	-(4)(%rsi, %rdx), %ecx
+	vmovd	%xmm0, (%rdi)
+	movl	%ecx, -(4)(%rdi, %rdx)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+# endif
+
+
+	.p2align 4,, 10
+L(copy_16_31):
+	VMOVU	-(16)(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -(16)(%rdi, %rdx)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 10
+L(copy_8_15):
+	movq	-(8)(%rsi, %rdx), %rcx
+	vmovq	%xmm0, (%rdi)
+	movq	%rcx, -(8)(%rdi, %rdx)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+	.p2align 6,, 14
+L(more_1x_vec):
+	VMOVU	%VMM(0), (%rdi)
+
+	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	incq	%rsi
+	addq	%rsi, %rdi
+L(loop_last_4x_vec):
+	subq	%rsi, %rdx
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	tzcnt	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len)
+
+	cmpl	$VEC_SIZE, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	addl	$-VEC_SIZE, %edx
+	bzhil	%edx, %ecx, %r8d
+	jz	L(ret_vec_x2_len)
+L(ret_vec_x2):
+	bsfl	%ecx, %edx
+L(ret_vec_x2_len):
+	VMOVU	(%rsi, %rdx), %VMM(0)
+	movNULL	$0, (VEC_SIZE)(%rdi, %rdx)
+	VMOVU	%VMM(0), (%rdi, %rdx)
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+
+	.p2align 4,, 12
+L(ret_vec_x1_len):
+	movl	%edx, %ecx
+L(ret_vec_x1):
+	VMOVU	-(VEC_SIZE)(%rsi, %rcx), %VMM(1)
+	movNULL	$0, (%rdi, %rcx)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rcx)
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	subq	$-(VEC_SIZE * 4), %rsi
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	addl	$-(VEC_SIZE * 4), %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+	/* L(ret_vec_x1) expects ecx to have position of first match so
+	   test with bsf.  */
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
 
-#define USE_AS_STRNCAT
-#define STRCAT	STRNCAT
-#include "strcat-avx2.S"
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 1)(%rdi)
+
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* Check if length is greater than 4x VEC.  */
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	addl	$(VEC_SIZE * -2), %edx
+
+	tzcnt	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len)
+
+	cmpl	$VEC_SIZE, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	addl	$-VEC_SIZE, %edx
+	bzhil	%edx, %ecx, %r8d
+	jz	L(ret_vec_x4_len)
+L(ret_vec_x4):
+	bsfl	%ecx, %edx
+L(ret_vec_x4_len):
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 4
+L(ret_vec_x3_len):
+	movl	%edx, %ecx
+L(ret_vec_x3):
+	VMOVU	(VEC_SIZE)(%rsi, %rcx), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 2)(%rdi, %rcx)
+	VMOVU	%VMM(0), (VEC_SIZE)(%rdi, %rcx)
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
+
+
+	/* Recheck length before aligning.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %r8d
+	addq	%rsi, %rdi
+	testl	%r8d, %r8d
+	jnz	L(loop_4x_done)
+
+	/* Use r9 for end of region before handling last 4x VEC
+	   specially.  */
+	leaq	-(VEC_SIZE * 4)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %r8d
+
+	testl	%r8d, %r8d
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	/* L(ret_vec_x1) expects ecx to have position of first match so
+	   test with bsf.  */
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	bsfl	%r8d, %r8d
+	VMOVU	(VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
+	VZEROUPPER_RETURN
+
+
+
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+
+	VPCMPEQ	(%r8), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+
+	subl	%esi, %r8d
+	andl	$(VEC_SIZE - 1), %r8d
+	cmpq	%r8, %rdx
+	jb	L(page_cross_small)
+
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* This shift adds CHAR_SIZE to the later bsf result, which
+	   gives the correct copy bounds.  NB: this can never zero out
+	   a non-zero RCX: to be in the page-cross case rsi cannot be
+	   aligned, and rcx has already been right-shifted by the
+	   misalignment.  */
+	shll	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsfl	%ecx, %ecx
+	rep	movsb
+	VZEROUPPER_RETURN
+
+L(page_cross_small):
+	tzcntl	%ecx, %ecx
+	jz	L(page_cross_setz)
+	cmpl	%edx, %ecx
+	cmova	%edx, %ecx
+	rep	movsb
+L(page_cross_setz):
+	movNULL	$0, (%rdi)
+	VZEROUPPER_RETURN
+L(zero_len):
+# ifdef USE_AS_WCSCPY
+	test	%rdx, %rdx
+# endif
+	jnz	OVERFLOW_STRCAT
+	ret
+
+
+END(STRNCAT)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
index 79e7083299..b582a4a7a1 100644
--- a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STRNCPY	__strncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
index ce634e94fa..d1b25b7a42 100644
--- a/sysdeps/x86_64/multiarch/strncpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
@@ -1,7 +1,735 @@
-#ifndef STRNCPY
-# define STRNCPY	__strncpy_avx2
-#endif
+/* strncpy with AVX2
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+
+# ifndef VEC_SIZE
+#  include "x86-avx2-vecs.h"
+# endif
+
+# ifndef STRNCPY
+#  define STRNCPY	__strncpy_avx2
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+	/* Filter zero-length strings and very long strings.  Zero-length
+	   strings just return.  Very long strings are handled by running
+	   rep stos{b|l} to zero-fill the destination (which will almost
+	   certainly segfault); if that somehow succeeds, we then call
+	   OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
+# ifdef USE_AS_WCSCPY
+	decq	%rdx
+	movq	%rdx, %rax
+	/* Bits 56+ are past the end of the max supported address space.  */
+	shr	$56, %rax
+	jnz	L(zero_len)
+	salq	$2, %rdx
+# else
+	decq	%rdx
+	/* `dec` can macro-fuse with `jl`.  If the branch ever needs to
+	   become `jb` (i.e. needs the carry flag), replace `dec` with
+	   `sub`.  */
+	jl	L(zero_len)
+# endif
+
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
+	movl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* Without STPCPY the return value is just dst, so save it
+	   ahead of time.  */
+# ifndef USE_AS_STPCPY
+	movq	%rdi, %rax
+# elif defined USE_AS_WCSCPY
+	/* Clear rax to break the dependency, as nearly all of the
+	   wcpncpy return paths use `setc %al`.  */
+	xorl	%eax, %eax
+# endif
+
+	cmpq	$(VEC_SIZE - CHAR_SIZE), %rdx
+	/* `jbe` because rdx is now length - CHAR_SIZE.  */
+	jbe	L(less_1x_vec)
+
+	/* This may overwrite past the copy length, but that's fine
+	   because we still need to zero-fill.  */
+	VMOVU	%VMM(0), (%rdi)
+
+	testl	%ecx, %ecx
+	jnz	L(zfill)
+
+	/* Align.  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	incq	%rsi
+L(last_4x_vec):
+	addq	%rsi, %rdi
+L(loop_last_4x_vec):
+	subq	%rsi, %rdx
+
+
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jae	L(more_2x_vec)
+
+	cmpl	$(VEC_SIZE), %edx
+	jb	L(ret_vec_x1_len)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VPCMPEQ	VEC_SIZE(%rsi), %VZERO, %VMM(6)
+	VMOVU	%VMM(1), (%rdi)
+	vpmovmskb %VMM(6), %ecx
+	shlq	$VEC_SIZE, %rcx
+L(ret_vec_x1_len):
+	tzcntq	%rcx, %rcx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x1_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+	VMOVU	((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	.p2align 4,, 6
+L(ret_vec_x1):
+	bsfl	%ecx, %ecx
+	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+	subl	%ecx, %edx
+	/* Check if we need to reload/store.  */
+	cmpl	$VEC_SIZE, %edx
+	jb	L(ret_vec_x1_len_no_zfill_mov)
+	/* Otherwise safe to just store directly.  */
+	VMOVU	%VMM(1), (%rdi)
+	VMOVU	%VZERO, (%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 12
+L(more_2x_vec):
+	VMOVU	%VMM(1), (%rdi)
+	testl	%ecx, %ecx
+	/* Must fill at least 2x VEC.  */
+	jnz	L(zfill_vec1)
+
+	VMOVA	VEC_SIZE(%rsi), %VMM(2)
+	VMOVU	%VMM(2), VEC_SIZE(%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	/* Must fill at least 1x VEC.  */
+	jnz	L(zfill_vec2)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* Check if len is more 4x VEC. -CHAR_SIZE because rdx is len -
+	   CHAR_SIZE.  */
+	cmpq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+	ja	L(more_4x_vec)
+
+	subl	$(VEC_SIZE * 3), %edx
+	jb	L(ret_vec_x3_len)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	vpmovmskb %VMM(6), %ecx
+	tzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x4_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+	movl	%ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 3 + 0)(%edx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+
+L(ret_vec_x3_len):
+	addl	$(VEC_SIZE * 1), %edx
+	tzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x3_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+	.p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 2 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsfl	%ecx, %ecx
+	VMOVU	%VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
+	subl	%ecx, %edx
+	jl	L(ret_vec_x3_len_no_zfill_mov)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(more_4x_vec):
+
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec3)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec4)
+
+	movq	%rdx, %rcx
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	/* Recheck length before aligning.  */
+	cmpq	$(VEC_SIZE * 8 - CHAR_SIZE), %rcx
+	jbe	L(last_4x_vec)
+
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %r8d
+	addq	%rsi, %rdi
+	testl	%r8d, %r8d
+	jnz	L(loop_4x_done)
+
+	/* Use r9 as end register.  */
+	leaq	-(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
 
-#define USE_AS_STRNCPY
-#define STRCPY	STRNCPY
-#include "strcpy-avx2.S"
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %r8d
+
+	testl	%r8d, %r8d
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	subq	%rsi, %rdx
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec1)
+
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec2)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec3)
+
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+	movl	%r8d, %ecx
+
+	/* VMM(0..2) were checked above, so the null flagged in r8d must
+	   be in VMM(3).  Fall through to zero-fill the remainder.  */
+
+	.p2align 4,, 4
+L(zfill_vec4):
+	addq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2), %rdx
+L(zfill_vec2):
+	shlq	$VEC_SIZE, %rcx
+L(zfill):
+	bsfq	%rcx, %rcx
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_from_page_cross):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(zfill_less_vec_vzeroupper)
+
+L(zfill_more_1x_vec):
+	VMOVU	%VZERO, CHAR_SIZE(%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jae	L(zfill_more_2x_vec)
+L(zfill_done0):
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(zfill_vec3):
+	addq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2), %rdx
+	.p2align 4,, 2
+L(zfill_vec1):
+	bsfl	%ecx, %ecx
+	addq	%rcx, %rdi
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+	/* zfill from vec1/vec3 must set at least 2x VECs.  */
+
+	VMOVU	%VZERO, CHAR_SIZE(%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jb	L(zfill_done0)
+L(zfill_more_2x_vec):
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
+	subq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+	jbe	L(zfill_done)
+
+	addq	%rdi, %rdx
+	VMOVU	%VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
+
+
+	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+	subq	$-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
+	cmpq	%rdi, %rdx
+	jbe	L(zfill_done)
+
+	andq	$-(VEC_SIZE), %rdi
+	.p2align 4,, 12
+L(zfill_loop_4x_vec):
+	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	ja	L(zfill_loop_4x_vec)
+L(zfill_done):
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(copy_1x):
+	VMOVU	%VMM(0), (%rdi)
+	testl	%ecx, %ecx
+	jz	L(ret_32_32)
+L(zfill_less_vec):
+	bsfl	%ecx, %ecx
+L(zfill_less_vec_no_bsf):
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_less_vec_vzeroupper):
+	COND_VZEROUPPER
+	/* We are taking advantage of the fact that to be here we must
+	   be writing the null terminator at (%rdi, %rcx), so we have a
+	   byte of leeway for overwriting.  */
+	cmpl	$16, %edx
+	jb	L(zfill_less_16)
+	VMOVU	%VZERO_128, (%rdi)
+	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
+	ret
+# ifdef USE_AS_STPCPY
+L(ret_32_32):
+	leaq	CHAR_SIZE(%rdi, %rdx), %rax
+	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 4
+L(copy_16_31):
+	/* Overfill to avoid branches.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	ja	L(zfill_less_vec_no_bsf)
+# ifndef USE_AS_STPCPY
+L(ret_32_32):
+# else
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 4
+L(copy_8_15):
+	/* Overfill to avoid branches.  */
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
+	vmovq	%xmm0, (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_8_15)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+	.p2align 4,, 8
+L(zfill_less_16):
+	xorl	%ecx, %ecx
+	cmpl	$8, %edx
+	jb	L(zfill_less_8)
+	movq	%rcx, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
+# ifndef USE_AS_STPCPY
+L(ret_8_15):
+# endif
+	ret
+
+
+	.p2align 4,, 8
+L(less_1x_vec):
+	/* Reuse the flags from the earlier
+	   `cmp $(VEC_SIZE - CHAR_SIZE), %rdx`.  The idea is that many
+	   buffer sizes are conventionally aligned.  */
+	je	L(copy_1x)
+
+	tzcntl	%ecx, %ecx
+	cmpl	$16, %edx
+	jae	L(copy_16_31)
+
+	COND_VZEROUPPER
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+	testl	%ecx, %ecx
+	jz	L(zfill_less_8_set_ret)
+
+	movl	(%rsi, %rdx), %esi
+	vmovd	%xmm0, (%rdi)
+	movl	%esi, (%rdi, %rdx)
+
+#  ifdef USE_AS_STPCPY
+	cmpl	%ecx, %edx
+L(ret_8_15):
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  endif
+	ret
+L(zfill_less_8_set_ret):
+	xorl	%ecx, %ecx
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+L(zfill_less_8):
+	movl	%ecx, (%rdi)
+	movl	%ecx, (%rdi, %rdx)
+	ret
+
+# else
+	cmpl	$3, %edx
+	jb	L(copy_0_3)
+	/* Overfill to avoid branches.  */
+	movl	-3(%rsi, %rdx), %esi
+	vmovd	%xmm0, (%rdi)
+	movl	%esi, -3(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_4_7)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+	xorl	%ecx, %ecx
+	.p2align 4,, 8
+L(zfill_less_8):
+	cmpl	$3, %edx
+	jb	L(zfill_less_3)
+	movl	%ecx, (%rdi)
+	movl	%ecx, -3(%rdi, %rdx)
+#  ifdef USE_AS_STPCPY
+	ret
+#  endif
+
+L(ret_4_7):
+#  ifdef USE_AS_STPCPY
+L(ret_8_15):
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+	ret
+
+	.p2align 4,, 4
+L(zfill_less_3):
+	testl	%edx, %edx
+	jz	L(zfill_1)
+	movw	%cx, (%rdi)
+L(zfill_1):
+	movb	%cl, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_0_3):
+	vmovd	%xmm0, %r8d
+	testl	%edx, %edx
+	jz	L(copy_1)
+	movw	%r8w, (%rdi)
+	cmpl	%ecx, %edx
+	ja	L(zfill_from_1)
+	movzbl	(%rsi, %rdx), %r8d
+#  ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+	movb	%r8b, (%rdi, %rdx)
+	ret
+#  endif
+
+L(copy_1):
+#  ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	cmpl	%ecx, %edx
+	adcq	%rdi, %rax
+#  endif
+#  ifdef USE_AS_WCSCPY
+	vmovd	%xmm0, (%rdi)
+#  else
+	movb	%r8b, (%rdi, %rdx)
+#  endif
+	ret
+# endif
+
+	.p2align 4,, 2
+L(zero_len):
+	movq	%rdi, %rax
+	ret
+# ifndef USE_AS_WCSCPY
+	.p2align 4,, 8
+L(zfill_from_1):
+#  ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+#  endif
+	movw	$0, -1(%rdi, %rdx)
+	ret
+# endif
+
+	.p2align 4,, 4
+	.p2align 6,, 8
+L(page_cross):
+	movq	%rsi, %rax
+	andq	$(VEC_SIZE * -1), %rax
+
+	VPCMPEQ	(%rax), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+
+	subl	%esi, %eax
+	andl	$(VEC_SIZE - 1), %eax
+	cmpq	%rax, %rdx
+	jb	L(page_cross_small)
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* If ecx is zero after the shift (no null found before the
+	   page boundary), go back to the main path; otherwise fall
+	   through and handle the copy here.  */
+	shl	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsf	%ecx, %ecx
+
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	leaq	-CHAR_SIZE(%rdi, %rcx), %rax
+# else
+	movq	%rdi, %rax
+# endif
+
+	rep	movsb
+# ifdef USE_AS_WCSCPY
+	movl	$0, (%rdi)
+# else
+	movb	$0, (%rdi)
+# endif
+	jmp	L(zfill_from_page_cross)
+
+L(page_cross_small):
+	tzcntl	%ecx, %ecx
+	xorl	%eax, %eax
+	cmpl	%ecx, %edx
+	jbe	L(page_cross_copy_only)
+
+	/* Do a zfill of the tail before copying.  */
+	movq	%rdi, %r9
+	movl	%ecx, %r8d
+
+	subl	%ecx, %edx
+	leaq	CHAR_SIZE(%rdi, %rcx), %rdi
+	movl	%edx, %ecx
+	rep	stosb
+	movq	%r9, %rdi
+	movl	%r8d, %edx
+L(page_cross_copy_only):
+	leal	CHAR_SIZE(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdi, %rdx
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# else
+	movq	%rdi, %rax
+# endif
+	rep	movsb
+	ret
+
+
+L(best_effort_strncpy):
+	movq	%rdx, %rcx
+	xorl	%eax, %eax
+	movq	%rdi, %r8
+	/* The length is >= 2^63.  We very much expect rep stos to
+	   segfault.  If it somehow succeeds, finish by jumping to
+	   OVERFLOW_STRCPY.  */
+# ifdef USE_AS_WCSCPY
+	rep	stosl
+# else
+	rep	stosb
+# endif
+	movq	%r8, %rdi
+	jmp	OVERFLOW_STRCPY
+END(STRNCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
index dca1089060..01bead1435 100644
--- a/sysdeps/x86_64/multiarch/x86-avx-vecs.h
+++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
@@ -27,7 +27,10 @@
 #define VEC_SIZE			32
 #include "x86-vec-macros.h"
 
-#define USE_WITH_AVX		1
+#ifndef USE_WITH_AVX2
+# define USE_WITH_AVX		1
+#endif
+
 #define SECTION(p)			p##.avx
 
 /* 4-byte mov instructions with AVX2.  */
diff --git a/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
new file mode 100644
index 0000000000..a5966701ec
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
@@ -0,0 +1,26 @@
+/* Common config for AVX2-RTM VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX2_RTM_VECS_H
+#define _X86_AVX2_RTM_VECS_H			1
+
+#define USE_WITH_AVX2		1
+#include "x86-avx-rtm-vecs.h"
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-avx2-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
new file mode 100644
index 0000000000..16d7ae5147
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
@@ -0,0 +1,27 @@
+/* Common config for AVX2 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX2_VECS_H
+#define _X86_AVX2_VECS_H			1
+
+#define USE_WITH_AVX2		1
+
+#include "x86-avx-vecs.h"
+
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v2 4/4] x86: Add optimized functions for the wide-character strcpy family
  2022-11-04  8:20 ` [PATCH v2 " Noah Goldstein
  2022-11-04  8:20   ` [PATCH v2 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
  2022-11-04  8:20   ` [PATCH v2 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
@ 2022-11-04  8:20   ` Noah Goldstein
  2022-11-04 16:47     ` H.J. Lu
  2022-11-04 16:26   ` [PATCH v2 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json H.J. Lu
  3 siblings, 1 reply; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04  8:20 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Implemented:
    wcscat-avx2{+rtm}
    wcscpy-avx2{+rtm}
    wcpcpy-avx2{+rtm}
    wcsncpy-avx2{+rtm}
    wcpncpy-avx2{+rtm}
    wcsncat-avx2{+rtm}
    wcscat-evex
    wcscpy-evex
    wcpcpy-evex
    wcsncpy-evex
    wcpncpy-evex
    wcsncat-evex
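
For context (not part of the patch): the new wide-character entry points
mirror the narrow st{r|p}{n}{cat|cpy} family.  A minimal sketch of their
return-value and padding semantics, using only standard <wchar.h> calls
(illustrative only, not taken from this patch):

    #include <assert.h>
    #include <wchar.h>

    int
    main (void)
    {
      wchar_t dst[8];

      /* wcpcpy returns a pointer to the terminating L'\0' it wrote.  */
      wchar_t *end = wcpcpy (dst, L"abc");
      assert (end == dst + 3 && *end == L'\0');

      /* wcsncpy copies at most n wide chars, pads the rest of the
         buffer with L'\0', and returns dst.  */
      assert (wcsncpy (dst, L"ab", 8) == dst);
      assert (dst[2] == L'\0' && dst[7] == L'\0');

      /* wcsncat appends at most n wide chars plus a terminator.  */
      assert (wcsncat (dst, L"cdef", 2) == dst);
      assert (wcscmp (dst, L"abcd") == 0);
      return 0;
    }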

Performance Changes:
    Times are from N = 10 runs of the benchmark suite and are reported
    as the geometric mean of all ratios of New Implementation / Best
    Old Implementation.  The Best Old Implementation is the existing
    implementation at the highest supported ISA level.  (A sketch of
    the aggregation is shown after the list below.)

    wcscat-avx2     -> 0.975
    wcscpy-avx2     -> 0.591
    wcpcpy-avx2     -> 0.698
    wcsncpy-avx2    -> 0.730
    wcpncpy-avx2    -> 0.711
    wcsncat-avx2    -> 0.954
    wcscat-evex     -> 0.991
    wcscpy-evex     -> 0.587
    wcpcpy-evex     -> 0.695
    wcsncpy-evex    -> 0.719
    wcpncpy-evex    -> 0.694
    wcsncat-evex    -> 0.979
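
As a rough illustration (not part of the patch) of how such summary
numbers are formed: the geometric mean of a set of new/old ratios is
the exponential of the mean of their logs.  The values below are just
the per-function AVX2 ratios quoted above; the helper is made up for
illustration:

    #include <math.h>
    #include <stdio.h>

    /* Geometric mean: exp of the arithmetic mean of the logs.  */
    static double
    geomean (const double *r, size_t n)
    {
      double sum = 0.0;
      for (size_t i = 0; i < n; i++)
        sum += log (r[i]);
      return exp (sum / n);
    }

    int
    main (void)
    {
      /* Per-function new/old ratios for the AVX2 variants above.  */
      const double avx2[] = { 0.975, 0.591, 0.698, 0.730, 0.711, 0.954 };
      /* Build with -lm.  */
      printf ("avx2 overall: %.3f\n", geomean (avx2, 6));
      return 0;
    }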

Code Size Changes:
    This change (compared with the last two commits without it)
    increases the size of libc.so by ~19kb.  For reference, the
    entire patchset increases libc.so by ~2.5kb (so without the
    wide-character functions libc.so would decrease by ~16.5kb).

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
 sysdeps/x86_64/Makefile                     |   5 +
 sysdeps/x86_64/multiarch/Makefile           |  26 +++-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c  | 135 +++++++++++++++++++-
 sysdeps/x86_64/multiarch/ifunc-wcs.h        |  60 +++++++++
 sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S  |   3 +
 sysdeps/x86_64/multiarch/wcpcpy-avx2.S      |   8 ++
 sysdeps/x86_64/multiarch/wcpcpy-evex.S      |   8 ++
 sysdeps/x86_64/multiarch/wcpcpy-generic.c   |  27 ++++
 sysdeps/x86_64/multiarch/wcpcpy.c           |  37 ++++++
 sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S |   3 +
 sysdeps/x86_64/multiarch/wcpncpy-avx2.S     |   8 ++
 sysdeps/x86_64/multiarch/wcpncpy-evex.S     |   8 ++
 sysdeps/x86_64/multiarch/wcpncpy-generic.c  |  27 ++++
 sysdeps/x86_64/multiarch/wcpncpy.c          |  37 ++++++
 sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S  |   3 +
 sysdeps/x86_64/multiarch/wcscat-avx2.S      |  10 ++
 sysdeps/x86_64/multiarch/wcscat-evex.S      |   9 ++
 sysdeps/x86_64/multiarch/wcscat-generic.c   |  27 ++++
 sysdeps/x86_64/multiarch/wcscat.c           |  37 ++++++
 sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S  |   3 +
 sysdeps/x86_64/multiarch/wcscpy-avx2.S      |   7 +
 sysdeps/x86_64/multiarch/wcscpy-evex.S      |   7 +
 sysdeps/x86_64/multiarch/wcscpy-generic.c   |   3 +-
 sysdeps/x86_64/multiarch/wcscpy.c           |  21 +++
 sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S |   3 +
 sysdeps/x86_64/multiarch/wcsncat-avx2.S     |   9 ++
 sysdeps/x86_64/multiarch/wcsncat-evex.S     |   9 ++
 sysdeps/x86_64/multiarch/wcsncat-generic.c  |  27 ++++
 sysdeps/x86_64/multiarch/wcsncat.c          |  34 +++++
 sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S |   3 +
 sysdeps/x86_64/multiarch/wcsncpy-avx2.S     |   7 +
 sysdeps/x86_64/multiarch/wcsncpy-evex.S     |   7 +
 sysdeps/x86_64/multiarch/wcsncpy-generic.c  |  27 ++++
 sysdeps/x86_64/multiarch/wcsncpy.c          |  37 ++++++
 sysdeps/x86_64/wcpcpy-generic.c             |  31 +++++
 sysdeps/x86_64/wcpcpy.S                     |  41 ++++++
 sysdeps/x86_64/wcpncpy-generic.c            |  31 +++++
 sysdeps/x86_64/wcpncpy.S                    |  41 ++++++
 sysdeps/x86_64/wcscat-generic.c             |  31 +++++
 sysdeps/x86_64/wcscat.S                     |  41 ++++++
 sysdeps/x86_64/wcscpy.S                     |   2 +
 sysdeps/x86_64/wcsncat-generic.c            |  31 +++++
 sysdeps/x86_64/wcsncat.S                    |  39 ++++++
 sysdeps/x86_64/wcsncpy-generic.c            |  31 +++++
 sysdeps/x86_64/wcsncpy.S                    |  41 ++++++
 45 files changed, 1036 insertions(+), 6 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcs.h
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscat.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy.c
 create mode 100644 sysdeps/x86_64/wcpcpy-generic.c
 create mode 100644 sysdeps/x86_64/wcpcpy.S
 create mode 100644 sysdeps/x86_64/wcpncpy-generic.c
 create mode 100644 sysdeps/x86_64/wcpncpy.S
 create mode 100644 sysdeps/x86_64/wcscat-generic.c
 create mode 100644 sysdeps/x86_64/wcscat.S
 create mode 100644 sysdeps/x86_64/wcsncat-generic.c
 create mode 100644 sysdeps/x86_64/wcsncat.S
 create mode 100644 sysdeps/x86_64/wcsncpy-generic.c
 create mode 100644 sysdeps/x86_64/wcsncpy.S

diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 3627c5659f..688eb2d7c4 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -188,8 +188,13 @@ endif
 ifeq ($(subdir),wcsmbs)
 
 sysdep_routines += \
+  wcpcpy-generic \
+  wcpncpy-generic \
+  wcscat-generic \
   wcscpy-generic \
+  wcsncat-generic \
   wcsncmp-generic \
+  wcsncpy-generic \
   wcsnlen-generic \
 # sysdep_routines
 
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 066bfa48d9..f848fc0e28 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -131,6 +131,18 @@ endif
 
 ifeq ($(subdir),wcsmbs)
 sysdep_routines += \
+  wcpcpy-avx2 \
+  wcpcpy-avx2-rtm \
+  wcpcpy-evex \
+  wcpcpy-generic \
+  wcpncpy-avx2 \
+  wcpncpy-avx2-rtm \
+  wcpncpy-evex \
+  wcpncpy-generic \
+  wcscat-avx2 \
+  wcscat-avx2-rtm \
+  wcscat-evex \
+  wcscat-generic \
   wcschr-avx2 \
   wcschr-avx2-rtm \
   wcschr-evex \
@@ -140,6 +152,10 @@ sysdep_routines += \
   wcscmp-avx2-rtm \
   wcscmp-evex \
   wcscmp-sse2 \
+  wcscpy-avx2 \
+  wcscpy-avx2-rtm \
+  wcscpy-evex \
+  wcscpy-generic \
   wcscpy-ssse3 \
   wcslen-avx2 \
   wcslen-avx2-rtm \
@@ -147,9 +163,17 @@ sysdep_routines += \
   wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
+  wcsncat-avx2 \
+  wcsncat-avx2-rtm \
+  wcsncat-evex \
+  wcsncat-generic \
   wcsncmp-avx2 \
   wcsncmp-avx2-rtm \
   wcsncmp-evex \
+  wcsncpy-avx2 \
+  wcsncpy-avx2-rtm \
+  wcsncpy-evex \
+  wcsncpy-generic \
   wcsnlen-avx2 \
   wcsnlen-avx2-rtm \
   wcsnlen-evex \
@@ -163,8 +187,8 @@ sysdep_routines += \
   wmemchr-avx2 \
   wmemchr-avx2-rtm \
   wmemchr-evex \
-  wmemchr-evex512 \
   wmemchr-evex-rtm \
+  wmemchr-evex512 \
   wmemchr-sse2 \
   wmemcmp-avx2-movbe \
   wmemcmp-avx2-movbe-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7cebee7ec7..71e8953e91 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -901,16 +901,145 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/wcscpy.c.  */
   IFUNC_IMPL (i, name, wcscpy,
-	      /* ISA V4 wrapper for SSSE3 implementation because
-	         the SSSE3 implementation is also used at ISA
-	         level 3/4.  */
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcscpy_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcscpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __wcscpy_ssse3)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
 				     1,
 				     __wcscpy_generic))
 
+  /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
+  IFUNC_IMPL (i, name, wcsncpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcsncpy_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncpy,
+				     1,
+				     __wcsncpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcpcpy.c.  */
+  IFUNC_IMPL (i, name, wcpcpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpcpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpcpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcpcpy_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcpcpy,
+				     1,
+				     __wcpcpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcpncpy.c.  */
+  IFUNC_IMPL (i, name, wcpncpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpncpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcpncpy_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcpncpy,
+				     1,
+				     __wcpncpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcscat.c.  */
+  IFUNC_IMPL (i, name, wcscat,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscat_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscat_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcscat_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscat,
+				     1,
+				     __wcscat_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcsncat.c.  */
+  IFUNC_IMPL (i, name, wcsncat,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncat_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncat_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcsncat_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncat,
+				     1,
+				     __wcsncat_generic))
+
   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
   IFUNC_IMPL (i, name, wcslen,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcslen,
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
new file mode 100644
index 0000000000..cda633d8fb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
@@ -0,0 +1,60 @@
+/* Common definition for ifunc selection of optimized wide-character
+   string copy functions.
+
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+#ifndef GENERIC
+# define GENERIC generic
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				      AVX_Fast_Unaligned_Load, ))
+    {
+      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				       Prefer_No_VZEROUPPER, !))
+	return OPTIMIZE (avx2);
+
+    }
+
+  return OPTIMIZE (GENERIC);
+}
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
new file mode 100644
index 0000000000..756280a3ab
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCPCPY	__wcpcpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcpcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
new file mode 100644
index 0000000000..0fffd912d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
@@ -0,0 +1,8 @@
+#ifndef WCPCPY
+# define WCPCPY	__wcpcpy_avx2
+#endif
+
+#define USE_AS_STPCPY
+#define USE_AS_WCSCPY
+#define STRCPY	WCPCPY
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-evex.S b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
new file mode 100644
index 0000000000..ac6429cc07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
@@ -0,0 +1,8 @@
+#ifndef WCPCPY
+# define WCPCPY	__wcpcpy_evex
+#endif
+
+#define USE_AS_STPCPY
+#define USE_AS_WCSCPY
+#define STRCPY	WCPCPY
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
new file mode 100644
index 0000000000..0ba29b081f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
@@ -0,0 +1,27 @@
+/* wcpcpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation so that the ifunc
+   selector is able to fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (2)
+
+# define WCPCPY __wcpcpy_generic
+# include <wcsmbs/wcpcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpcpy.c b/sysdeps/x86_64/multiarch/wcpcpy.c
new file mode 100644
index 0000000000..8f96ddbc99
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcpcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcpcpy __redirect_wcpcpy
+# include <wchar.h>
+# undef __wcpcpy
+
+# define SYMBOL_NAME wcpcpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcpcpy, __wcpcpy, IFUNC_SELECTOR ());
+weak_alias (__wcpcpy, wcpcpy)
+# ifdef SHARED
+__hidden_ver1 (__wcpcpy, __GI___wcpcpy, __redirect_wcpcpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpcpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
new file mode 100644
index 0000000000..80600d6b01
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCPNCPY	__wcpncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcpncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
new file mode 100644
index 0000000000..b7e594f7b7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
@@ -0,0 +1,8 @@
+#ifndef WCPNCPY
+# define WCPNCPY	__wcpncpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STPCPY
+#define STRNCPY	WCPNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-evex.S b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
new file mode 100644
index 0000000000..62ddb694fe
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
@@ -0,0 +1,8 @@
+#ifndef WCPNCPY
+# define WCPNCPY	__wcpncpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STPCPY
+#define STRNCPY	WCPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
new file mode 100644
index 0000000000..4aab4ecdd2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
@@ -0,0 +1,27 @@
+/* wcpncpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation so that the ifunc
+   selector is able to fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (2)
+
+# define WCPNCPY __wcpncpy_generic
+# include <wcsmbs/wcpncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpncpy.c b/sysdeps/x86_64/multiarch/wcpncpy.c
new file mode 100644
index 0000000000..ed8f307e07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcpncpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcpncpy __redirect_wcpncpy
+# include <wchar.h>
+# undef __wcpncpy
+
+# define SYMBOL_NAME wcpncpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcpncpy, __wcpncpy, IFUNC_SELECTOR ());
+weak_alias (__wcpncpy, wcpncpy)
+# ifdef SHARED
+__hidden_ver1 (__wcpncpy, __GI___wcpncpy, __redirect_wcpncpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpncpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
new file mode 100644
index 0000000000..e99449a2dc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSCAT	__wcscat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcscat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2.S b/sysdeps/x86_64/multiarch/wcscat-avx2.S
new file mode 100644
index 0000000000..a20f23c09d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-avx2.S
@@ -0,0 +1,10 @@
+#ifndef WCSCAT
+# define WCSCAT	__wcscat_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRCPY	WCSCAT
+
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-evex.S b/sysdeps/x86_64/multiarch/wcscat-evex.S
new file mode 100644
index 0000000000..1d017e4899
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-evex.S
@@ -0,0 +1,9 @@
+#ifndef WCSCAT
+# define WCSCAT	__wcscat_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRCPY	WCSCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
new file mode 100644
index 0000000000..6476f85bbb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
@@ -0,0 +1,27 @@
+/* wcscat.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation so that the ifunc
+   selector is able to fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (2)
+
+# define WCSCAT __wcscat_generic
+# include <wcsmbs/wcscat.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscat.c b/sysdeps/x86_64/multiarch/wcscat.c
new file mode 100644
index 0000000000..3277c44561
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcscat.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcscat __redirect_wcscat
+# include <wchar.h>
+# undef __wcscat
+
+# define SYMBOL_NAME wcscat
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcscat, __wcscat, IFUNC_SELECTOR ());
+weak_alias (__wcscat, wcscat)
+# ifdef SHARED
+__hidden_ver1 (__wcscat, __GI___wcscat, __redirect_wcscat)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcscat);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
new file mode 100644
index 0000000000..2f800c8d3e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSCPY	__wcscpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcscpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2.S b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
new file mode 100644
index 0000000000..6bc509da07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
@@ -0,0 +1,7 @@
+#ifndef WCSCPY
+# define WCSCPY	__wcscpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define STRCPY	WCSCPY
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-evex.S b/sysdeps/x86_64/multiarch/wcscpy-evex.S
new file mode 100644
index 0000000000..1069a8e224
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-evex.S
@@ -0,0 +1,7 @@
+#ifndef WCSCPY
+# define WCSCPY	__wcscpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define STRCPY	WCSCPY
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
index 93d314aaad..600d606c45 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
@@ -18,8 +18,7 @@
 
 
 #include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (1)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSCPY  __wcscpy_generic
 # include <wcsmbs/wcscpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
index 92c917b6b4..7f6387817b 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.c
+++ b/sysdeps/x86_64/multiarch/wcscpy.c
@@ -26,6 +26,11 @@
 # define SYMBOL_NAME wcscpy
 # include <init-arch.h>
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
@@ -35,6 +40,22 @@ IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load, ))
+    {
+      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !))
+	return OPTIMIZE (avx2);
+    }
+
   if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
     return OPTIMIZE (ssse3);
 
diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
new file mode 100644
index 0000000000..609d6e69c0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSNCAT	__wcsncat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcsncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2.S b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
new file mode 100644
index 0000000000..a72105b7e9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
@@ -0,0 +1,9 @@
+#ifndef WCSNCAT
+# define WCSNCAT	__wcsncat_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRNCAT	WCSNCAT
+#include "strncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S
new file mode 100644
index 0000000000..392215950a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S
@@ -0,0 +1,9 @@
+#ifndef WCSNCAT
+# define WCSNCAT	__wcsncat_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRNCAT	WCSNCAT
+#include "strncat-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
new file mode 100644
index 0000000000..9ced02b35e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
@@ -0,0 +1,27 @@
+/* wcsncat.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation so that the ifunc
+   selector is able to fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (2)
+
+# define WCSNCAT __wcsncat_generic
+# include <wcsmbs/wcsncat.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncat.c b/sysdeps/x86_64/multiarch/wcsncat.c
new file mode 100644
index 0000000000..49c46aef08
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat.c
@@ -0,0 +1,34 @@
+/* Multiple versions of wcsncat.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define wcsncat __redirect_wcsncat
+# include <wchar.h>
+# undef wcsncat
+
+# define SYMBOL_NAME wcsncat
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcsncat, wcsncat, IFUNC_SELECTOR ());
+# ifdef SHARED
+__hidden_ver1 (wcsncat, __GI_wcsncat, __redirect_wcsncat)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncat);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
new file mode 100644
index 0000000000..cab5a6b820
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSNCPY	__wcsncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcsncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
new file mode 100644
index 0000000000..3a1a8a372c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
@@ -0,0 +1,7 @@
+#ifndef WCSNCPY
+# define WCSNCPY	__wcsncpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define STRNCPY	WCSNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-evex.S b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
new file mode 100644
index 0000000000..2debb8fd6b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
@@ -0,0 +1,7 @@
+#ifndef WCSNCPY
+# define WCSNCPY	__wcsncpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define STRNCPY	WCSNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
new file mode 100644
index 0000000000..693521713b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
@@ -0,0 +1,27 @@
+/* wcsncpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as the ifunc selector
+   needs to be able to fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (2)
+
+# define WCSNCPY __wcsncpy_generic
+# include <wcsmbs/wcsncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncpy.c b/sysdeps/x86_64/multiarch/wcsncpy.c
new file mode 100644
index 0000000000..5b89dd4d27
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcsncpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcsncpy __redirect_wcsncpy
+# include <wchar.h>
+# undef __wcsncpy
+
+# define SYMBOL_NAME wcsncpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcsncpy, __wcsncpy, IFUNC_SELECTOR ());
+weak_alias (__wcsncpy, wcsncpy)
+# ifdef SHARED
+__hidden_ver1 (__wcsncpy, __GI___wcsncpy, __redirect_wcsncpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
new file mode 100644
index 0000000000..d52525f288
--- /dev/null
+++ b/sysdeps/x86_64/wcpcpy-generic.c
@@ -0,0 +1,31 @@
+/* wcpcpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpcpy non-multiarch build is split into two files,
+   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
+   for ISA level <= 3 and just includes <wcsmbs/wcpcpy.c> directly.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcpcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
new file mode 100644
index 0000000000..ec32dc070a
--- /dev/null
+++ b/sysdeps/x86_64/wcpcpy.S
@@ -0,0 +1,41 @@
+/* wcpcpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpcpy non-multiarch build is split into two files,
+   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
+   for ISA level <= 3 and just includes <wcsmbs/wcpcpy.c> directly.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCPCPY	__wcpcpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcpcpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcpcpy-avx2.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcpcpy, wcpcpy)
+libc_hidden_def (__wcpcpy)
+#endif
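
A note on the DEFAULT_IMPL_V* knobs used here and in the other new
non-multiarch dispatch files: isa-default-impl.h (an existing header,
not part of this patch) turns MINIMUM_X86_ISA_LEVEL into the choice of
exactly one of these file names and then #includes it.  Roughly, and
leaving out its sanity checks, the selection amounts to:

    #if MINIMUM_X86_ISA_LEVEL == 4
    # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V4
    #elif MINIMUM_X86_ISA_LEVEL == 3
    # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V3
    #elif MINIMUM_X86_ISA_LEVEL == 2
    # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V2
    #else
    # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V1
    #endif

    #include ISA_DEFAULT_IMPL

Since this file is only compiled when MINIMUM_X86_ISA_LEVEL >= 4, the V1
placeholder can never be selected, which is why an obviously invalid
string is fine for it.
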
diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
new file mode 100644
index 0000000000..871219a445
--- /dev/null
+++ b/sysdeps/x86_64/wcpncpy-generic.c
@@ -0,0 +1,31 @@
+/* wcpncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpncpy non-multiarch build is split into two files,
+   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
+   for ISA level <= 3 and just includes <wcsmbs/wcpncpy.c> directly.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcpncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
new file mode 100644
index 0000000000..68e6ff1836
--- /dev/null
+++ b/sysdeps/x86_64/wcpncpy.S
@@ -0,0 +1,41 @@
+/* wcpncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpncpy non-multiarch build is split into two files,
+   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
+   for ISA level <= 3 and just includes <wcsmbs/wcpncpy.c> directly.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCPNCPY	__wcpncpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcpncpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcpncpy-avx2.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcpncpy, wcpncpy)
+libc_hidden_def (__wcpncpy)
+#endif
diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
new file mode 100644
index 0000000000..85f981a81f
--- /dev/null
+++ b/sysdeps/x86_64/wcscat-generic.c
@@ -0,0 +1,31 @@
+/* wcscat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcscat non-multiarch build is split into two files,
+   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
+   for ISA level <= 3 and just includes <wcsmbs/wcscat.c> directly.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcscat.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
new file mode 100644
index 0000000000..007de3c40c
--- /dev/null
+++ b/sysdeps/x86_64/wcscat.S
@@ -0,0 +1,41 @@
+/* wcscat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcscat non-multiarch build is split into two files,
+   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
+   for ISA level <= 3 and just includes <wcsmbs/wcscat.c> directly.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSCAT	__wcscat
+
+# define DEFAULT_IMPL_V4	"multiarch/wcscat-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcscat-avx2.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcscat, wcscat)
+libc_hidden_def (__wcscat)
+#endif
diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
index 11d0bb4bab..ab9288ed74 100644
--- a/sysdeps/x86_64/wcscpy.S
+++ b/sysdeps/x86_64/wcscpy.S
@@ -28,6 +28,8 @@
 
 # define WCSCPY	__wcscpy
 
+# define DEFAULT_IMPL_V4	"multiarch/wcscpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcscpy-avx2.S"
 # define DEFAULT_IMPL_V2	"multiarch/wcscpy-ssse3.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
new file mode 100644
index 0000000000..2cc0f7b11a
--- /dev/null
+++ b/sysdeps/x86_64/wcsncat-generic.c
@@ -0,0 +1,31 @@
+/* wcsncat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncat non-multiarch build is split into two files,
+   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
+   for ISA level <= 3 and just includes <wcsmbs/wcsncat.c> directly.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcsncat.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
new file mode 100644
index 0000000000..3f4c7948db
--- /dev/null
+++ b/sysdeps/x86_64/wcsncat.S
@@ -0,0 +1,39 @@
+/* wcsncat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncat non-multiarch build is split into two files,
+   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
+   for ISA level <= 3 and just includes <wcsmbs/wcsncat.c> directly.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSNCAT	wcsncat
+
+# define DEFAULT_IMPL_V4	"multiarch/wcsncat-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcsncat-avx2.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+#endif
diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
new file mode 100644
index 0000000000..49d06b8ae8
--- /dev/null
+++ b/sysdeps/x86_64/wcsncpy-generic.c
@@ -0,0 +1,31 @@
+/* wcsncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncpy non-multiarch build is split into two files,
+   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
+   for ISA level <= 3 and just includes <wcsmbs/wcsncpy.c> directly.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcsncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
new file mode 100644
index 0000000000..e1428fd4c1
--- /dev/null
+++ b/sysdeps/x86_64/wcsncpy.S
@@ -0,0 +1,41 @@
+/* wcsncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncpy non-multiarch build is split into two files,
+   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
+   for ISA level <= 3 and just includes <wcsmbs/wcsncpy.c> directly.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSNCPY	__wcsncpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcsncpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcsncpy-avx2.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcsncpy, wcsncpy)
+libc_hidden_def (__wcsncpy)
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json
  2022-11-04  8:20 ` [PATCH v2 " Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-11-04  8:20   ` [PATCH v2 4/4] x86: Add optimized functions for the wide-character strcpy family Noah Goldstein
@ 2022-11-04 16:26   ` H.J. Lu
  3 siblings, 0 replies; 42+ messages in thread
From: H.J. Lu @ 2022-11-04 16:26 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Nov 4, 2022 at 1:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Json output is easier to parse and most other benchmarks already do
> the same.
> ---
>  benchtests/bench-strcat.c  | 106 ++++++++++++++++++++++++++-------
>  benchtests/bench-strcpy.c  | 118 ++++++++++++++++++++++++-------------
>  benchtests/bench-strncat.c |  94 +++++++++++++++++++++--------
>  benchtests/bench-strncpy.c |  94 +++++++++++++++++++++--------
>  4 files changed, 297 insertions(+), 115 deletions(-)
>
> diff --git a/benchtests/bench-strcat.c b/benchtests/bench-strcat.c
> index 749318e37e..890b34b4c1 100644
> --- a/benchtests/bench-strcat.c
> +++ b/benchtests/bench-strcat.c
> @@ -35,6 +35,7 @@
>  # define SMALL_CHAR 1273
>  #endif /* WIDE */
>
> +#include "json-lib.h"
>
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
>
> @@ -49,7 +50,7 @@ IMPL (STRCAT, 1)
>  IMPL (generic_strcat, 0)
>
>  static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src)
>  {
>    size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS_LARGE;
>    timing_t start, stop, cur;
> @@ -64,7 +65,8 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
>
>    if (STRCMP (dst + k, src) != 0)
>      {
> -      error (0, 0, "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
> +      error (0, 0,
> +            "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
>              impl->name, dst, src);
>        ret = 1;
>        return;
> @@ -80,11 +82,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
> +        size_t len2, int max_char)
>  {
>    size_t i;
>    CHAR *s1, *s2;
> @@ -107,53 +110,112 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
>    for (i = 0; i < len2; i++)
>      s2[i] = 32 + 23 * i % (max_char - 32);
>
> -  printf ("Length %4zd/%4zd, alignment %2zd/%2zd:", len1, len2, align1, align2);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "len1", len1);
> +  json_attr_uint (json_ctx, "len2", len2);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
>      {
>        s2[len2] = '\0';
> -      do_one_test (impl, s2, s1);
> +      do_one_test (json_ctx, impl, s2, s1);
>      }
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i;
>
>    test_init ();
>
> -  printf ("%28s", "");
> +  test_init ();
> +
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
>    for (i = 0; i < 16; ++i)
>      {
> -      do_test (0, 0, i, i, SMALL_CHAR);
> -      do_test (0, 0, i, i, BIG_CHAR);
> -      do_test (0, i, i, i, SMALL_CHAR);
> -      do_test (i, 0, i, i, BIG_CHAR);
> +      do_test (&json_ctx, 0, 0, i, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, i, BIG_CHAR);
> +      do_test (&json_ctx, 0, i, i, i, SMALL_CHAR);
> +      do_test (&json_ctx, i, 0, i, i, BIG_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 0, 8 << i, 8 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
> -      do_test (0, 0, 8 << i, 2 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 8 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 8 << i, 2 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (i, 2 * i, 8 << i, 1, SMALL_CHAR);
> -      do_test (2 * i, i, 8 << i, 1, BIG_CHAR);
> -      do_test (i, i, 8 << i, 10, SMALL_CHAR);
> -      do_test (i, i, 8 << i, 10, BIG_CHAR);
> +      do_test (&json_ctx, i, 2 * i, 8 << i, 1, SMALL_CHAR);
> +      do_test (&json_ctx, 2 * i, i, 8 << i, 1, BIG_CHAR);
> +      do_test (&json_ctx, i, i, 8 << i, 10, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, 8 << i, 10, BIG_CHAR);
> +    }
> +
> +  for (i = 32; i < 256; i += 32)
> +    {
> +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
>      }
>
> +  for (; i < 512; i += 64)
> +    {
> +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
> +    }
> +
> +  for (; i < 1024; i += 128)
> +    {
> +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
> +    }
> +
> +  for (; i < 2048; i += 256)
> +    {
> +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
>
> diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
> index 29deb8a46a..af8673e137 100644
> --- a/benchtests/bench-strcpy.c
> +++ b/benchtests/bench-strcpy.c
> @@ -26,16 +26,18 @@
>  # define SMALL_CHAR 127
>  #endif
>
> +#include "json-lib.h"
> +
>  #ifndef STRCPY_RESULT
>  # define STRCPY_RESULT(dst, len) dst
>  # define TEST_MAIN
>  # ifndef WIDE
> -#  define TEST_NAME "strcpy"
> +#   define TEST_NAME "strcpy"
>  # else
> -#  define TEST_NAME "wcscpy"
> -#  define generic_strcpy generic_wcscpy
> +#   define TEST_NAME "wcscpy"
> +#   define generic_strcpy generic_wcscpy
>  # endif
> -#include "bench-string.h"
> +# include "bench-string.h"
>
>  CHAR *
>  generic_strcpy (CHAR *dst, const CHAR *src)
> @@ -51,8 +53,8 @@ IMPL (generic_strcpy, 0)
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
>
>  static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
> -            size_t len __attribute__((unused)))
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
> +            size_t len __attribute__ ((unused)))
>  {
>    size_t i, iters = INNER_LOOP_ITERS;
>    timing_t start, stop, cur;
> @@ -77,23 +79,24 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
>    TIMING_NOW (start);
>    for (i = 0; i < iters; ++i)
>      {
> -         CALL (impl, dst, src);
> +      CALL (impl, dst, src);
>      }
>    TIMING_NOW (stop);
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t len, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
> +        int max_char)
>  {
>    size_t i;
>    CHAR *s1, *s2;
> -/* For wcscpy: align1 and align2 here mean alignment not in bytes,
> -   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> -   len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
> +  /* For wcscpy: align1 and align2 here mean alignment not in bytes,
> +     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> +     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
>    align1 &= 7;
>    if ((align1 + len) * sizeof (CHAR) >= page_size)
>      return;
> @@ -109,65 +112,96 @@ do_test (size_t align1, size_t align2, size_t len, int max_char)
>      s1[i] = 32 + 23 * i % (max_char - 32);
>    s1[len] = 0;
>
> -  printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len,
> -         align1 * sizeof (CHAR), align2 * sizeof (CHAR));
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "len", len);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, s2, s1, len);
> +    do_one_test (json_ctx, impl, s2, s1, len);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i;
>
>    test_init ();
>
> -  printf ("%23s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
>    for (i = 0; i < 16; ++i)
>      {
> -      do_test (0, 0, i, SMALL_CHAR);
> -      do_test (0, 0, i, BIG_CHAR);
> -      do_test (0, i, i, SMALL_CHAR);
> -      do_test (i, 0, i, BIG_CHAR);
> +      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, BIG_CHAR);
> +      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
> +      do_test (&json_ctx, i, 0, i, BIG_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 0, 8 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, SMALL_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (i, 2 * i, 8 << i, SMALL_CHAR);
> -      do_test (2 * i, i, 8 << i, BIG_CHAR);
> -      do_test (i, i, 8 << i, SMALL_CHAR);
> -      do_test (i, i, 8 << i, BIG_CHAR);
> +      do_test (&json_ctx, i, 2 * i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 2 * i, i, 8 << i, BIG_CHAR);
> +      do_test (&json_ctx, i, i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, 8 << i, BIG_CHAR);
>      }
>
> -  for (i = 16; i <= 512; i+=4)
> +  for (i = 16; i <= 512; i += 4)
>      {
> -      do_test (0, 4, i, SMALL_CHAR);
> -      do_test (4, 0, i, BIG_CHAR);
> -      do_test (4, 4, i, SMALL_CHAR);
> -      do_test (2, 2, i, BIG_CHAR);
> -      do_test (2, 6, i, SMALL_CHAR);
> -      do_test (6, 2, i, BIG_CHAR);
> -      do_test (1, 7, i, SMALL_CHAR);
> -      do_test (7, 1, i, BIG_CHAR);
> -      do_test (3, 4, i, SMALL_CHAR);
> -      do_test (4, 3, i, BIG_CHAR);
> -      do_test (5, 7, i, SMALL_CHAR);
> -      do_test (7, 5, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 4, i, SMALL_CHAR);
> +      do_test (&json_ctx, 4, 0, i, BIG_CHAR);
> +      do_test (&json_ctx, 4, 4, i, SMALL_CHAR);
> +      do_test (&json_ctx, 2, 2, i, BIG_CHAR);
> +      do_test (&json_ctx, 2, 6, i, SMALL_CHAR);
> +      do_test (&json_ctx, 6, 2, i, BIG_CHAR);
> +      do_test (&json_ctx, 1, 7, i, SMALL_CHAR);
> +      do_test (&json_ctx, 7, 1, i, BIG_CHAR);
> +      do_test (&json_ctx, 3, 4, i, SMALL_CHAR);
> +      do_test (&json_ctx, 4, 3, i, BIG_CHAR);
> +      do_test (&json_ctx, 5, 7, i, SMALL_CHAR);
> +      do_test (&json_ctx, 7, 5, i, SMALL_CHAR);
>      }
>
> +  for (i = 1; i < 2048; i += i)
> +    {
> +      do_test (&json_ctx, 1, 0, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, SMALL_CHAR);
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
>
> diff --git a/benchtests/bench-strncat.c b/benchtests/bench-strncat.c
> index b148c55279..5ccc09a4f8 100644
> --- a/benchtests/bench-strncat.c
> +++ b/benchtests/bench-strncat.c
> @@ -33,6 +33,8 @@
>  # define SMALL_CHAR 1273
>  #endif /* WIDE */
>
> +#include "json-lib.h"
> +
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
>
>  CHAR *
> @@ -49,7 +51,8 @@ IMPL (STRNCAT, 2)
>  IMPL (generic_strncat, 0)
>
>  static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
> +            size_t n)
>  {
>    size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS8;
>    timing_t start, stop, cur;
> @@ -65,8 +68,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
>    size_t len = STRLEN (src);
>    if (MEMCMP (dst + k, src, len + 1 > n ? n : len + 1) != 0)
>      {
> -      error (0, 0, "Incorrect concatenation in function %s",
> -            impl->name);
> +      error (0, 0, "Incorrect concatenation in function %s", impl->name);
>        ret = 1;
>        return;
>      }
> @@ -88,12 +90,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t len1, size_t len2,
> -        size_t n, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
> +        size_t len2, size_t n, int max_char)
>  {
>    size_t i;
>    CHAR *s1, *s2;
> @@ -118,53 +120,93 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
>    for (i = 0; i < len2; i++)
>      s2[i] = 32 + 23 * i % (max_char - 32);
>
> -  printf ("Length %4zd/%4zd, alignment %2zd/%2zd, N %4zd:",
> -         len1, len2, align1, align2, n);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "len1", len1);
> +  json_attr_uint (json_ctx, "len2", len2);
> +  json_attr_uint (json_ctx, "n", n);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
>      {
>        s2[len2] = '\0';
> -      do_one_test (impl, s2, s1, n);
> +      do_one_test (json_ctx, impl, s2, s1, n);
>      }
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i, n;
>
>    test_init ();
>
> -  printf ("%28s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
> -  for (n = 2; n <= 2048; n*=4)
> +  for (n = 2; n <= 2048; n *= 4)
>      {
> -      do_test (0, 2, 2, 2, n, SMALL_CHAR);
> -      do_test (0, 0, 4, 4, n, SMALL_CHAR);
> -      do_test (4, 0, 4, 4, n, BIG_CHAR);
> -      do_test (0, 0, 8, 8, n, SMALL_CHAR);
> -      do_test (0, 8, 8, 8, n, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 2, 2, 2, n, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 4, 4, n, SMALL_CHAR);
> +      do_test (&json_ctx, 4, 0, 4, 4, n, BIG_CHAR);
> +      do_test (&json_ctx, 0, 0, 8, 8, n, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 8, 8, 8, n, SMALL_CHAR);
>
>        for (i = 1; i < 8; ++i)
>         {
> -         do_test (0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
> -         do_test (8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
> -         do_test (0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
> -         do_test (8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
>         }
>
>        for (i = 1; i < 8; ++i)
>         {
> -         do_test (i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
> -         do_test (2 * i, i, 8 << i, 1, n, BIG_CHAR);
> -         do_test (i, i, 8 << i, 10, n, SMALL_CHAR);
> +         do_test (&json_ctx, i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
> +         do_test (&json_ctx, 2 * i, i, 8 << i, 1, n, BIG_CHAR);
> +         do_test (&json_ctx, i, i, 8 << i, 10, n, SMALL_CHAR);
>         }
>      }
>
> +  for (i = 128; i < 2048; i += i)
> +    {
> +      for (n = i - 64; n <= i + 64; n += 32)
> +       {
> +         do_test (&json_ctx, 1, 0, i, i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 0, i, i, i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, i, i, n, SMALL_CHAR);
> +         do_test (&json_ctx, i, i, i, i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 1, 0, i, n, i, SMALL_CHAR);
> +         do_test (&json_ctx, 0, i, i, n, i, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, i, n, i, SMALL_CHAR);
> +         do_test (&json_ctx, i, i, i, n, i, SMALL_CHAR);
> +       }
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
> diff --git a/benchtests/bench-strncpy.c b/benchtests/bench-strncpy.c
> index 8207d99f4d..f621cbfe09 100644
> --- a/benchtests/bench-strncpy.c
> +++ b/benchtests/bench-strncpy.c
> @@ -24,6 +24,8 @@
>  # define SMALL_CHAR 127
>  #endif /* !WIDE */
>
> +#include "json-lib.h"
> +
>  #ifndef STRNCPY_RESULT
>  # define STRNCPY_RESULT(dst, len, n) dst
>  # define TEST_MAIN
> @@ -52,7 +54,8 @@ IMPL (generic_strncpy, 0)
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
>
>  static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
> +            size_t len, size_t n)
>  {
>    size_t i, iters = INNER_LOOP_ITERS_LARGE * (4 / CHARBYTES);
>    timing_t start, stop, cur;
> @@ -77,7 +80,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
>        size_t i;
>
>        for (i = len; i < n; ++i)
> -       if (dst [i] != '\0')
> +       if (dst[i] != '\0')
>           {
>             error (0, 0, "Wrong result in function %s", impl->name);
>             ret = 1;
> @@ -94,17 +97,18 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
> +        size_t n, int max_char)
>  {
>    size_t i;
>    CHAR *s1, *s2;
>
> -/* For wcsncpy: align1 and align2 here mean alignment not in bytes,
> -   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
> +  /* For wcsncpy: align1 and align2 here mean alignment not in bytes,
> +     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
>    align1 &= 7;
>    if ((align1 + len) * sizeof (CHAR) >= page_size)
>      return;
> @@ -123,46 +127,86 @@ do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
>         ++i)
>      s1[i] = 32 + 32 * i % (max_char - 32);
>
> -  printf ("Length %4zd, n %4zd, alignment %2zd/%2zd:", len, n, align1, align2);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "len", len);
> +  json_attr_uint (json_ctx, "n", n);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, s2, s1, len, n);
> +    do_one_test (json_ctx, impl, s2, s1, len, n);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  static int
>  test_main (void)
>  {
> -  size_t i;
> +  json_ctx_t json_ctx;
> +  size_t i, j;
>
>    test_init ();
>
> -  printf ("%28s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (i, i, 16, 16, SMALL_CHAR);
> -      do_test (i, i, 16, 16, BIG_CHAR);
> -      do_test (i, 2 * i, 16, 16, SMALL_CHAR);
> -      do_test (2 * i, i, 16, 16, BIG_CHAR);
> -      do_test (8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
> -      do_test (2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
> -      do_test (2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
> +      do_test (&json_ctx, i, i, 16, 16, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, 16, 16, BIG_CHAR);
> +      do_test (&json_ctx, i, 2 * i, 16, 16, SMALL_CHAR);
> +      do_test (&json_ctx, 2 * i, i, 16, 16, BIG_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
> +      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 0, 4 << i, 8 << i, SMALL_CHAR);
> -      do_test (0, 0, 16 << i, 8 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 4 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 16 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
>      }
>
> +  for (i = 128; i < 2048; i += i)
> +    {
> +      for (j = i - 64; j <= i + 64; j += 32)
> +       {
> +         do_test (&json_ctx, 1, 0, i, j, SMALL_CHAR);
> +         do_test (&json_ctx, 0, i, i, j, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, i, j, SMALL_CHAR);
> +         do_test (&json_ctx, i, i, i, j, SMALL_CHAR);
> +         do_test (&json_ctx, 1, 0, j, i, SMALL_CHAR);
> +         do_test (&json_ctx, 0, i, j, i, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, j, i, SMALL_CHAR);
> +         do_test (&json_ctx, i, i, j, i, SMALL_CHAR);
> +       }
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
>
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-04  8:20   ` [PATCH v2 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
@ 2022-11-04 16:33     ` H.J. Lu
  2022-11-04 20:20       ` Noah Goldstein
  0 siblings, 1 reply; 42+ messages in thread
From: H.J. Lu @ 2022-11-04 16:33 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Nov 4, 2022 at 1:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>     1. Use more overlapping stores to avoid branches.
>     2. Reduce how unrolled the aligning copies are (this is more of a
>        code-size save, it's a negative for some sizes in terms of
>        perf).
>     3. Improve the loop a bit (similar to what we do in strlen with
>        2x vpminu + kortest instead of 3x vpminu + kmov + test).
>     4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
>        number that are taken.
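
To make optimization (1) above concrete: the overlapping-stores trick
covers a whole size class with straight-line code by letting the final
vector store overlap an earlier one, so there is no branch on the exact
length.  A minimal stand-alone sketch of the idea in C intrinsics
(illustrative only; the function name and the 33..64 byte size class are
assumptions, and the real code additionally has to locate the null
terminator):

    #include <immintrin.h>
    #include <stddef.h>

    /* Copy LEN bytes, 32 < LEN <= 64, using two 32-byte stores that may
       overlap in the middle instead of branching on the exact length.  */
    static void
    copy_33_to_64 (char *dst, const char *src, size_t len)
    {
      __m256i head = _mm256_loadu_si256 ((const __m256i *) src);
      __m256i tail = _mm256_loadu_si256 ((const __m256i *) (src + len - 32));
      _mm256_storeu_si256 ((__m256i *) dst, head);
      _mm256_storeu_si256 ((__m256i *) (dst + len - 32), tail);
    }
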
>
> Performance Changes:
>
>     Times are from N = 10 runs of the benchmark suite and are
>     reported as geometric mean of all ratios of
>     New Implementation / Old Implementation.
>
>     stpcpy-evex      -> 0.922
>     strcat-evex      -> 0.985
>     strcpy-evex      -> 0.880
>
>     strncpy-evex     -> 0.831
>     stpncpy-evex     -> 0.780
>
>     strncat-evex     -> 0.958
>
> Code Size Changes:
>     function         -> Bytes New / Bytes Old -> Ratio
>
>     strcat-evex      -> 819  / 1874 -> 0.437
>     strcpy-evex      -> 700  / 1074 -> 0.652
>     stpcpy-evex      -> 735  / 1094 -> 0.672
>
>     strncpy-evex     -> 1397 / 2611 -> 0.535
>     stpncpy-evex     -> 1489 / 2691 -> 0.553
>
>     strncat-evex     -> 1184 / 2832 -> 0.418
>
> Notes:
>     1. Because of the significant difference between the
>        implementations they are split into three files.
>
>            strcpy-evex.S    -> strcpy, stpcpy, strcat
>            strncpy-evex.S   -> strncpy
>            strncat-evex.S   -> strncat
>
>        I couldn't find a way to merge them without making the
>        ifdefs incredibly difficult to follow.
>
>     2. All implementations can be made evex512 by including
>        "x86-evex512-vecs.h" at the top.
>
>     3. All implementations have an optional define:
>         `USE_EVEX_MASKED_STORE`
>        Setting it to one uses evex-masked stores for handling short
>        strings.  This saves code size and branches.  It's disabled
>        for all implementations at the moment as there are some
>        serious drawbacks to masked stores in certain cases, but
>        that may be fixed on future architectures.
>
> Full check passes on x86-64 and build succeeds for all ISA levels w/
> and w/o multiarch.
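
Two quick illustrations of notes (2) and (3); neither snippet is taken
from the patch itself.  For note (2), a 512-bit flavor would be just
another thin wrapper in the style of the existing *-avx2-rtm files,
e.g. a hypothetical strncpy-evex512.S:

    #define STRNCPY __strncpy_evex512
    #include "x86-evex512-vecs.h"
    #include "strncpy-evex.S"

For note (3), the masked-store path boils down to writing the first N
bytes of a vector in one store while leaving the rest of the destination
untouched.  In C intrinsics terms, roughly:

    #include <immintrin.h>

    /* Store only the first N bytes (N <= 32) of V, without touching any
       byte at or beyond DST + N.  */
    static void
    store_head_masked (char *dst, __m256i v, unsigned int n)
    {
      __mmask32 m = (__mmask32) _bzhi_u32 (0xffffffffU, n); /* low N bits */
      _mm256_mask_storeu_epi8 (dst, m, v);
    }
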
> ---
>  sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
>  sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
>  sysdeps/x86_64/multiarch/strcat-strlen-evex.S |   85 ++
>  sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
>  sysdeps/x86_64/multiarch/strncat-evex.S       |  525 ++++++-
>  sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
>  .../multiarch/strncpy-or-cat-overflow-def.h   |   65 +
>  7 files changed, 2075 insertions(+), 1173 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
>
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> index 99ea76a372..3693491baa 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> @@ -3,6 +3,5 @@
>  #endif
>
>  #define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY STPNCPY
> -#include "strcpy-evex.S"
> +#define STRNCPY        STPNCPY
> +#include "strncpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
> index 0e2df947e9..b4207b7889 100644
> --- a/sysdeps/x86_64/multiarch/strcat-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcat-evex.S
> @@ -1,286 +1,7 @@
> -/* strcat with 256-bit EVEX instructions.
> -   Copyright (C) 2021-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (4)
> -
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_evex
> -# endif
> -
> -# define VMOVU         vmovdqu64
> -# define VMOVA         vmovdqa64
> -
> -/* zero register */
> -# define XMMZERO       xmm16
> -# define YMMZERO       ymm16
> -# define YMM0          ymm17
> -# define YMM1          ymm18
> -
> -# define USE_AS_STRCAT
> -
> -/* Number of bytes in a vector register */
> -# define VEC_SIZE      32
> -
> -       .section .text.evex,"ax",@progbits
> -ENTRY (STRCAT)
> -       mov     %rdi, %r9
> -# ifdef USE_AS_STRNCAT
> -       mov     %rdx, %r8
> -# endif
> -
> -       xor     %eax, %eax
> -       mov     %edi, %ecx
> -       and     $((VEC_SIZE * 4) - 1), %ecx
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> -       cmp     $(VEC_SIZE * 3), %ecx
> -       ja      L(fourth_vector_boundary)
> -       vpcmpb  $0, (%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_first_vector)
> -       mov     %rdi, %rax
> -       and     $-VEC_SIZE, %rax
> -       jmp     L(align_vec_size_start)
> -L(fourth_vector_boundary):
> -       mov     %rdi, %rax
> -       and     $-VEC_SIZE, %rax
> -       vpcmpb  $0, (%rax), %YMMZERO, %k0
> -       mov     $-1, %r10d
> -       sub     %rax, %rcx
> -       shl     %cl, %r10d
> -       kmovd   %k0, %edx
> -       and     %r10d, %edx
> -       jnz     L(exit)
> -
> -L(align_vec_size_start):
> -       vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -       add     $(VEC_SIZE * 4), %rax
> -       kmovd   %k4, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -       kmovd   %k4, %edx
> -       add     $(VEC_SIZE * 4), %rax
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -       add     $(VEC_SIZE * 4), %rax
> -       kmovd   %k4, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -       add     $(VEC_SIZE * 5), %rax
> -       kmovd   %k4, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> -       add     $VEC_SIZE, %rax
> -       kmovd   %k0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> -       add     $VEC_SIZE, %rax
> -       kmovd   %k0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k1
> -       add     $VEC_SIZE, %rax
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       add     $VEC_SIZE, %rax
> -
> -       .p2align 4
> -L(align_four_vec_loop):
> -       VMOVA   (%rax), %YMM0
> -       VMOVA   (VEC_SIZE * 2)(%rax), %YMM1
> -       vpminub VEC_SIZE(%rax), %YMM0, %YMM0
> -       vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
> -       vpminub %YMM0, %YMM1, %YMM0
> -       /* If K0 != 0, there is a null byte.  */
> -       vpcmpb  $0, %YMM0, %YMMZERO, %k0
> -       add     $(VEC_SIZE * 4), %rax
> -       ktestd  %k0, %k0
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
> -       sub     $(VEC_SIZE * 5), %rax
> -       kmovd   %k0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 4), %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit):
> -       sub     %rdi, %rax
> -L(exit_null_on_first_vector):
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_second_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $VEC_SIZE, %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_third_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 2), %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_fourth_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 3), %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_fifth_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 4), %rax
> -
> -       .p2align 4
> -L(StartStrcpyPart):
> -       lea     (%r9, %rax), %rdi
> -       mov     %rsi, %rcx
> -       mov     %r9, %rax      /* save result */
> -
> -# ifdef USE_AS_STRNCAT
> -       test    %r8, %r8
> -       jz      L(ExitZero)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-evex.S"
> +#ifndef STRCAT
> +# define STRCAT        __strcat_evex
>  #endif
> +
> +#define USE_AS_STRCAT
> +#define STRCPY STRCAT
> +#include "strcpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> new file mode 100644
> index 0000000000..9bc777c339
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S

The copyright notice is missing from this new file.

> @@ -0,0 +1,85 @@
> +    /* Simple strlen implementation that ends at L(strcat_strlen_done).  */
> +       vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> +       movq    %rdi, %r8
> +       andq    $(VEC_SIZE * -1), %r8
> +       VPCMPEQ (%r8), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +#ifdef USE_AS_WCSCPY
> +       subl    %r8d, %edi
> +       shrl    $2, %edi
> +#endif
> +       shrx    %VRDI, %VRCX, %VRCX
> +#ifdef USE_AS_WCSCPY
> +       movq    %rax, %rdi
> +#endif
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v0)
> +
> +
> +       VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +       leaq    (VEC_SIZE)(%r8), %rdi
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v0)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v1)
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v2)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v3)
> +
> +       andq    $-(VEC_SIZE * 4), %rdi
> +       .p2align 4,, 8
> +L(loop_2x_vec):
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(0)
> +       VPMIN   (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
> +       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(2)
> +       VPMIN   (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
> +       VPTESTN %VMM(1), %VMM(1), %k1
> +       VPTESTN %VMM(3), %VMM(3), %k3
> +       subq    $(VEC_SIZE * -4), %rdi
> +       KORTEST %k1, %k3
> +       jz      L(loop_2x_vec)
> +
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v0)
> +
> +       KMOV    %k1, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v1)
> +
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(bsf_and_done_v2)
> +
> +       KMOV    %k3, %VRCX
> +L(bsf_and_done_v3):
> +       addq    $VEC_SIZE, %rdi
> +L(bsf_and_done_v2):
> +       bsf     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
> +       jmp     L(strcat_strlen_done)
> +
> +       .p2align 4,, 4
> +L(bsf_and_done_v1):
> +       addq    $VEC_SIZE, %rdi
> +L(bsf_and_done_v0):
> +       bsf     %VRCX, %VRCX
> +#ifdef USE_AS_WCSCPY
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +#else
> +       addq    %rcx, %rdi
> +#endif
> +L(strcat_strlen_done):
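
For readers following along: as far as I can tell this fragment is just
an inlined strlen that leaves %rdi pointing at the null terminator of
dst (the saved return value is untouched), so the copy code can simply
append.  A rough, hedged C equivalent of the byte variant:

    /* Sketch only: what strcat-strlen-evex.S computes before falling
       through at L(strcat_strlen_done).  The vectorized version does
       the same walk VEC_SIZE bytes at a time, starting from an aligned
       first read.  */
    static inline char *
    strcat_find_end (char *dst)
    {
      while (*dst != '\0')
        dst++;
      return dst;   /* execution continues at L(strcat_strlen_done) */
    }
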
> diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
> index 82e45ac675..1ba0195ed2 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
> @@ -1,4 +1,4 @@
> -/* strcpy with 256-bit EVEX instructions.
> +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
>     Copyright (C) 2021-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
> @@ -17,990 +17,526 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #include <isa-level.h>
> -
>  #if ISA_SHOULD_BUILD (4)
>
>
> -# ifndef USE_AS_STRCAT
> -#  include <sysdep.h>
> +       /* Use evex-masked stores for small sizes. Turned off at the
> +          moment.  */
> +# define USE_EVEX_MASKED_STORE 0
> +       /* Use movsb in page cross case to save code size.  */
> +# define USE_MOVSB_IN_PAGE_CROSS       1
>
> -#  ifndef STRCPY
> -#   define STRCPY  __strcpy_evex
> -#  endif
> +# include <sysdep.h>
>
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
>  # endif
>
> -# define VMOVU         vmovdqu64
> -# define VMOVA         vmovdqa64
> -
> -/* Number of bytes in a vector register */
> -# ifndef VEC_SIZE
> -#  define VEC_SIZE     32
> +# ifndef STRCPY
> +#  define STRCPY       __strcpy_evex
>  # endif
>
> -# define XMM2          xmm18
> -# define XMM3          xmm19
>
> -# define YMM2          ymm18
> -# define YMM3          ymm19
> -# define YMM4          ymm20
> -# define YMM5          ymm21
> -# define YMM6          ymm22
> -# define YMM7          ymm23
> +# ifdef USE_AS_WCSCPY
> +#  define VMOVU_MASK   vmovdqu32
> +#  define VPMIN        vpminud
> +#  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
> +#  define VPCMPEQ      vpcmpeqd
> +#  define CHAR_SIZE    4
>
> -# ifndef USE_AS_STRCAT
> +#  define REP_MOVS     rep movsd
>
> -/* zero register */
> -#  define XMMZERO      xmm16
> -#  define YMMZERO      ymm16
> -#  define YMM1         ymm17
> -
> -       .section .text.evex,"ax",@progbits
> -ENTRY (STRCPY)
> -#  ifdef USE_AS_STRNCPY
> -       mov     %RDX_LP, %R8_LP
> -       test    %R8_LP, %R8_LP
> -       jz      L(ExitZero)
> -#  endif
> -       mov     %rsi, %rcx
> -#  ifndef USE_AS_STPCPY
> -       mov     %rdi, %rax      /* save result */
> -#  endif
> +#  define USE_WIDE_CHAR
> +# else
> +#  define VMOVU_MASK   vmovdqu8
> +#  define VPMIN        vpminub
> +#  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
> +#  define VPCMPEQ      vpcmpeqb
> +#  define CHAR_SIZE    1
>
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> +#  define REP_MOVS     rep movsb
>  # endif
>
> -       and     $((VEC_SIZE * 4) - 1), %ecx
> -       cmp     $(VEC_SIZE * 2), %ecx
> -       jbe     L(SourceStringAlignmentLessTwoVecSize)
> -
> -       and     $-VEC_SIZE, %rsi
> -       and     $(VEC_SIZE - 1), %ecx
> -
> -       vpcmpb  $0, (%rsi), %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       shr     %cl, %rdx
> +# include "reg-macros.h"
>
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -       mov     $VEC_SIZE, %r10
> -       sub     %rcx, %r10
> -       cmp     %r10, %r8
> -#  else
> -       mov     $(VEC_SIZE + 1), %r10
> -       sub     %rcx, %r10
> -       cmp     %r10, %r8
> -#  endif
> -       jbe     L(CopyVecSizeTailCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeTail)
> -
> -       vpcmpb  $0, VEC_SIZE(%rsi), %YMMZERO, %k1
> -       kmovd   %k1, %edx
>
> -# ifdef USE_AS_STRNCPY
> -       add     $VEC_SIZE, %r10
> -       cmp     %r10, %r8
> -       jbe     L(CopyTwoVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyTwoVecSize)
> -
> -       VMOVU   (%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
> -       VMOVU   %YMM2, (%rdi)
> -
> -/* If source address alignment != destination address alignment */
> -       .p2align 4
> -L(UnalignVecSizeBoth):
> -       sub     %rcx, %rdi
> -# ifdef USE_AS_STRNCPY
> -       add     %rcx, %r8
> -       sbb     %rcx, %rcx
> -       or      %rcx, %r8
> -# endif
> -       mov     $VEC_SIZE, %rcx
> -       VMOVA   (%rsi, %rcx), %YMM2
> -       VMOVU   %YMM2, (%rdi, %rcx)
> -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> -       vpcmpb  $0, %YMM2, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $(VEC_SIZE * 3), %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec2)
> +# ifdef USE_AS_STPCPY
> +#  define END_REG      rax
>  # else
> -       jnz     L(CopyVecSize)
> +#  define END_REG      rdi, %rdx, CHAR_SIZE
>  # endif
>
> -       VMOVU   %YMM2, (%rdi, %rcx)
> -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM3
> -       vpcmpb  $0, %YMM3, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec3)
> +# ifdef USE_AS_STRCAT
> +#  define PAGE_ALIGN_REG       edx
> +#  define PAGE_ALIGN_REG_64    rdx
>  # else
> -       jnz     L(CopyVecSize)
> +#  define PAGE_ALIGN_REG       eax
> +#  define PAGE_ALIGN_REG_64    rax
>  # endif
>
> -       VMOVU   %YMM3, (%rdi, %rcx)
> -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM4
> -       vpcmpb  $0, %YMM4, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec4)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
> +# define VZERO VMM(7)
> +# define VZERO_128     VMM_128(7)
>
> -       VMOVU   %YMM4, (%rdi, %rcx)
> -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> -       vpcmpb  $0, %YMM2, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec2)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
>
> -       VMOVU   %YMM2, (%rdi, %rcx)
> -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> -       vpcmpb  $0, %YMM2, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec2)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
> -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM3
> -       VMOVU   %YMM2, (%rdi, %rcx)
> -       vpcmpb  $0, %YMM3, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec3)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
>
> -       VMOVU   %YMM3, (%rdi, %rcx)
> -       mov     %rsi, %rdx
> -       lea     VEC_SIZE(%rsi, %rcx), %rsi
> -       and     $-(VEC_SIZE * 4), %rsi
> -       sub     %rsi, %rdx
> -       sub     %rdx, %rdi
> -# ifdef USE_AS_STRNCPY
> -       lea     (VEC_SIZE * 8)(%r8, %rdx), %r8
> -# endif
> -L(UnalignedFourVecSizeLoop):
> -       VMOVA   (%rsi), %YMM4
> -       VMOVA   VEC_SIZE(%rsi), %YMM5
> -       VMOVA   (VEC_SIZE * 2)(%rsi), %YMM6
> -       VMOVA   (VEC_SIZE * 3)(%rsi), %YMM7
> -       vpminub %YMM5, %YMM4, %YMM2
> -       vpminub %YMM7, %YMM6, %YMM3
> -       vpminub %YMM2, %YMM3, %YMM2
> -       /* If K7 != 0, there is a null byte.  */
> -       vpcmpb  $0, %YMM2, %YMMZERO, %k7
> -       kmovd   %k7, %edx
> -# ifdef USE_AS_STRNCPY
> -       sub     $(VEC_SIZE * 4), %r8
> -       jbe     L(UnalignedLeaveCase2OrCase3)
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY(STRCPY)
> +# ifdef USE_AS_STRCAT
> +       movq    %rdi, %rax
> +#  include "strcat-strlen-evex.S"
>  # endif
> -       test    %edx, %edx
> -       jnz     L(UnalignedFourVecSizeLeave)
> -
> -L(UnalignedFourVecSizeLoop_start):
> -       add     $(VEC_SIZE * 4), %rdi
> -       add     $(VEC_SIZE * 4), %rsi
> -       VMOVU   %YMM4, -(VEC_SIZE * 4)(%rdi)
> -       VMOVA   (%rsi), %YMM4
> -       VMOVU   %YMM5, -(VEC_SIZE * 3)(%rdi)
> -       VMOVA   VEC_SIZE(%rsi), %YMM5
> -       vpminub %YMM5, %YMM4, %YMM2
> -       VMOVU   %YMM6, -(VEC_SIZE * 2)(%rdi)
> -       VMOVA   (VEC_SIZE * 2)(%rsi), %YMM6
> -       VMOVU   %YMM7, -VEC_SIZE(%rdi)
> -       VMOVA   (VEC_SIZE * 3)(%rsi), %YMM7
> -       vpminub %YMM7, %YMM6, %YMM3
> -       vpminub %YMM2, %YMM3, %YMM2
> -       /* If K7 != 0, there is a null byte.  */
> -       vpcmpb  $0, %YMM2, %YMMZERO, %k7
> -       kmovd   %k7, %edx
> -# ifdef USE_AS_STRNCPY
> -       sub     $(VEC_SIZE * 4), %r8
> -       jbe     L(UnalignedLeaveCase2OrCase3)
> +
> +       movl    %esi, %PAGE_ALIGN_REG
> +       andl    $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> +       ja      L(page_cross)
> +L(page_cross_continue):
> +       VMOVU   (%rsi), %VMM(0)
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +       movq    %rdi, %rax
>  # endif
> -       test    %edx, %edx
> -       jz      L(UnalignedFourVecSizeLoop_start)
>
> -L(UnalignedFourVecSizeLeave):
> -       vpcmpb  $0, %YMM4, %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeUnaligned_0)
>
> -       vpcmpb  $0, %YMM5, %YMMZERO, %k2
> -       kmovd   %k2, %ecx
> -       test    %ecx, %ecx
> -       jnz     L(CopyVecSizeUnaligned_16)
> +       /* Two short-string implementations: one with a traditional
> +          branching approach and one with masked instructions (which
> +          have the potential for dramatically bad perf if dst splits
> +          a page and is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +       VPTEST  %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +#  ifdef USE_AS_WCSCPY
> +       subl    $((1 << CHAR_PER_VEC)- 1), %VRCX
> +#  else
> +       inc     %VRCX
> +#  endif
> +       jz      L(more_1x_vec)
> +       KMOV    %VRCX, %k1
> +       KXOR    %k0, %k1, %k1
>
> -       vpcmpb  $0, %YMM6, %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeUnaligned_32)
> -
> -       vpcmpb  $0, %YMM7, %YMMZERO, %k4
> -       kmovd   %k4, %ecx
> -       bsf     %ecx, %edx
> -       VMOVU   %YMM4, (%rdi)
> -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> -       VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     (VEC_SIZE * 3)(%rdi, %rdx), %rax
> -# endif
> -       VMOVU   %YMM7, (VEC_SIZE * 3)(%rdi)
> -       add     $(VEC_SIZE - 1), %r8
> -       sub     %rdx, %r8
> -       lea     ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> -# else
> -       add     $(VEC_SIZE * 3), %rsi
> -       add     $(VEC_SIZE * 3), %rdi
> -       jmp     L(CopyVecSizeExit)
> -# endif
> +       VMOVU_MASK %VMM(0), (%rdi){%k1}
>
> -/* If source address alignment == destination address alignment */
> +#  ifdef USE_AS_STPCPY
> +       bsf     %VRCX, %VRCX
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rax
> +#  endif
> +       ret
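
Nice trick with the masked store here: since k0 has a bit per non-zero
CHAR, (k0 + 1) ^ k0 yields a mask covering everything up to and
including the first null, so the whole short copy is a single masked
store.  A scalar sketch of the byte case (this path is compiled out
while USE_EVEX_MASKED_STORE is 0):

    #include <stdint.h>

    /* Returns the k1 store mask for a 32-byte vector, or 0 if the
       vector holds no null byte.  `nonzero' is the k0 mask from
       VPTEST: bit i set iff src[i] != 0.  Sketch only.  */
    static inline uint32_t
    short_copy_mask (uint32_t nonzero)
    {
      if (nonzero + 1 == 0)            /* all 32 bytes are non-zero */
        return 0;                      /* i.e. take L(more_1x_vec)  */
      return (nonzero + 1) ^ nonzero;  /* bits 0 .. index of null   */
    }
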
>
> -L(SourceStringAlignmentLessTwoVecSize):
> -       VMOVU   (%rsi), %YMM3
> -       VMOVU   VEC_SIZE(%rsi), %YMM2
> -       vpcmpb  $0, %YMM3, %YMMZERO, %k0
> -       kmovd   %k0, %edx
> +# else
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jz      L(more_1x_vec)
>
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -       cmp     $VEC_SIZE, %r8
> +       xorl    %edx, %edx
> +       bsf     %VRCX, %VRDX
> +#  ifdef USE_AS_STPCPY
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#  endif
> +
> +       /* Use mask bits in rcx to detect which copy we need. If the low
> +          mask is zero then there must be a bit set in the upper half.
> +          I.e. if rcx != 0 and ecx == 0, then the match must be in the
> +          upper 32 bits, so we use L(copy_32_63).  */
> +#  if VEC_SIZE == 64
> +#   ifdef USE_AS_WCSCPY
> +       testb   %cl, %cl
> +#   else
> +       testl   %ecx, %ecx
> +#   endif
> +       jz      L(copy_32_63)
> +#  endif
> +
> +#  ifdef USE_AS_WCSCPY
> +       testb   $0xf, %cl
>  #  else
> -       cmp     $(VEC_SIZE + 1), %r8
> +       testw   %cx, %cx
>  #  endif
> -       jbe     L(CopyVecSizeTail1Case2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeTail1)
> +       jz      L(copy_16_31)
>
> -       VMOVU   %YMM3, (%rdi)
> -       vpcmpb  $0, %YMM2, %YMMZERO, %k0
> -       kmovd   %k0, %edx
>
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -       cmp     $(VEC_SIZE * 2), %r8
> +#  ifdef USE_AS_WCSCPY
> +       testb   $0x3, %cl
>  #  else
> -       cmp     $((VEC_SIZE * 2) + 1), %r8
> +       testb   %cl, %cl
>  #  endif
> -       jbe     L(CopyTwoVecSize1Case2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyTwoVecSize1)
> -
> -       and     $-VEC_SIZE, %rsi
> -       and     $(VEC_SIZE - 1), %ecx
> -       jmp     L(UnalignVecSizeBoth)
> +       jz      L(copy_8_15)
>
> -/*------End of main part with loops---------------------*/
>
> -/* Case1 */
> +#  ifdef USE_AS_WCSCPY
> +       vmovd   %VMM_128(0), (%rdi)
> +       /* No need to copy, we know it's zero.  */
> +       movl    $0, (%END_REG)
>
> -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> -       .p2align 4
> -L(CopyVecSize):
> -       add     %rcx, %rdi
> -# endif
> -L(CopyVecSizeTail):
> -       add     %rcx, %rsi
> -L(CopyVecSizeTail1):
> -       bsf     %edx, %edx
> -L(CopyVecSizeExit):
> -       cmp     $32, %edx
> -       jae     L(Exit32_63)
> -       cmp     $16, %edx
> -       jae     L(Exit16_31)
> -       cmp     $8, %edx
> -       jae     L(Exit8_15)
> -       cmp     $4, %edx
> -       jae     L(Exit4_7)
> -       cmp     $3, %edx
> -       je      L(Exit3)
> -       cmp     $1, %edx
> -       ja      L(Exit2)
> -       je      L(Exit1)
> -       movb    $0, (%rdi)
> -# ifdef USE_AS_STPCPY
> -       lea     (%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $1, %r8
> -       lea     1(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> -# endif
>         ret
> +#  else
>
> -       .p2align 4
> -L(CopyTwoVecSize1):
> -       add     $VEC_SIZE, %rsi
> -       add     $VEC_SIZE, %rdi
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $VEC_SIZE, %r8
> -# endif
> -       jmp     L(CopyVecSizeTail1)
> -
> -       .p2align 4
> -L(CopyTwoVecSize):
> -       bsf     %edx, %edx
> -       add     %rcx, %rsi
> -       add     $VEC_SIZE, %edx
> -       sub     %ecx, %edx
> -       jmp     L(CopyVecSizeExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnaligned_0):
> -       bsf     %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -       VMOVU   %YMM4, (%rdi)
> -       add     $((VEC_SIZE * 4) - 1), %r8
> -       sub     %rdx, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> -# else
> -       jmp     L(CopyVecSizeExit)
> -# endif
> +       testb   $0x7, %cl
> +       jz      L(copy_4_7)
>
> -       .p2align 4
> -L(CopyVecSizeUnaligned_16):
> -       bsf     %ecx, %edx
> -       VMOVU   %YMM4, (%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     VEC_SIZE(%rdi, %rdx), %rax
> -# endif
> -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> -       add     $((VEC_SIZE * 3) - 1), %r8
> -       sub     %rdx, %r8
> -       lea     (VEC_SIZE + 1)(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> -# else
> -       add     $VEC_SIZE, %rsi
> -       add     $VEC_SIZE, %rdi
> -       jmp     L(CopyVecSizeExit)
> -# endif
>
> -       .p2align 4
> -L(CopyVecSizeUnaligned_32):
> -       bsf     %edx, %edx
> -       VMOVU   %YMM4, (%rdi)
> -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     (VEC_SIZE * 2)(%rdi, %rdx), %rax
> -# endif
> -       VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> -       add     $((VEC_SIZE * 2) - 1), %r8
> -       sub     %rdx, %r8
> -       lea     ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> -# else
> -       add     $(VEC_SIZE * 2), %rsi
> -       add     $(VEC_SIZE * 2), %rdi
> -       jmp     L(CopyVecSizeExit)
> -# endif
> +       test    %edx, %edx
> +       jz      L(set_null_term)
>
> -# ifdef USE_AS_STRNCPY
> -#  ifndef USE_AS_STRCAT
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec6):
> -       VMOVU   %YMM6, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec5):
> -       VMOVU   %YMM5, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec4):
> -       VMOVU   %YMM4, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec3):
> -       VMOVU   %YMM3, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> +       /* NB: make this `vmovw` if support for AVX512-FP16 is added.
> +        */
> +       vmovd   %VMM_128(0), %esi
> +       movw    %si, (%rdi)
> +
> +       .p2align 4,, 1
> +L(set_null_term):
> +       /* No need to copy, we know it's zero.  */
> +       movb    $0, (%END_REG)
> +       ret
>  #  endif
>
> -/* Case2 */
> -
> -       .p2align 4
> -L(CopyVecSizeCase2):
> -       add     $VEC_SIZE, %r8
> -       add     %rcx, %rdi
> -       add     %rcx, %rsi
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyTwoVecSizeCase2):
> -       add     %rcx, %rsi
> -       bsf     %edx, %edx
> -       add     $VEC_SIZE, %edx
> -       sub     %ecx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -L(CopyVecSizeTailCase2):
> -       add     %rcx, %rsi
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -L(CopyVecSizeTail1Case2):
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -/* Case2 or Case3,  Case3 */
> -
> -       .p2align 4
> -L(CopyVecSizeCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyVecSizeCase2)
> -L(CopyVecSizeCase3):
> -       add     $VEC_SIZE, %r8
> -       add     %rcx, %rdi
> -       add     %rcx, %rsi
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyTwoVecSizeCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyTwoVecSizeCase2)
> -       add     %rcx, %rsi
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyVecSizeTailCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyVecSizeTailCase2)
> -       add     %rcx, %rsi
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyTwoVecSize1Case2OrCase3):
> -       add     $VEC_SIZE, %rdi
> -       add     $VEC_SIZE, %rsi
> -       sub     $VEC_SIZE, %r8
> -L(CopyVecSizeTail1Case2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyVecSizeTail1Case2)
> -       jmp     L(StrncpyExit)
> +#  if VEC_SIZE == 64
> +       .p2align 4,, 6
> +L(copy_32_63):
> +       VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +       VMOVU   %VMM_256(0), (%rdi)
> +       VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
> +       ret
> +#  endif
> +
> +
> +       .p2align 4,, 6
> +L(copy_16_31):
> +       /* Use xmm1 explicitly here as it won't require a `vzeroupper`
> +          and will save code size.  */
> +       vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +       VMOVU   %VMM_128(0), (%rdi)
> +       vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> +       ret
> +
> +       .p2align 4,, 8
> +L(copy_8_15):
> +#  ifdef USE_AS_WCSCPY
> +       movl    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> +#  else
> +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
> +#  endif
> +       vmovq   %VMM_128(0), (%rdi)
> +       movq    %rcx, -(8 - CHAR_SIZE)(%END_REG)
> +       ret
>  # endif
>
> -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
>
> -       .p2align 4
> -L(Exit1):
> -       movzwl  (%rsi), %edx
> -       mov     %dx, (%rdi)
> -# ifdef USE_AS_STPCPY
> -       lea     1(%rdi), %rax
> +# ifndef USE_AS_WCSCPY
> +       .p2align 4,, 12
> +L(copy_4_7):
> +       movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> +       vmovd   %VMM_128(0), (%rdi)
> +       movl    %ecx, -(4 - CHAR_SIZE)(%END_REG)
> +       ret
>  # endif
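
The small-size branches all rely on the same overlapping-copy idea:
with rdx holding the index of the null CHAR, one fixed-size move from
the start plus one move of the same size ending at the null covers
every length in the bucket.  A hedged scalar model of the 8-15 byte
case, where n counts the null terminator:

    #include <string.h>

    /* Two possibly-overlapping 8-byte moves cover any 8 <= n <= 15,
       null terminator included.  The other buckets do the same with
       4/16/32-byte moves.  Sketch only.  */
    static inline void
    copy_8_15 (char *dst, const char *src, size_t n)
    {
      memcpy (dst, src, 8);
      memcpy (dst + n - 8, src + n - 8, 8);
    }
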
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $2, %r8
> -       lea     2(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +
> +
> +       .p2align 4,, 8
> +L(more_1x_vec):
> +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> +       VMOVU   %VMM(0), (%rdi)
>  # endif
> -       ret
> +       subq    %rsi, %rdi
> +       andq    $-(VEC_SIZE), %rsi
> +       addq    %rsi, %rdi
> +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
>
> -       .p2align 4
> -L(Exit2):
> -       movzwl  (%rsi), %ecx
> -       mov     %cx, (%rdi)
> -       movb    $0, 2(%rdi)
> +       /* Ideally we store after moves to minimize impact of potential
> +          false-dependencies.  */
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +       VMOVU   %VMM(0), (%rax)
> +# endif
> +
> +       VPTESTN %VMM(1), %VMM(1), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x1)
> +
> +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), VEC_SIZE(%rdi)
> +
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x2)
> +
> +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +
> +       VPTESTN %VMM(3), %VMM(3), %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(ret_vec_x3)
> +
> +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       VPTESTN %VMM(4), %VMM(4), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x4)
> +
> +       VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> +
> +
> +       /* Align for 4x loop.  */
> +       subq    %rsi, %rdi
> +
> +       /* + VEC_SIZE * 5 because we never added the original VEC_SIZE
> +          that we already covered before aligning.  */
> +       subq    $-(VEC_SIZE * 5), %rsi
> +       andq    $-(VEC_SIZE * 4), %rsi
> +
> +
> +       /* Load first half of the loop before entry.  */
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       VPTESTN %VMM(6), %VMM(6), %k4
> +       KORTEST %k2, %k4
> +       jnz     L(loop_4x_done)
> +
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +
> +       subq    $(VEC_SIZE * -4), %rsi
> +
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       VPTESTN %VMM(6), %VMM(6), %k4
> +       KORTEST %k2, %k4
> +       jz      L(loop_4x_vec)
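
The loop uses the usual min-trick to check four vectors with two
tests: the element-wise unsigned minimum of two vectors has a zero
element iff either input does, and KORTEST merges the two resulting
masks.  A scalar model of one such pair test:

    #include <stdbool.h>
    #include <stddef.h>

    /* Models VPMIN + VPTESTN for one pair of vectors; the asm does
       this twice per iteration and KORTESTs the masks.  Sketch only.  */
    static bool
    pair_has_zero (const unsigned char *a, const unsigned char *b,
                   size_t vec_size)
    {
      for (size_t i = 0; i < vec_size; i++)
        if ((a[i] < b[i] ? a[i] : b[i]) == 0)   /* VPMIN element      */
          return true;                          /* VPTESTN sets a bit */
      return false;
    }
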
> +
> +L(loop_4x_done):
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       /* Restore rdi (%rdi).  */
> +       addq    %rsi, %rdi
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x0_end)
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +
> +       KMOV    %k2, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x1)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x2)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +       /* Place L(ret_vec_x4) here to save code size.  We get a
> +          meaningful benefit from doing this for stpcpy.  */
> +       KMOV    %k4, %VRDX
> +L(ret_vec_x3):
> +       bsf     %VRDX, %VRDX
> +       VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -       lea     2(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $3, %r8
> -       lea     3(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
>  # endif
> +L(return_end):
>         ret
>
> -       .p2align 4
> -L(Exit3):
> -       mov     (%rsi), %edx
> -       mov     %edx, (%rdi)
> +       .p2align 4,, 6
> +L(ret_vec_x0_end):
> +       bsf     %VRCX, %VRCX
>  # ifdef USE_AS_STPCPY
> -       lea     3(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $4, %r8
> -       lea     4(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
> +       inc     %VRCX
> +       VMOVU   (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
>         ret
>
> -       .p2align 4
> -L(Exit4_7):
> -       mov     (%rsi), %ecx
> -       mov     %ecx, (%rdi)
> -       mov     -3(%rsi, %rdx), %ecx
> -       mov     %ecx, -3(%rdi, %rdx)
> +       .p2align 4,, 8
> +L(ret_vec_x1):
> +       bsf     %VRCX, %VRCX
> +       VMOVU   (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     %rdx, %r8
> -       sub     $1, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>         ret
>
> -       .p2align 4
> -L(Exit8_15):
> -       mov     (%rsi), %rcx
> -       mov     -7(%rsi, %rdx), %r9
> -       mov     %rcx, (%rdi)
> -       mov     %r9, -7(%rdi, %rdx)
> +       .p2align 4,, 4
> +L(ret_vec_x2):
> +       bsf     %VRCX, %VRCX
> +       VMOVU   ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     %rdx, %r8
> -       sub     $1, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>         ret
>
> -       .p2align 4
> -L(Exit16_31):
> -       VMOVU   (%rsi), %XMM2
> -       VMOVU   -15(%rsi, %rdx), %XMM3
> -       VMOVU   %XMM2, (%rdi)
> -       VMOVU   %XMM3, -15(%rdi, %rdx)
> +       /* ret_vec_x3 reuses return code after the loop.  */
> +       .p2align 4,, 6
> +L(ret_vec_x4):
> +       bsf     %VRCX, %VRCX
> +       VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub %rdx, %r8
> -       sub $1, %r8
> -       lea 1(%rdi, %rdx), %rdi
> -       jnz L(StrncpyFillTailWithZero)
> +       leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>         ret
>
> -       .p2align 4
> -L(Exit32_63):
> -       VMOVU   (%rsi), %YMM2
> -       VMOVU   -31(%rsi, %rdx), %YMM3
> -       VMOVU   %YMM2, (%rdi)
> -       VMOVU   %YMM3, -31(%rdi, %rdx)
> -# ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> +
> +       .p2align 4,, 4
> +L(page_cross):
> +# ifndef USE_AS_STRCAT
> +       vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
>  # endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     %rdx, %r8
> -       sub     $1, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       movq    %rsi, %rcx
> +       andq    $(VEC_SIZE * -1), %rcx
> +
> +       VPCMPEQ (%rcx), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +# ifdef USE_AS_WCSCPY
> +       andl    $(VEC_SIZE - 1), %PAGE_ALIGN_REG
> +       shrl    $2, %PAGE_ALIGN_REG
>  # endif
> -       ret
> +       shrx    %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
>
> -# ifdef USE_AS_STRNCPY
> +# if USE_MOVSB_IN_PAGE_CROSS
> +       /* Optimizing more aggressively for space as this is very cold
> +          code. This saves 2x cache lines.  */
>
> -       .p2align 4
> -L(StrncpyExit1):
> -       movzbl  (%rsi), %edx
> -       mov     %dl, (%rdi)
> -#  ifdef USE_AS_STPCPY
> -       lea     1(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, 1(%rdi)
> +       /* This adds one to the later bsf result, which gives the
> +          correct copy bounds. NB: this can never zero out a non-zero
> +          RCX since, to be in the page-cross case, rsi cannot be
> +          aligned and we have already right-shifted rcx by the
> +          misalignment.  */
> +       shl     %VRCX
> +       jz      L(page_cross_continue)
> +#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +       movq    %rdi, %rax
>  #  endif
> -       ret
> +       bsf     %VRCX, %VRCX
> +       REP_MOVS
>
> -       .p2align 4
> -L(StrncpyExit2):
> -       movzwl  (%rsi), %edx
> -       mov     %dx, (%rdi)
>  #  ifdef USE_AS_STPCPY
> -       lea     2(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, 2(%rdi)
> +       leaq    -CHAR_SIZE(%rdi), %rax
>  #  endif
>         ret
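
For what it's worth, the rep-movsb page-cross path reads nicely in C;
the only subtle bit is the extra left shift so that bsf returns
"index of null + 1", i.e. the exact byte count for rep movsb:

    #include <stddef.h>
    #include <string.h>

    /* Sketch of the USE_MOVSB_IN_PAGE_CROSS path, byte variant.
       `mask' is the zero-byte mask of the VEC_SIZE-aligned block
       containing src, already shifted right by src's misalignment.
       Returns nonzero if the copy was finished here.  */
    static inline int
    page_cross_movsb (char *dst, const char *src, unsigned long mask)
    {
      mask <<= 1;                        /* bsf now yields index + 1  */
      if (mask == 0)
        return 0;                        /* L(page_cross_continue)    */
      size_t n = __builtin_ctzl (mask);  /* bytes to copy, null incl. */
      memcpy (dst, src, n);              /* rep movsb in the asm      */
      return 1;
    }
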
>
> -       .p2align 4
> -L(StrncpyExit3_4):
> -       movzwl  (%rsi), %ecx
> -       movzwl  -2(%rsi, %r8), %edx
> -       mov     %cx, (%rdi)
> -       mov     %dx, -2(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       ret
>
> -       .p2align 4
> -L(StrncpyExit5_8):
> -       mov     (%rsi), %ecx
> -       mov     -4(%rsi, %r8), %edx
> -       mov     %ecx, (%rdi)
> -       mov     %edx, -4(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       ret
> +# else
> +       /* Check if we found zero-char before end of page.  */
> +       test    %VRCX, %VRCX
> +       jz      L(page_cross_continue)
>
> -       .p2align 4
> -L(StrncpyExit9_16):
> -       mov     (%rsi), %rcx
> -       mov     -8(%rsi, %r8), %rdx
> -       mov     %rcx, (%rdi)
> -       mov     %rdx, -8(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       ret
> +       /* Traditional copy case, essentially the same as used in the
> +          non-page-cross case, but since we can't reuse VMM(0) we need
> +          twice as many loads from rsi.  */
>
> -       .p2align 4
> -L(StrncpyExit17_32):
> -       VMOVU   (%rsi), %XMM2
> -       VMOVU   -16(%rsi, %r8), %XMM3
> -       VMOVU   %XMM2, (%rdi)
> -       VMOVU   %XMM3, -16(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> +#  ifndef USE_AS_STRCAT
> +       xorl    %edx, %edx
>  #  endif
> -       ret
> -
> -       .p2align 4
> -L(StrncpyExit33_64):
> -       /*  0/32, 31/16 */
> -       VMOVU   (%rsi), %YMM2
> -       VMOVU   -VEC_SIZE(%rsi, %r8), %YMM3
> -       VMOVU   %YMM2, (%rdi)
> -       VMOVU   %YMM3, -VEC_SIZE(%rdi, %r8)
> +       /* Dependency on rdi must already have been satisfied.  */
> +       bsf     %VRCX, %VRDX
>  #  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#  elif !defined USE_AS_STRCAT
> +       movq    %rdi, %rax
>  #  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       ret
>
> -       .p2align 4
> -L(StrncpyExit65):
> -       /* 0/32, 32/32, 64/1 */
> -       VMOVU   (%rsi), %YMM2
> -       VMOVU   32(%rsi), %YMM3
> -       mov     64(%rsi), %cl
> -       VMOVU   %YMM2, (%rdi)
> -       VMOVU   %YMM3, 32(%rdi)
> -       mov     %cl, 64(%rdi)
> -#  ifdef USE_AS_STPCPY
> -       lea     65(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, 65(%rdi)
> +#  if VEC_SIZE == 64
> +#   ifdef USE_AS_WCSCPY
> +       testb   %cl, %cl
> +#   else
> +       test    %ecx, %ecx
> +#   endif
> +       jz      L(page_cross_copy_32_63)
>  #  endif
> -       ret
> -
> -#  ifndef USE_AS_STRCAT
>
> -       .p2align 4
> -L(Fill1):
> -       mov     %dl, (%rdi)
> -       ret
> +#  ifdef USE_AS_WCSCPY
> +       testb   $0xf, %cl
> +#  else
> +       testw   %cx, %cx
> +#  endif
> +       jz      L(page_cross_copy_16_31)
>
> -       .p2align 4
> -L(Fill2):
> -       mov     %dx, (%rdi)
> -       ret
> +#  ifdef USE_AS_WCSCPY
> +       testb   $0x3, %cl
> +#  else
> +       testb   %cl, %cl
> +#  endif
> +       jz      L(page_cross_copy_8_15)
>
> -       .p2align 4
> -L(Fill3_4):
> -       mov     %dx, (%rdi)
> -       mov     %dx, -2(%rdi, %r8)
> +#  ifdef USE_AS_WCSCPY
> +       movl    (%rsi), %esi
> +       movl    %esi, (%rdi)
> +       movl    $0, (%END_REG)
>         ret
> +#  else
>
> -       .p2align 4
> -L(Fill5_8):
> -       mov     %edx, (%rdi)
> -       mov     %edx, -4(%rdi, %r8)
> -       ret
> +       testb   $0x7, %cl
> +       jz      L(page_cross_copy_4_7)
>
> -       .p2align 4
> -L(Fill9_16):
> -       mov     %rdx, (%rdi)
> -       mov     %rdx, -8(%rdi, %r8)
> +       test    %edx, %edx
> +       jz      L(page_cross_set_null_term)
> +       movzwl  (%rsi), %ecx
> +       movw    %cx, (%rdi)
> +L(page_cross_set_null_term):
> +       movb    $0, (%END_REG)
>         ret
>
> -       .p2align 4
> -L(Fill17_32):
> -       VMOVU   %XMMZERO, (%rdi)
> -       VMOVU   %XMMZERO, -16(%rdi, %r8)
> -       ret
>
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec2):
> -       VMOVU   %YMM2, (%rdi, %rcx)
> -
> -       .p2align 4
> -L(CopyVecSizeVecExit):
> -       bsf     %edx, %edx
> -       add     $(VEC_SIZE - 1), %r8
> -       add     %rcx, %rdi
> -#   ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -#   endif
> -       sub     %rdx, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -
> -       .p2align 4
> -L(StrncpyFillTailWithZero):
> -       xor     %edx, %edx
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(StrncpyFillExit)
> -
> -       VMOVU   %YMMZERO, (%rdi)
> -       add     $VEC_SIZE, %rdi
> -
> -       mov     %rdi, %rsi
> -       and     $(VEC_SIZE - 1), %esi
> -       sub     %rsi, %rdi
> -       add     %rsi, %r8
> -       sub     $(VEC_SIZE * 4), %r8
> -       jb      L(StrncpyFillLessFourVecSize)
> -
> -L(StrncpyFillLoopVmovdqa):
> -       VMOVA   %YMMZERO, (%rdi)
> -       VMOVA   %YMMZERO, VEC_SIZE(%rdi)
> -       VMOVA   %YMMZERO, (VEC_SIZE * 2)(%rdi)
> -       VMOVA   %YMMZERO, (VEC_SIZE * 3)(%rdi)
> -       add     $(VEC_SIZE * 4), %rdi
> -       sub     $(VEC_SIZE * 4), %r8
> -       jae     L(StrncpyFillLoopVmovdqa)
> -
> -L(StrncpyFillLessFourVecSize):
> -       add     $(VEC_SIZE * 2), %r8
> -       jl      L(StrncpyFillLessTwoVecSize)
> -       VMOVA   %YMMZERO, (%rdi)
> -       VMOVA   %YMMZERO, VEC_SIZE(%rdi)
> -       add     $(VEC_SIZE * 2), %rdi
> -       sub     $VEC_SIZE, %r8
> -       jl      L(StrncpyFillExit)
> -       VMOVA   %YMMZERO, (%rdi)
> -       add     $VEC_SIZE, %rdi
> -       jmp     L(Fill)
> -
> -       .p2align 4
> -L(StrncpyFillLessTwoVecSize):
> -       add     $VEC_SIZE, %r8
> -       jl      L(StrncpyFillExit)
> -       VMOVA   %YMMZERO, (%rdi)
> -       add     $VEC_SIZE, %rdi
> -       jmp     L(Fill)
> -
> -       .p2align 4
> -L(StrncpyFillExit):
> -       add     $VEC_SIZE, %r8
> -L(Fill):
> -       cmp     $17, %r8d
> -       jae     L(Fill17_32)
> -       cmp     $9, %r8d
> -       jae     L(Fill9_16)
> -       cmp     $5, %r8d
> -       jae     L(Fill5_8)
> -       cmp     $3, %r8d
> -       jae     L(Fill3_4)
> -       cmp     $1, %r8d
> -       ja      L(Fill2)
> -       je      L(Fill1)
> +       .p2align 4,, 4
> +L(page_cross_copy_4_7):
> +       movl    (%rsi), %ecx
> +       movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
> +       movl    %ecx, (%rdi)
> +       movl    %esi, -(4 - CHAR_SIZE)(%END_REG)
>         ret
> -
> -/* end of ifndef USE_AS_STRCAT */
>  #  endif
>
> -       .p2align 4
> -L(UnalignedLeaveCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(UnalignedFourVecSizeLeaveCase2)
> -L(UnalignedFourVecSizeLeaveCase3):
> -       lea     (VEC_SIZE * 4)(%r8), %rcx
> -       and     $-VEC_SIZE, %rcx
> -       add     $(VEC_SIZE * 3), %r8
> -       jl      L(CopyVecSizeCase3)
> -       VMOVU   %YMM4, (%rdi)
> -       sub     $VEC_SIZE, %r8
> -       jb      L(CopyVecSizeCase3)
> -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> -       sub     $VEC_SIZE, %r8
> -       jb      L(CopyVecSizeCase3)
> -       VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> -       sub     $VEC_SIZE, %r8
> -       jb      L(CopyVecSizeCase3)
> -       VMOVU   %YMM7, (VEC_SIZE * 3)(%rdi)
> -#  ifdef USE_AS_STPCPY
> -       lea     (VEC_SIZE * 4)(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (VEC_SIZE * 4)(%rdi)
> -#  endif
> +#  if VEC_SIZE == 64
> +       .p2align 4,, 4
> +L(page_cross_copy_32_63):
> +       VMOVU   (%rsi), %VMM_256(0)
> +       VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +       VMOVU   %VMM_256(0), (%rdi)
> +       VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
>         ret
> -
> -       .p2align 4
> -L(UnalignedFourVecSizeLeaveCase2):
> -       xor     %ecx, %ecx
> -       vpcmpb  $0, %YMM4, %YMMZERO, %k1
> -       kmovd   %k1, %edx
> -       add     $(VEC_SIZE * 3), %r8
> -       jle     L(CopyVecSizeCase2OrCase3)
> -       test    %edx, %edx
> -#  ifndef USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec4)
> -#  else
> -       jnz     L(CopyVecSize)
> -#  endif
> -       vpcmpb  $0, %YMM5, %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       VMOVU   %YMM4, (%rdi)
> -       add     $VEC_SIZE, %rcx
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -       test    %edx, %edx
> -#  ifndef USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec5)
> -#  else
> -       jnz     L(CopyVecSize)
>  #  endif
>
> -       vpcmpb  $0, %YMM6, %YMMZERO, %k3
> -       kmovd   %k3, %edx
> -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> -       add     $VEC_SIZE, %rcx
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -       test    %edx, %edx
> -#  ifndef USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec6)
> -#  else
> -       jnz     L(CopyVecSize)
> -#  endif
> -
> -       vpcmpb  $0, %YMM7, %YMMZERO, %k4
> -       kmovd   %k4, %edx
> -       VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> -       lea     VEC_SIZE(%rdi, %rcx), %rdi
> -       lea     VEC_SIZE(%rsi, %rcx), %rsi
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -L(StrncpyExit):
> -       cmp     $65, %r8d
> -       je      L(StrncpyExit65)
> -       cmp     $33, %r8d
> -       jae     L(StrncpyExit33_64)
> -       cmp     $17, %r8d
> -       jae     L(StrncpyExit17_32)
> -       cmp     $9, %r8d
> -       jae     L(StrncpyExit9_16)
> -       cmp     $5, %r8d
> -       jae     L(StrncpyExit5_8)
> -       cmp     $3, %r8d
> -       jae     L(StrncpyExit3_4)
> -       cmp     $1, %r8d
> -       ja      L(StrncpyExit2)
> -       je      L(StrncpyExit1)
> -#  ifdef USE_AS_STPCPY
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi)
> -#  endif
> +       .p2align 4,, 4
> +L(page_cross_copy_16_31):
> +       vmovdqu (%rsi), %xmm0
> +       vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +       vmovdqu %xmm0, (%rdi)
> +       vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
>         ret
>
> -       .p2align 4
> -L(ExitZero):
> -#  ifndef USE_AS_STRCAT
> -       mov     %rdi, %rax
> -#  endif
> +       .p2align 4,, 4
> +L(page_cross_copy_8_15):
> +       movq    (%rsi), %rcx
> +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> +       movq    %rcx, (%rdi)
> +       movq    %rsi, -(8 - CHAR_SIZE)(%END_REG)
>         ret
> -
> -# endif
> -
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# else
> -END (STRCAT)
>  # endif
> +END(STRCPY)
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
> index 203a19bf21..d648ba5cfe 100644
> --- a/sysdeps/x86_64/multiarch/strncat-evex.S
> +++ b/sysdeps/x86_64/multiarch/strncat-evex.S
> @@ -1,7 +1,520 @@
> -#ifndef STRNCAT
> -# define STRNCAT       __strncat_evex
> -#endif
> +/* {wcs|str}ncat  with 256/512-bit EVEX.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +       /* Use evex-masked stores for small sizes. Turned off at the
> +          moment.  */
> +# define USE_EVEX_MASKED_STORE 0
> +
> +# include <sysdep.h>
> +
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +# ifndef STRNCAT
> +#  define STRNCAT      __strncat_evex
> +# endif
> +
> +
> +# ifdef USE_AS_WCSCPY
> +#  define movNULL      movl
> +#  define VMOVU_MASK   vmovdqu32
> +#  define VPMIN        vpminud
> +#  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
> +#  define VPCMPEQ      vpcmpeqd
> +#  define CHAR_SIZE    4
> +
> +#  define REP_MOVS     rep movsd
> +
> +#  define VMASK_REG    VR10
> +#  define FIND_FIRST_ONE(src, dst)     movl $CHAR_PER_VEC, %dst; bsf %src, %dst
> +
> +#  define USE_WIDE_CHAR
> +# else
> +#  define movNULL      movb
> +#  define VMOVU_MASK   vmovdqu8
> +#  define VPMIN        vpminub
> +#  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
> +#  define VPCMPEQ      vpcmpeqb
> +#  define CHAR_SIZE    1
> +
> +#  define REP_MOVS     rep movsb
> +
> +#  define VMASK_REG    VRCX
> +#  define FIND_FIRST_ONE(src, dst)     tzcnt %src, %dst
> +
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# include "reg-macros.h"
> +
> +
> +# define VZERO VMM(7)
> +# define VZERO_128     VMM_128(7)
> +
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCAT)
> +       movq    %rdi, %rax
> +
> +       /* NB: It's safe to filter out zero-length strings WITHOUT
> +          setting null-term. Destination MUST be a null-terminated
> +          string so essentially the work is already done.  */
> +# ifdef USE_AS_WCSCPY
> +       leaq    -1(%rdx), %rcx
> +       shrq    $56, %rcx
> +       jnz     L(zero_len)
> +# else
> +       test    %rdx, %rdx
> +       jle     L(zero_len)
> +# endif
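
The wcsncat length filter is a neat single test: (n - 1) >> 56 is
non-zero both for n == 0 (it wraps to all-ones) and for absurdly large
n, so one branch covers "nothing to append" and "length cannot be
real".  A sketch, where wcsncat_overflow stands in for whatever
OVERFLOW_STRCAT expands to:

    #include <stddef.h>
    #include <wchar.h>

    extern wchar_t *wcsncat_overflow (wchar_t *, const wchar_t *, size_t);

    /* Sketch of the entry filter only; assumes 64-bit size_t as on
       x86-64.  Returns NULL to mean "fall through to the real
       append".  */
    static wchar_t *
    wcsncat_entry_filter (wchar_t *dst, const wchar_t *src, size_t n)
    {
      if (((n - 1) >> 56) != 0)
        return n != 0 ? wcsncat_overflow (dst, src, n) : dst;
      return NULL;
    }
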
> +
> +# include "strcat-strlen-evex.S"
> +
> +       movl    %esi, %ecx
> +       andl    $(PAGE_SIZE - 1), %ecx
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> +       ja      L(page_cross)
> +L(page_cross_continue):
> +       VMOVU   (%rsi), %VMM(0)
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +
> +       /* If USE_EVEX_MASKED_STORE is enabled then we just handle
> +          length <= CHAR_PER_VEC with masked instructions (which have
> +          the potential for dramatically bad perf if dst splits a page
> +          and is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +       KMOV    %k0, %VRCX
> +       FIND_FIRST_ONE (VRCX, VR8)
> +       cmpq    %r8, %rdx
> +       jbe     L(less_1x_vec)
> +
> +       test    %VRCX, %VRCX
> +       jz      L(more_1x_vec)
> +
> +       blsmsk  %VRCX, %VRCX
> +       KMOV    %VRCX, %k1
> +       VMOVU_MASK %VMM(0), (%rdi){%k1}
> +       ret
> +
> +L(less_1x_vec):
> +       mov     $-1, %VRCX
> +       bzhi    %VRDX, %VRCX, %VRCX
> +       KMOV    %VRCX, %k1
> +       movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> +       VMOVU_MASK %VMM(0), (%rdi){%k1}
> +
> +       ret
> +# else
> +       KMOV    %k0, %VMASK_REG
> +       /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
> +          %VMASK_REG, %VRCX` for wcsncat.  */
> +       FIND_FIRST_ONE (VMASK_REG, VRCX)
> +       cmpq    %rcx, %rdx
> +       jbe     L(less_1x_vec)
> +
> +       /* If there were no zero-CHARs (rcx was zero before
> +          FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +       cmpl    $CHAR_PER_VEC, %ecx
> +       je      L(more_1x_vec)
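
The FIND_FIRST_ONE asymmetry took me a second: for strncat plain tzcnt
already returns CHAR_PER_VEC (the register width in bits) on a zero
mask, but for wcsncat the count has to be in CHARs, so CHAR_PER_VEC is
preloaded and bsf (which leaves its destination alone on zero input)
overwrites it only when a zero-CHAR exists.  Scalar model:

    #include <stdint.h>

    /* What FIND_FIRST_ONE computes for the wide-char variant: the
       index of the first zero-CHAR, or CHAR_PER_VEC if there is none.
       Sketch only.  */
    static inline unsigned int
    find_first_one (uint32_t mask, unsigned int char_per_vec)
    {
      return mask != 0 ? (unsigned int) __builtin_ctz (mask) : char_per_vec;
    }
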
> +
> +       movl    %ecx, %edx
> +
> +L(less_1x_vec):
> +#  if VEC_SIZE == 64
> +       cmpl    $(32 / CHAR_SIZE), %edx
> +       jae     L(copy_32_63)
> +#  endif
> +
> +       cmpl    $(16 / CHAR_SIZE), %edx
> +       jae     L(copy_16_31)
> +
> +
> +       cmpl    $(8 / CHAR_SIZE), %edx
> +       jae     L(copy_8_15)
> +
> +#  ifdef USE_AS_WCSCPY
> +       vmovd   %VMM_128(0), (%rdi)
> +       movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> +       ret
> +#  else
> +
> +       cmpl    $4, %edx
> +       jae     L(copy_4_7)
> +
> +       movzbl  (%rsi), %ecx
> +       cmpl    $1, %edx
> +       jbe     L(set_null_term)
> +
> +       movzwl  1(%rsi), %esi
> +       movw    %si, 1(%rdi)
> +
> +       .p2align 4,, 1
> +L(set_null_term):
> +       movb    %cl, (%rdi)
> +       movNULL $0, (%rdi, %rdx)
> +       ret
> +#  endif
> +
> +#  if VEC_SIZE == 64
> +       .p2align 4,, 6
> +L(copy_32_63):
> +       VMOVU   -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +       VMOVU   %VMM_256(0), (%rdi)
> +       VMOVU   %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
> +       movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> +       ret
> +#  endif
> +       .p2align 4,, 6
> +L(copy_16_31):
> +       /* Use xmm1 explicitly here as it won't require a `vzeroupper`
> +          and will save code size.  */
> +       vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +       VMOVU   %VMM_128(0), (%rdi)
> +       vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
> +       movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> +       ret
> +
> +       .p2align 4,, 2
> +L(copy_8_15):
> +       movq    -(8)(%rsi, %rdx, CHAR_SIZE), %rcx
> +       vmovq   %VMM_128(0), (%rdi)
> +       movq    %rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
> +       movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> +       ret
> +
> +#  ifndef USE_AS_WCSCPY
> +       .p2align 4,, 12
> +L(copy_4_7):
> +       movl    -(4)(%rsi, %rdx, CHAR_SIZE), %ecx
> +       vmovd   %VMM_128(0), (%rdi)
> +       movl    %ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
> +       movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> +       ret
> +#  endif
> +
> +# endif
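
All of the branches above share the same contract, which is maybe worth
stating once: rdx has already been clamped to min (n, index of the
first zero-CHAR), the copy moves exactly rdx CHARs, and the terminator
is always written explicitly.  Roughly, for the byte variant:

    #include <stddef.h>
    #include <string.h>

    /* Sketch of the short-append contract.  `end' is the end of dst
       located by the inlined strlen; n is min (len, null index).  */
    static inline void
    strncat_short_tail (char *end, const char *src, size_t n)
    {
      memcpy (end, src, n);   /* overlapping fixed-size moves in the asm */
      end[n] = '\0';          /* movNULL $0, (%rdi, %rdx, CHAR_SIZE)     */
    }
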
> +       .p2align 4,, 4
> +L(zero_len):
> +# ifdef USE_AS_WCSCPY
> +       test    %rdx, %rdx
> +# endif
> +       jne     OVERFLOW_STRCAT
> +       ret
>
> -#define USE_AS_STRNCAT
> -#define STRCAT STRNCAT
> -#include "strcat-evex.S"
> +       .p2align 4,, 8
> +L(more_1x_vec):
> +       VMOVU   %VMM(0), (%rdi)
> +
> +       /* We are going to align rsi here, so we will need to be able
> +          to re-adjust rdi/rdx afterwards. NB: We filtered out huge
> +          lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
> +
> +       leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> +       subq    %rsi, %rdi
> +       andq    $-(VEC_SIZE), %rsi
> +L(loop_last_4x_vec):
> +       addq    %rsi, %rdi
> +       subq    %rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +       shrq    $2, %rdx
> +# endif
> +
> +       /* Will need this regardless.  */
> +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
> +       KMOV    %k0, %VMASK_REG
> +
> +       cmpq    $(CHAR_PER_VEC * 2), %rdx
> +       ja      L(more_2x_vec)
> +
> +L(last_2x_vec):
> +       FIND_FIRST_ONE (VMASK_REG, VRCX)
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x1_len)
> +
> +       /* If there were no zero-CHARs (rcx was zero before
> +          FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +       cmpl    $CHAR_PER_VEC, %ecx
> +       jne     L(ret_vec_x1)
> +
> +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       addl    $-CHAR_PER_VEC, %edx
> +       bzhi    %VRDX, %VRCX, %VR8
> +       jz      L(ret_vec_x2_len)
> +L(ret_vec_x2):
> +       bsf     %VRCX, %VRDX
> +L(ret_vec_x2_len):
> +       VMOVU   (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +       movNULL $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> +       VMOVU   %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +       ret
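
The bzhi here does the length check and the null check in one go: it
keeps only the mask bits below the remaining length, so a zero result
means the limit is reached before any zero-CHAR and the truncating
*_len exit is taken.  Scalar model:

    #include <stdint.h>

    /* Models `bzhi %VRDX, %VRCX, %VR8; jz ...': is there a zero-CHAR
       strictly below `len'?  (bzhi with len >= the operand width keeps
       all bits.)  Sketch only, 32-bit mask for brevity.  */
    static inline int
    null_within_limit (uint32_t zero_mask, unsigned int len)
    {
      uint32_t kept = len < 32 ? (zero_mask & ((1u << len) - 1)) : zero_mask;
      return kept != 0;
    }
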
> +
> +       .p2align 4,, 4
> +L(ret_vec_x1_len):
> +       movl    %edx, %ecx
> +L(ret_vec_x1):
> +       VMOVU   (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       movNULL $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
> +       VMOVU   %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +       VZEROUPPER_RETURN
> +
> +
> +       .p2align 4,, 8
> +L(last_4x_vec):
> +       addl    $-(CHAR_PER_VEC * 4), %edx
> +       VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
> +       KMOV    %k0, %VMASK_REG
> +       subq    $-(VEC_SIZE * 4), %rsi
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       cmpl    $(CHAR_PER_VEC * 2), %edx
> +       jbe     L(last_2x_vec)
> +       .p2align 4,, 8
> +L(more_2x_vec):
> +# ifdef USE_AS_WCSCPY
> +       xorl    %ecx, %ecx
> +# endif
> +       bsf     %VMASK_REG, %VRCX
> +       jnz     L(ret_vec_x1)
> +
> +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x2)
> +
> +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +       VPTESTN %VMM(3), %VMM(3), %k0
> +       KMOV    %k0, %VMASK_REG
> +
> +       cmpq    $(CHAR_PER_VEC * 4), %rdx
> +       ja      L(more_4x_vec)
> +
> +       /* Adjust length before going to L(ret_vec_x3_len) or
> +          L(ret_vec_x3).  */
> +       addl    $(CHAR_PER_VEC * -2), %edx
> +
> +       FIND_FIRST_ONE (VMASK_REG, VRCX)
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x3_len)
> +
> +       /* If there were no zero-CHARs (rcx was zero before
> +          FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +       cmpl    $CHAR_PER_VEC, %ecx
> +       jne     L(ret_vec_x3)
> +
> +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       VPTESTN %VMM(4), %VMM(4), %k0
> +       KMOV    %k0, %VRCX
> +       addl    $-CHAR_PER_VEC, %edx
> +       bzhi    %VRDX, %VRCX, %VR8
> +       jz      L(ret_vec_x4_len)
> +L(ret_vec_x4):
> +       bsf     %VRCX, %VRDX
> +L(ret_vec_x4_len):
> +       VMOVU   (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +       movNULL $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
> +       VMOVU   %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +       ret
> +
> +       .p2align 4,, 4
> +L(ret_vec_x3_len):
> +       movl    %edx, %ecx
> +L(ret_vec_x3):
> +       VMOVU   (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       movNULL $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> +       VMOVU   %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +       ret
> +
> +       .p2align 4,, 8
> +L(more_4x_vec):
> +# ifdef USE_AS_WCSCPY
> +       xorl    %ecx, %ecx
> +# endif
> +       bsf     %VMASK_REG, %VRCX
> +       jnz     L(ret_vec_x3)
> +
> +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       VPTESTN %VMM(4), %VMM(4), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x4)
> +
> +       VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> +
> +       /* Check if we are near the end before aligning.  */
> +       cmpq    $(CHAR_PER_VEC * 8), %rdx
> +       jbe     L(last_4x_vec)
> +
> +
> +       /* Add rsi to rdx (length) before aligning rsi. NB: Since we
> +          filtered out huge lengths this cannot overflow.  */
> +# ifdef USE_AS_WCSCPY
> +       leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
> +# else
> +       addq    %rsi, %rdx
> +# endif
> +
> +       /* Subtract rsi from rdi before aligning (adding it back later
> +          will give the correct rdi for the aligned rsi).  */
> +       subq    %rsi, %rdi
> +       subq    $-(VEC_SIZE * 5), %rsi
> +       andq    $(VEC_SIZE * -4), %rsi
> +
> +       /* Load first half of the loop before entry.  */
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       VPTESTN %VMM(6), %VMM(6), %k4
> +
> +       /* Offset rsi by VEC_SIZE so that we can jump to
> +          L(loop_last_4x_vec).  */
> +       addq    $-(VEC_SIZE), %rsi
> +       KORTEST %k2, %k4
> +       jnz     L(loop_4x_done)
> +
> +       /* Store loop end in r9.  */
> +       leaq    -(VEC_SIZE * 5)(%rdx), %r9
> +
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +       VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> +
> +       subq    $(VEC_SIZE * -4), %rsi
> +       cmpq    %rsi, %r9
> +       jbe     L(loop_last_4x_vec)
> +
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       VPTESTN %VMM(6), %VMM(6), %k4
> +       KORTEST %k2, %k4
> +       jz      L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       /* Restore rdi (dst).  */
> +       addq    %rsi, %rdi
> +
> +       /* L(ret_vec_x1) expects rcx to hold the position of the
> +          zero-CHAR, so test with bsf.  */
> +       bsf     %VRCX, %VRCX
> +       jnz     L(ret_vec_x1)
> +       VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
> +
> +       KMOV    %k2, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x2)
> +       VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> +
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       bsf     %VRCX, %VRCX
> +       jnz     L(ret_vec_x3)
> +       VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> +
> +       KMOV    %k4, %VRCX
> +       bsf     %VRCX, %VRCX
> +       VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +       ret
> +
> +
> +       .p2align 4,, 4
> +L(page_cross):
> +       movq    %rsi, %r8
> +       andq    $(VEC_SIZE * -1), %r8
> +       VPCMPEQ (%r8), %VZERO, %k0
> +
> +# ifdef USE_AS_WCSCPY
> +       KMOV    %k0, %VR9
> +       shrl    $2, %ecx
> +       andl    $(CHAR_PER_VEC - 1), %ecx
> +       shrx    %VRCX, %VR9, %VRCX
> +# else
> +       KMOV    %k0, %VRCX
> +       shrx    %VRSI, %VRCX, %VRCX
> +# endif
> +
> +       subl    %esi, %r8d
> +       andl    $(VEC_SIZE - 1), %r8d
> +# ifdef USE_AS_WCSCPY
> +       shrl    $2, %r8d
> +# endif
> +       cmpq    %r8, %rdx
> +       jbe     L(page_cross_small)
> +       /* Optimizing more for space as this is very cold code. This
> +          saves 2x cache lines.  */
> +
> +       /* This adds one to the later result, which gives the correct
> +          copy bounds. NB: this can never zero out a non-zero RCX
> +          because, to be in the page cross case, rsi cannot be aligned
> +          and we already right-shifted rcx by the misalignment.  */
> +       shl     %VRCX
> +       jz      L(page_cross_continue)
> +       bsf     %VRCX, %VRCX
> +       REP_MOVS
> +       ret
> +
> +L(page_cross_small):
> +       tzcnt   %VRCX, %VRCX
> +       jz      L(page_cross_setz)
> +       cmpl    %edx, %ecx
> +       cmova   %edx, %ecx
> +
> +# ifdef USE_AS_WCSCPY
> +       rep     movsd
> +# else
> +       rep     movsb
> +# endif
> +L(page_cross_setz):
> +       movNULL $0, (%rdi)
> +       ret
> +END(STRNCAT)
> +#endif
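
For readers following the L(page_cross) logic above: the idea is to do one
full aligned load (which cannot fault across the page) and then shift the
zero mask right by the misalignment.  A minimal C sketch of that idea,
using AVX2 intrinsics for brevity (the helper name is illustrative; the
real code works on EVEX mask registers and also handles wide chars):

#include <immintrin.h>
#include <stdint.h>

/* Return a mask of zero bytes among the first bytes at S without ever
   reading past the end of S's page: align down, load the full aligned
   vector, then shift out the bytes that precede S.  */
static uint32_t
zero_mask_page_safe (const char *s)
{
  uintptr_t misalign = (uintptr_t) s % 32;
  const __m256i *aligned = (const __m256i *) (s - misalign);
  __m256i v = _mm256_load_si256 (aligned);	/* Never crosses a page.  */
  __m256i z = _mm256_cmpeq_epi8 (v, _mm256_setzero_si256 ());
  uint32_t mask = (uint32_t) _mm256_movemask_epi8 (z);
  return mask >> misalign;	/* Bit 0 now corresponds to s[0].  */
}
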
> diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
> index 1b3426d511..49eaf4cbd9 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
> @@ -1,7 +1,990 @@
> -#ifndef STRNCPY
> -# define STRNCPY       __strncpy_evex
> -#endif
> +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +       /* Use evex-masked stores for small sizes. Turned off at the
> +          moment.  */
> +# define USE_EVEX_MASKED_STORE 0
> +
> +
> +# include <sysdep.h>
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +
> +# ifndef STRNCPY
> +#  define STRNCPY      __strncpy_evex
> +# endif
> +
> +# ifdef USE_AS_WCSCPY
> +#  define VMOVU_MASK   vmovdqu32
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPMIN        vpminud
> +#  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
> +#  define CHAR_SIZE    4
> +
> +#  define REP_MOVS     rep movsd
> +#  define REP_STOS     rep stosl
> +
> +#  define USE_WIDE_CHAR
> +
> +# else
> +#  define VMOVU_MASK   vmovdqu8
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPMIN        vpminub
> +#  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
> +#  define CHAR_SIZE    1
> +
> +#  define REP_MOVS     rep movsb
> +#  define REP_STOS     rep stosb
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +# include "reg-macros.h"
> +
> +
> +# define VZERO VMM(7)
> +# define VZERO_256     VMM_256(7)
> +# define VZERO_128     VMM_128(7)
> +
> +# if VEC_SIZE == 64
> +#  define VZERO_HALF   VZERO_256
> +# else
> +#  define VZERO_HALF   VZERO_128
> +# endif
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCPY)
> +       /* Filter zero length strings and very long strings.  Zero
> +          length strings just return.  Very long strings are handled by
> +          running rep stos{b|l} to zero out the buffer (which will
> +          almost certainly segfault); if that succeeds, then just call
> +          OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
> +# ifdef USE_AS_WCSCPY
> +       decq    %rdx
> +       movq    %rdx, %rax
> +       /* Bit 56 and above are past the end of the maximum supported
> +          address space.  */
> +       shr     $56, %rax
> +       jnz     L(zero_len)
> +# else
> +       decq    %rdx
> +       /* If this branch ever needs to become `jb`, replace the `dec`
> +          with `sub`.  */
> +       jl      L(zero_len)
> +# endif
> +
> +       vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> +       movl    %esi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(page_cross)
> +
> +L(page_cross_continue):
> +       VMOVU   (%rsi), %VMM(0)
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +
> +       /* If not STPCPY, just save the return value (dst) ahead of time.  */
> +# ifndef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +
> +
> +       cmpq    $(CHAR_PER_VEC), %rdx
> +
> +       /* If USE_EVEX_MASKED_STORE is enabled then we just handle
> +          lengths <= CHAR_PER_VEC with masked instructions (which have
> +          the potential for dramatically bad perf if dst splits a page
> +          and is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +       /* `jae` because length rdx is now length - 1.  */
> +       jae     L(more_1x_vec)
> +
> +       /* If there were multiple zero-CHAR matches in the first VEC,
> +          VRCX will be overset, but that's fine since any oversets were
> +          at zero-positions anyway.  */
> +
> +#  ifdef USE_AS_STPCPY
> +       tzcnt   %VRCX, %VRAX
> +       cmpl    %eax, %edx
> +       cmovb   %edx, %eax
> +#   ifdef USE_AS_WCSCPY
> +       adcl    $0, %eax
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#   else
> +       adcq    %rdi, %rax
> +#   endif
> +#  endif
> +       dec     %VRCX
> +
> +       /* Zero out all non-zero CHAR's after the first zero match.  */
> +       KMOV    %VRCX, %k1
> +
> +       /* Use VZERO as the destination so this can be reused for
> +          L(zfill_less_vec) (which, if jumped to by subsequent logic,
> +          will have zeroed out VZERO).  */
> +       VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
> +L(zfill_less_vec):
> +       /* Get mask for what we need to set.  */
> +       incl    %edx
> +       mov     $-1, %VRCX
> +       bzhi    %VRDX, %VRCX, %VRCX
> +       KMOV    %VRCX, %k1
> +       VMOVU_MASK %VZERO, (%rdi){%k1}
> +       ret
> +
> +       .p2align 4,, 4
> +L(zero_len):
> +       cmpq    $-1, %rdx
> +       jne     L(best_effort_strncpy)
> +       movq    %rdi, %rax
> +       ret
> +
> +       .p2align 4,, 8
> +L(more_1x_vec):
> +# else
> +       /* `jb` because length rdx is now length - 1.  */
> +       jb      L(less_1x_vec)
> +# endif
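
As an aside on the USE_EVEX_MASKED_STORE path that the conditional above
gates (currently disabled because of the TLB-miss concern noted in the
comment): the short-copy case boils down to writing exactly the first
`len' bytes of a vector with one masked store.  A hedged intrinsics
sketch of that idea (names are illustrative; needs AVX512BW + AVX512VL):

#include <immintrin.h>

/* Store exactly LEN bytes of V to DST with a single EVEX masked store,
   so no bytes past the buffer are touched.  */
static void
store_first_n_bytes (void *dst, __m256i v, unsigned int len)
{
  __mmask32 m = (len >= 32) ? (__mmask32) -1
			    : (((__mmask32) 1 << len) - 1);
  _mm256_mask_storeu_epi8 (dst, m, v);
}
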
> +
> +
> +       /* This may overset, but that's fine because we still need to
> +          zero fill.  */
> +       VMOVU   %VMM(0), (%rdi)
> +
> +
> +       /* Length must be >= CHAR_PER_VEC, so a match here means we
> +          must zero-fill.  */
> +       test    %VRCX, %VRCX
> +       jnz     L(zfill)
> +
> +
> +       /* We are going to align rsi here so we will need to be able to
> +          re-adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> +          so rsi + rdx * CHAR_SIZE cannot overflow.  */
> +       leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> +       subq    %rsi, %rdi
> +       andq    $-(VEC_SIZE), %rsi
> +
> +L(loop_last_4x_vec):
> +       addq    %rsi, %rdi
> +       subq    %rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +       shrq    $2, %rdx
> +# endif
> +
> +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
> +       KMOV    %k0, %VRCX
> +
> +       /* -1 because of the `dec %rdx` earlier.  */
> +       cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
> +       ja      L(more_2x_vec)
> +
> +L(last_2x_vec):
> +       /* This will need to be computed no matter what. We do it
> +          ahead of time for CHAR_PER_VEC == 64 because we can't adjust
> +          the value of `tzcnt` with a shift.  */
> +# if CHAR_PER_VEC == 64
> +       tzcntq  %rcx, %rcx
> +# endif
> +
> +       cmpl    $(CHAR_PER_VEC), %edx
> +       jb      L(ret_vec_x1_len)
> +
> +       /* Separate logic for CHAR_PER_VEC == 64 because we already did
> +          `tzcnt` on VRCX.  */
> +# if CHAR_PER_VEC == 64
> +       /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
> +       cmpb    $CHAR_PER_VEC, %cl
> +       jnz     L(ret_vec_x1_no_bsf)
> +# else
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x1)
> +# endif
> +
> +
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
> +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> +       KMOV    %k0, %VRCX
> +
> +# if CHAR_PER_VEC < 64
> +       /* This essentially adds CHAR_PER_VEC to the computed result.  */
> +       shlq    $CHAR_PER_VEC, %rcx
> +# else
> +       tzcntq  %rcx, %rcx
> +       addl    $CHAR_PER_VEC, %ecx
> +# endif
> +
> +       .p2align 4,, 4
> +L(ret_vec_x1_len):
> +       /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
> +          already been done.  */
> +# if CHAR_PER_VEC < 64
> +       tzcntq  %rcx, %rcx
> +# endif
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x1_len_no_zfill)
> +       /* Fall through (expectation) is copy len < buffer len.  */
> +       VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +L(ret_vec_x1_len_no_zfill_mov):
> +       movl    %ecx, %edx
> +# ifdef USE_AS_STPCPY
> +       /* clear flags.  */
> +       xorl    %ecx, %ecx
> +# endif
> +L(ret_vec_x1_len_no_zfill):
> +       VMOVU   ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       adcq    $0, %rdx
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +       leal    (VEC_SIZE)(%rdx), %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +       ret
> +
> +
> +       .p2align 4,, 10
> +L(ret_vec_x1):
> +       bsf     %VRCX, %VRCX
> +L(ret_vec_x1_no_bsf):
> +       VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +       subl    %ecx, %edx
> +       cmpl    $CHAR_PER_VEC, %edx
> +       jb      L(ret_vec_x1_len_no_zfill_mov)
> +       /* Fall through (expectation) is copy len < buffer len.  */
> +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> +       VMOVU   %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +       leaq    (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
> +# endif
> +       ret
> +
> +       .p2align 4,, 8
> +L(last_4x_vec):
> +       /* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
> +          $(CHAR_PER_VEC * 4 - 1), %edx` with less code size by just
> +          using `movzbl`.  */
> +# if CHAR_PER_VEC == 64
> +       movzbl  %dl, %edx
> +# else
> +       andl    $(CHAR_PER_VEC * 4 - 1), %edx
> +# endif
> +       VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
> +       KMOV    %k0, %VRCX
> +       subq    $-(VEC_SIZE * 4), %rsi
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       cmpl    $(CHAR_PER_VEC * 2 - 1), %edx
> +       jbe     L(last_2x_vec)
> +       .p2align 4,, 8
> +L(more_2x_vec):
> +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> +       test    %VRCX, %VRCX
> +       /* Must fill at least 2x VEC.  */
> +       jnz     L(zfill_vec1)
> +
> +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       /* Must fill at least 1x VEC.  */
> +       jnz     L(zfill_vec2)
> +
> +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> +       VPTESTN %VMM(3), %VMM(3), %k0
> +       KMOV    %k0, %VRCX
> +
> +       /* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
> +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rdx
> +       ja      L(more_4x_vec)
> +
> +       subl    $(CHAR_PER_VEC * 3), %edx
> +       jb      L(ret_vec_x3_len)
> +
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x3)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       KMOV    %k0, %VRCX
> +       tzcnt   %VRCX, %VRCX
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x4_len_no_zfill)
> +       /* Fall through (expectation) is copy len < buffer len.  */
> +       VMOVU   %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +       movl    %ecx, %edx
> +L(ret_vec_x4_len_no_zfill):
> +       VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       adcq    $0, %rdx
> +       leaq    (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +       leal    (VEC_SIZE * 4 + 0)(%rdx), %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +       ret
> +
> +
> +L(ret_vec_x3_len):
> +       addl    $(CHAR_PER_VEC * 1), %edx
> +       tzcnt   %VRCX, %VRCX
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x3_len_no_zfill)
> +       /* Fall through (expectation) is copy len < buffer len.  */
> +       VMOVU   %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +L(ret_vec_x3_len_no_zfill_mov):
> +       movl    %ecx, %edx
> +# ifdef USE_AS_STPCPY
> +       /* clear flags.  */
> +       xorl    %ecx, %ecx
> +# endif
> +       .p2align 4,, 4
> +L(ret_vec_x3_len_no_zfill):
> +       VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +       VMOVU   %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       adcq    $0, %rdx
> +       leaq    (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +       leal    (VEC_SIZE * 3 + 0)(%rdx), %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +       ret
> +
> +
> +       .p2align 4,, 8
> +L(ret_vec_x3):
> +       bsf     %VRCX, %VRCX
> +       VMOVU   %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
> +       subl    %ecx, %edx
> +       jl      L(ret_vec_x3_len_no_zfill_mov)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +       leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> +# endif
> +       ret
> +
> +       .p2align 4,, 8
> +L(more_4x_vec):
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       test    %VRCX, %VRCX
> +       jnz     L(zfill_vec3)
> +
> +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> +       VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> +       VPTESTN %VMM(4), %VMM(4), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(zfill_vec4)
>
> -#define USE_AS_STRNCPY
> -#define STRCPY STRNCPY
> -#include "strcpy-evex.S"
> +       /* Recheck length before aligning.  */
> +       cmpq    $(CHAR_PER_VEC * 8 - 1), %rdx
> +       jbe     L(last_4x_vec)
> +
> +       /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
> +# ifdef USE_AS_WCSCPY
> +       leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
> +# else
> +       addq    %rsi, %rdx
> +# endif
> +       subq    %rsi, %rdi
> +       subq    $-(VEC_SIZE * 5), %rsi
> +       andq    $(VEC_SIZE * -4), %rsi
> +
> +
> +       /* Load first half of the loop before entry.  */
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       VPTESTN %VMM(6), %VMM(6), %k4
> +
> +
> +       /* Offset rsi by VEC_SIZE so that we can jump to
> +          L(loop_last_4x_vec).  */
> +       addq    $-(VEC_SIZE), %rsi
> +       KORTEST %k2, %k4
> +       jnz     L(loop_4x_done)
> +
> +       /* Store loop end in r9.  */
> +       leaq    -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
> +
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +       VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> +
> +       subq    $(VEC_SIZE * -4), %rsi
> +       cmpq    %rsi, %r9
> +       jbe     L(loop_last_4x_vec)
> +
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       VPTESTN %VMM(6), %VMM(6), %k4
> +       KORTEST %k2, %k4
> +       jz      L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +       /* Restore rdx (length).  */
> +       subq    %rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +       shrq    $2, %rdx
> +# endif
> +       VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +       /* Restore rdi (dst).  */
> +       addq    %rsi, %rdi
> +       VPTESTN %VMM(0), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(zfill_vec1)
> +
> +       VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> +       KMOV    %k2, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(zfill_vec2)
> +
> +       VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(zfill_vec3)
> +
> +       VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
> +       KMOV    %k4, %VRCX
> +       /* Fall through to zfill the remaining vectors.  */
> +
> +       .p2align 4,, 4
> +L(zfill_vec4):
> +       subq    $(VEC_SIZE * -2), %rdi
> +       addq    $(CHAR_PER_VEC * -2), %rdx
> +L(zfill_vec2):
> +       subq    $(VEC_SIZE * -2), %rdi
> +       addq    $(CHAR_PER_VEC * -1), %rdx
> +L(zfill):
> +       /* VRCX must be non-zero.  */
> +       bsf     %VRCX, %VRCX
> +
> +       /* Adjust length / dst for zfill.  */
> +       subq    %rcx, %rdx
> +# ifdef USE_AS_WCSCPY
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +# else
> +       addq    %rcx, %rdi
> +# endif
> +# ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +L(zfill_from_page_cross):
> +
> +       /* From here on out it's just memset(rdi, 0, rdx).  */
> +       cmpq    $CHAR_PER_VEC, %rdx
> +       jb      L(zfill_less_vec)
> +
> +L(zfill_more_1x_vec):
> +       VMOVU   %VZERO, (%rdi)
> +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
> +       ja      L(zfill_more_2x_vec)
> +L(zfill_done0):
> +       ret
> +
> +       /* Coming from vec1/vec2 we must be able to zfill at least 2x
> +          VEC.  */
> +       .p2align 4,, 8
> +L(zfill_vec3):
> +       subq    $(VEC_SIZE * -2), %rdi
> +       addq    $(CHAR_PER_VEC * -2), %rdx
> +       .p2align 4,, 2
> +L(zfill_vec1):
> +       bsfq    %rcx, %rcx
> +       /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
> +        */
> +       leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> +       subq    %rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +
> +
> +       VMOVU   %VZERO, (%rdi)
> +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       cmpq    $(CHAR_PER_VEC * 2), %rdx
> +       jb      L(zfill_done0)
> +L(zfill_more_2x_vec):
> +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> +       VMOVU   %VZERO, (VEC_SIZE)(%rdi)
> +       subq    $(CHAR_PER_VEC * 4 - 1), %rdx
> +       jbe     L(zfill_done)
> +
> +# ifdef USE_AS_WCSCPY
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rdx
> +# else
> +       addq    %rdi, %rdx
> +# endif
> +
> +       VMOVU   %VZERO, (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi)
> +
> +
> +       VMOVU   %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> +       VMOVU   %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> +
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       cmpq    %rdi, %rdx
> +       jbe     L(zfill_done)
> +
> +       /* Align rdi and zfill loop.  */
> +       andq    $-(VEC_SIZE), %rdi
> +       .p2align 4,, 12
> +L(zfill_loop_4x_vec):
> +       VMOVA   %VZERO, (VEC_SIZE * 0)(%rdi)
> +       VMOVA   %VZERO, (VEC_SIZE * 1)(%rdi)
> +       VMOVA   %VZERO, (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VZERO, (VEC_SIZE * 3)(%rdi)
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       cmpq    %rdi, %rdx
> +       ja      L(zfill_loop_4x_vec)
> +L(zfill_done):
> +       ret
> +
> +
> +       /* Less than 1x VEC case if we are not using evex masked store.  */
> +# if !USE_EVEX_MASKED_STORE
> +       .p2align 4,, 8
> +L(copy_1x):
> +       /* Special case for copy 1x. It can be handled quickly and many
> +          buffer sizes have convenient alignment.  */
> +       VMOVU   %VMM(0), (%rdi)
> +       /* If no zeros then we are done.  */
> +       testl   %ecx, %ecx
> +       jz      L(ret_1x_1x)
> +
> +       /* Need to zfill; note we know that length <= CHAR_PER_VEC so we
> +          only handle the small case here.  */
> +       bsf     %VRCX, %VRCX
> +L(zfill_less_vec_no_bsf):
> +       /* Adjust length / dst then just zfill less_vec.  */
> +       subq    %rcx, %rdx
> +#  ifdef USE_AS_WCSCPY
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +#  else
> +       addq    %rcx, %rdi
> +#  endif
> +#  ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#  endif
> +
> +L(zfill_less_vec):
> +       cmpl    $((VEC_SIZE / 2) / CHAR_SIZE), %edx
> +       jb      L(zfill_less_half)
> +
> +       VMOVU   %VZERO_HALF, (%rdi)
> +       VMOVU   %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       ret
> +#  ifdef USE_AS_STPCPY
> +L(ret_1x_1x):
> +       leaq    CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
> +       ret
> +#  endif
> +
> +
> +#  if VEC_SIZE == 64
> +       .p2align 4,, 4
> +L(copy_32_63):
> +       /* Overfill to avoid branches.  */
> +       VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +       VMOVU   %VMM_256(0), (%rdi)
> +       VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +
> +       /* We are taking advantage of the fact that, to get here, we
> +          must be writing the null terminator at (%rdi, %rcx), so we
> +          have a byte of leeway for overwriting.  */
> +       cmpl    %ecx, %edx
> +       ja      L(zfill_less_vec_no_bsf)
> +#   ifndef USE_AS_STPCPY
> +L(ret_1x_1x):
> +#   else
> +#    ifdef USE_AS_WCSCPY
> +       adcq    $0, %rdx
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#    endif
> +#   endif
> +       ret
> +#  endif
> +
> +       .p2align 4,, 4
> +L(copy_16_31):
> +       /* Overfill to avoid branches.  */
> +       vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +       VMOVU   %VMM_128(0), (%rdi)
> +       vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       cmpl    %ecx, %edx
> +
> +       /* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
> +          we have a larger copy block for 32-63 so this just falls
> +          through to zfill 16-31. If VEC_SIZE == 32 then we check for
> +          a full zfill of less than 1x VEC.  */
> +#  if VEC_SIZE == 64
> +       jbe     L(ret_16_31)
> +       subl    %ecx, %edx
> +#   ifdef USE_AS_WCSCPY
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +#   else
> +       addq    %rcx, %rdi
> +#   endif
> +#   ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#   endif
> +L(zfill_less_half):
> +L(zfill_less_32):
> +       cmpl    $(16 / CHAR_SIZE), %edx
> +       jb      L(zfill_less_16)
> +       VMOVU   %VZERO_128, (%rdi)
> +       VMOVU   %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +#   ifdef USE_AS_STPCPY
> +       ret
> +#   endif
> +L(ret_16_31):
> +#   ifdef USE_AS_STPCPY
> +#    ifdef USE_AS_WCSCPY
> +       adcq    $0, %rdx
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#    endif
> +#   endif
> +       ret
> +#  else
> +       /* VEC_SIZE == 32 begins.  */
> +       ja      L(zfill_less_vec_no_bsf)
> +#   ifndef USE_AS_STPCPY
> +L(ret_1x_1x):
> +#   else
> +#    ifdef USE_AS_WCSCPY
> +       adcq    $0, %rdx
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#    endif
> +#   endif
> +       ret
> +#  endif
> +
> +
> +       .p2align 4,, 4
> +L(copy_8_15):
> +       /* Overfill to avoid branches.  */
> +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> +       vmovq   %VMM_128(0), (%rdi)
> +       movq    %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_8_15)
> +       subl    %ecx, %edx
> +#  ifdef USE_AS_WCSCPY
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +#  else
> +       addq    %rcx, %rdi
> +#  endif
> +#  ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#  endif
> +       .p2align 4,, 8
> +#  if VEC_SIZE == 32
> +L(zfill_less_half):
> +#  endif
> +L(zfill_less_16):
> +       xorl    %ecx, %ecx
> +       cmpl    $(8 / CHAR_SIZE), %edx
> +       jb      L(zfill_less_8)
> +       movq    %rcx, (%rdi)
> +       movq    %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +#  ifndef USE_AS_STPCPY
> +L(ret_8_15):
> +#  endif
> +       ret
> +
> +       .p2align 4,, 8
> +L(less_1x_vec):
> +       je      L(copy_1x)
> +
> +       /* We will need the `tzcnt` result for all other copy sizes.  */
> +       tzcnt   %VRCX, %VRCX
> +#  if VEC_SIZE == 64
> +       cmpl    $(32 / CHAR_SIZE), %edx
> +       jae     L(copy_32_63)
> +#  endif
> +
> +       cmpl    $(16 / CHAR_SIZE), %edx
> +       jae     L(copy_16_31)
> +
> +       cmpl    $(8 / CHAR_SIZE), %edx
> +       jae     L(copy_8_15)
> +#  ifdef USE_AS_WCSCPY
> +       testl   %ecx, %ecx
> +       jz      L(zfill_less_8_set_ret)
> +
> +       movl    (%rsi, %rdx, CHAR_SIZE), %esi
> +       vmovd   %VMM_128(0), (%rdi)
> +       movl    %esi, (%rdi, %rdx, CHAR_SIZE)
> +#   ifdef USE_AS_STPCPY
> +       cmpl    %ecx, %edx
> +L(ret_8_15):
> +       adcq    $0, %rdx
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#   endif
> +       ret
> +L(zfill_less_8_set_ret):
> +       xorl    %ecx, %ecx
> +#   ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#   endif
> +L(zfill_less_8):
> +       movl    %ecx, (%rdi)
> +       movl    %ecx, (%rdi, %rdx, CHAR_SIZE)
> +       ret
> +#  else
> +       cmpl    $3, %edx
> +       jb      L(copy_0_3)
> +       /* Overfill to avoid branches.  */
> +       movl    -3(%rsi, %rdx), %esi
> +       vmovd   %VMM_128(0), (%rdi)
> +       movl    %esi, -3(%rdi, %rdx)
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_4_7)
> +       subq    %rcx, %rdx
> +       addq    %rcx, %rdi
> +#   ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#   endif
> +       xorl    %ecx, %ecx
> +       .p2align 4,, 8
> +L(zfill_less_8):
> +       cmpl    $3, %edx
> +       jb      L(zfill_less_3)
> +       movl    %ecx, (%rdi)
> +       movl    %ecx, -3(%rdi, %rdx)
> +#   ifdef USE_AS_STPCPY
> +       ret
> +#   endif
> +
> +L(ret_4_7):
> +#   ifdef USE_AS_STPCPY
> +L(ret_8_15):
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#   endif
> +       ret
> +
> +       .p2align 4,, 4
> +L(zfill_less_3):
> +       testl   %edx, %edx
> +       jz      L(zfill_1)
> +       movw    %cx, (%rdi)
> +L(zfill_1):
> +       movb    %cl, (%rdi, %rdx)
> +       ret
> +
> +       .p2align 4,, 8
> +L(copy_0_3):
> +       vmovd   %VMM_128(0), %r8d
> +       testl   %edx, %edx
> +       jz      L(copy_1)
> +       movw    %r8w, (%rdi)
> +       cmpl    %ecx, %edx
> +       ja      L(zfill_from_1)
> +       movzbl  (%rsi, %rdx), %r8d
> +#   ifdef USE_AS_STPCPY
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +       movb    %r8b, (%rdi, %rdx)
> +       ret
> +#   endif
> +
> +L(copy_1):
> +#   ifdef USE_AS_STPCPY
> +       movl    %edx, %eax
> +       cmpl    %ecx, %edx
> +       adcq    %rdi, %rax
> +#   endif
> +#   ifdef USE_AS_WCSCPY
> +       vmovd   %VMM_128(0), (%rdi)
> +#   else
> +       movb    %r8b, (%rdi, %rdx)
> +#   endif
> +       ret
> +#  endif
> +
> +
> +#  ifndef USE_AS_WCSCPY
> +       .p2align 4,, 8
> +L(zfill_from_1):
> +#   ifdef USE_AS_STPCPY
> +       leaq    (%rdi, %rcx), %rax
> +#   endif
> +       movw    $0, -1(%rdi, %rdx)
> +       ret
> +#  endif
> +
> +       .p2align 4,, 4
> +L(zero_len):
> +       incq    %rdx
> +       jne     L(best_effort_strncpy)
> +       movq    %rdi, %rax
> +       ret
> +# endif
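
The small-size paths above (L(copy_8_15), L(copy_16_31), L(copy_32_63))
all rely on the "overfill to avoid branches" trick: copy the first and
last chunk of the range with two possibly overlapping stores instead of
branching on the exact length.  A hedged C sketch of the same idea
(function name is illustrative, not from the patch):

#include <string.h>

/* Copy N bytes, 8 <= N <= 16, with two overlapping 8-byte moves.  */
static void
copy_8_to_16 (char *dst, const char *src, size_t n)
{
  unsigned long long head, tail;
  memcpy (&head, src, 8);
  memcpy (&tail, src + n - 8, 8);
  memcpy (dst, &head, 8);
  memcpy (dst + n - 8, &tail, 8);
}
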
> +
> +
> +       .p2align 4,, 4
> +       .p2align 6,, 8
> +L(page_cross):
> +       movq    %rsi, %rax
> +       andq    $(VEC_SIZE * -1), %rax
> +       VPCMPEQ (%rax), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +# ifdef USE_AS_WCSCPY
> +       movl    %esi, %r8d
> +       shrl    $2, %r8d
> +       andl    $(CHAR_PER_VEC - 1), %r8d
> +       shrx    %VR8, %VRCX, %VRCX
> +# else
> +       shrx    %VRSI, %VRCX, %VRCX
> +# endif
> +
> +       /* Compute the number of bytes we checked.  */
> +       subl    %esi, %eax
> +       andl    $(VEC_SIZE - 1), %eax
> +# ifdef USE_AS_WCSCPY
> +       shrl    $2, %eax
> +# endif
> +
> +       /* If rax > rdx then we are finishing the copy at the end of the
> +          page.  */
> +       cmpq    %rax, %rdx
> +       jb      L(page_cross_small)
> +
> +
> +       /* If rcx is non-zero then continue.  */
> +       test    %VRCX, %VRCX
> +       jz      L(page_cross_continue)
> +
> +       /* We found a zero-CHAR so we need to copy then zfill (we know
> +          we didn't cover all of the length here).  */
> +       bsf     %VRCX, %VRCX
> +L(movsb_and_zfill):
> +       incl    %ecx
> +       subq    %rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +       leaq    -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
> +# else
> +       movq    %rdi, %rax
> +# endif
> +
> +       REP_MOVS
> +# ifdef USE_AS_WCSCPY
> +       movl    $0, (%rdi)
> +# else
> +       movb    $0, (%rdi)
> +# endif
> +       jmp     L(zfill_from_page_cross)
> +
> +L(page_cross_small):
> +       tzcnt   %VRCX, %VRCX
> +       cmpl    %ecx, %edx
> +       jbe     L(page_cross_copy_only)
> +
> +       /* Do a zfill of the tail before copying.  */
> +       movq    %rdi, %r9
> +       xorl    %eax, %eax
> +
> +       movl    %ecx, %r8d
> +
> +       subl    %ecx, %edx
> +       leaq    CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> +       movl    %edx, %ecx
> +       REP_STOS
> +       movq    %r9, %rdi
> +       movl    %r8d, %edx
> +L(page_cross_copy_only):
> +       leal    1(%rdx), %ecx
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       adcl    $0, %edx
> +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# else
> +       movq    %rdi, %rax
> +# endif
> +       REP_MOVS
> +       ret
> +
> +
> +L(best_effort_strncpy):
> +       movq    %rdx, %rcx
> +       xorl    %eax, %eax
> +       movq    %rdi, %r8
> +       /* The length is >= 2^63. We fully expect to segfault at
> +          rep stos. If that doesn't happen, then just strcpy to finish.
> +        */
> +       REP_STOS
> +       movq    %r8, %rdi
> +       jmp     OVERFLOW_STRCPY
> +END(STRNCPY)
> +#endif
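
For orientation, the zfill paths and the STPCPY return-value adjustments
above implement the standard strncpy/stpncpy contract: copy at most n
characters, zero-fill the remainder of the buffer, and (for stpncpy)
return a pointer to the first null written, or dst + n if none was.  A
reference sketch in C, not the optimized algorithm:

#include <stddef.h>
#include <string.h>

static char *
stpncpy_ref (char *dst, const char *src, size_t n)
{
  size_t len = strnlen (src, n);	/* min (strlen (src), n) */
  memcpy (dst, src, len);		/* copy part */
  memset (dst + len, 0, n - len);	/* zfill part */
  return dst + len;			/* stpncpy return value */
}
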
> diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> new file mode 100644
> index 0000000000..d5ff4cbe50
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> @@ -0,0 +1,65 @@

Copyright notice is missing.

> +#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
> +#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
> +
> +#if defined USE_MULTIARCH && IS_IN(libc)
> +#  define UNDERSCORES __
> +#  ifdef USE_WITH_SSE2
> +#    define ISA_EXT _sse2
> +#  elif defined USE_WITH_AVX
> +#    ifdef USE_WITH_RTM
> +#      define ISA_EXT _avx_rtm
> +#    else
> +#      define ISA_EXT _avx
> +#    endif
> +#  elif defined USE_WITH_AVX2
> +#    ifdef USE_WITH_RTM
> +#      define ISA_EXT _avx2_rtm
> +#    else
> +#      define ISA_EXT _avx2
> +#    endif
> +
> +#  elif defined USE_WITH_EVEX256
> +#    define ISA_EXT _evex
> +#  elif defined USE_WITH_EVEX512
> +#    define ISA_EXT _evex512
> +#  endif
> +#else
> +#  define UNDERSCORES
> +#  define ISA_EXT
> +#endif
> +
> +#ifdef USE_AS_WCSCPY
> +#  define STRCPY_PREFIX wc
> +#  define STRCAT_PREFIX wcs
> +#  ifdef USE_AS_STPCPY
> +#    define STRCPY_POSTFIX pcpy
> +#  else
> +#    define STRCPY_POSTFIX scpy
> +#  endif
> +#else
> +#  define STRCPY_PREFIX st
> +#  define STRCAT_PREFIX str
> +#  ifdef USE_AS_STPCPY
> +#    define STRCPY_POSTFIX pcpy
> +#  else
> +#    define STRCPY_POSTFIX rcpy
> +#  endif
> +#endif
> +#define STRCAT_POSTFIX cat
> +
> +#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
> +  underscores##prefix##postfix##ext
> +
> +#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
> +
> +#ifndef OVERFLOW_STRCPY
> +#  define OVERFLOW_STRCPY                                                     \
> +    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
> +#endif
> +
> +#ifndef OVERFLOW_STRCAT
> +#  define OVERFLOW_STRCAT                                                     \
> +    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
> +#endif
> +
> +#endif
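
For readers unfamiliar with the two-level paste-after-expansion pattern
used by OF_NAMER above, here is a small stand-alone illustration (macro
names are hypothetical, not from the header): the outer macro forces its
arguments to be expanded before ## glues them together, so for a
multiarch EVEX256 build of plain strncpy OVERFLOW_STRCPY resolves to
__strcpy_evex.

#include <stdio.h>

#define PASTE4_PRIMITIVE(a, b, c, d) a##b##c##d
#define PASTE4(...) PASTE4_PRIMITIVE (__VA_ARGS__)

#define UNDERSCORES __
#define PREFIX st
#define POSTFIX rcpy
#define EXT _evex

#define STRINGIFY_PRIMITIVE(x) #x
#define STRINGIFY(x) STRINGIFY_PRIMITIVE (x)

int
main (void)
{
  /* Prints "__strcpy_evex".  */
  puts (STRINGIFY (PASTE4 (UNDERSCORES, PREFIX, POSTFIX, EXT)));
  return 0;
}
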
> --
> 2.34.1
>

OK with copyright notices added.

-- 
H.J.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions
  2022-11-04  8:20   ` [PATCH v2 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
@ 2022-11-04 16:45     ` H.J. Lu
  2022-11-04 20:21       ` Noah Goldstein
  0 siblings, 1 reply; 42+ messages in thread
From: H.J. Lu @ 2022-11-04 16:45 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Nov 4, 2022 at 1:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>     1. Use more overlapping stores to avoid branches.
>     2. Reduce how unrolled the aligning copies are (this is more of a
>        code-size save; it's a negative for some sizes in terms of
>        perf).
>     3. For st{r|p}n{cat|cpy} re-order the branches to minimize the
>        number that are taken.
>
> Performance Changes:
>
>     Times are from N = 10 runs of the benchmark suite and are
>     reported as geometric mean of all ratios of
>     New Implementation / Old Implementation.
>
>     strcat-avx2      -> 0.998
>     strcpy-avx2      -> 0.937
>     stpcpy-avx2      -> 0.971
>
>     strncpy-avx2     -> 0.793
>     stpncpy-avx2     -> 0.775
>
>     strncat-avx2     -> 0.962
>
> Code Size Changes:
>     function         -> Bytes New / Bytes Old -> Ratio
>
>     strcat-avx2      -> 685  / 1639 -> 0.418
>     strcpy-avx2      -> 560  / 903  -> 0.620
>     stpcpy-avx2      -> 592  / 939  -> 0.630
>
>     strncpy-avx2     -> 1176 / 2390 -> 0.492
>     stpncpy-avx2     -> 1268 / 2438 -> 0.520
>
>     strncat-avx2     -> 1042 / 2563 -> 0.407
>
> Notes:
>     1. Because of the significant difference between the
>        implementations they are split into three files.
>
>            strcpy-evex.S    -> strcpy, stpcpy, strcat
>            strncpy-evex.S   -> strncpy
>            strncat-evex.S    > strncat
>
>        I couldn't find a way to merge them without making the
>        ifdefs incredibly difficult to follow.
>
>     2. All implementations can be made evex512 by including
>        "x86-evex512-vecs.h" at the top.

These comments are wrong for AVX2 implementations.

> Full check passes on x86-64 and build succeeds for all ISA levels w/
> and w/o multiarch.
>
> Fix avx2

Strayed comments?

> ---
>  sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S    |    6 +-
>  sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S   |    7 +-
>  sysdeps/x86_64/multiarch/stpncpy-avx2.S       |    5 +-
>  sysdeps/x86_64/multiarch/strcat-avx2-rtm.S    |   13 +-
>  sysdeps/x86_64/multiarch/strcat-avx2.S        |  268 +---
>  sysdeps/x86_64/multiarch/strcat-strlen-avx2.S |   76 +
>  sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S    |   13 +-
>  sysdeps/x86_64/multiarch/strcpy-avx2.S        | 1236 +++++------------
>  sysdeps/x86_64/multiarch/strncat-avx2-rtm.S   |    6 +-
>  sysdeps/x86_64/multiarch/strncat-avx2.S       |  424 +++++-
>  sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S   |    6 +-
>  sysdeps/x86_64/multiarch/strncpy-avx2.S       |  740 +++++++++-
>  sysdeps/x86_64/multiarch/x86-avx-vecs.h       |    5 +-
>  sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h  |   26 +
>  sysdeps/x86_64/multiarch/x86-avx2-vecs.h      |   27 +
>  15 files changed, 1624 insertions(+), 1234 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-vecs.h
>
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> index 2b9c07a59f..189a288053 100644
> --- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> @@ -1,3 +1,3 @@
> -#define USE_AS_STPCPY
> -#define STRCPY __stpcpy_avx2_rtm
> -#include "strcpy-avx2-rtm.S"
> +#define STPCPY __stpcpy_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "stpcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> index 60a2ccfe53..1b252985e7 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> @@ -1,4 +1,3 @@
> -#define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY __stpncpy_avx2_rtm
> -#include "strcpy-avx2-rtm.S"
> +#define STPNCPY        __stpncpy_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "stpncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> index b2f8c19143..a46a8edbe2 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> @@ -3,6 +3,5 @@
>  #endif
>
>  #define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY STPNCPY
> -#include "strcpy-avx2.S"
> +#define STRNCPY        STPNCPY
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> index 637fb557c4..94d51d10bd 100644
> --- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> @@ -1,12 +1,3 @@
> -#ifndef STRCAT
> -# define STRCAT __strcat_avx2_rtm
> -#endif
> -
> -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> -  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> -
> -#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
> -
> -#define SECTION(p) p##.avx.rtm
> -
> +#define STRCAT __strcat_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
>  #include "strcat-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
> index d9b7fb2a43..3f914fa342 100644
> --- a/sysdeps/x86_64/multiarch/strcat-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
> @@ -16,266 +16,10 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (3)
> -
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_avx2
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -/* Number of bytes in a vector register */
> -# define VEC_SIZE      32
> -
> -# ifndef SECTION
> -#  define SECTION(p)   p##.avx
> -# endif
> -
> -       .section SECTION(.text),"ax",@progbits
> -ENTRY (STRCAT)
> -       mov     %rdi, %r9
> -# ifdef USE_AS_STRNCAT
> -       mov     %rdx, %r8
> -# endif
> -
> -       xor     %eax, %eax
> -       mov     %edi, %ecx
> -       and     $((VEC_SIZE * 4) - 1), %ecx
> -       vpxor   %xmm6, %xmm6, %xmm6
> -       cmp     $(VEC_SIZE * 3), %ecx
> -       ja      L(fourth_vector_boundary)
> -       vpcmpeqb (%rdi), %ymm6, %ymm0
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_first_vector)
> -       mov     %rdi, %rax
> -       and     $-VEC_SIZE, %rax
> -       jmp     L(align_vec_size_start)
> -L(fourth_vector_boundary):
> -       mov     %rdi, %rax
> -       and     $-VEC_SIZE, %rax
> -       vpcmpeqb        (%rax), %ymm6, %ymm0
> -       mov     $-1, %r10d
> -       sub     %rax, %rcx
> -       shl     %cl, %r10d
> -       vpmovmskb %ymm0, %edx
> -       and     %r10d, %edx
> -       jnz     L(exit)
> -
> -L(align_vec_size_start):
> -       vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -       vpmovmskb %ymm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -       vpmovmskb %ymm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -       vpmovmskb %ymm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> -       add     $(VEC_SIZE * 4), %rax
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -       vpmovmskb %ymm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -       vpmovmskb %ymm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -       vpmovmskb %ymm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> -       add     $(VEC_SIZE * 4), %rax
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -       vpmovmskb %ymm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -       vpmovmskb %ymm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -       vpmovmskb %ymm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> -       add     $(VEC_SIZE * 4), %rax
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -       vpmovmskb %ymm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -       vpmovmskb %ymm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -       vpmovmskb %ymm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fifth_vector)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> -       add     $(VEC_SIZE * 5), %rax
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
> -       add     $VEC_SIZE, %rax
> -       vpmovmskb %ymm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
> -       add     $VEC_SIZE, %rax
> -       vpmovmskb %ymm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $((VEC_SIZE * 4) - 1), %rax
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
> -       add     $VEC_SIZE, %rax
> -       vpmovmskb %ymm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       add     $VEC_SIZE, %rax
> -
> -       .p2align 4
> -L(align_four_vec_loop):
> -       vmovaps (%rax), %ymm4
> -       vpminub VEC_SIZE(%rax), %ymm4, %ymm4
> -       vmovaps (VEC_SIZE * 2)(%rax),   %ymm5
> -       vpminub (VEC_SIZE * 3)(%rax),   %ymm5, %ymm5
> -       add     $(VEC_SIZE * 4),        %rax
> -       vpminub %ymm4,  %ymm5, %ymm5
> -       vpcmpeqb %ymm5, %ymm6, %ymm5
> -       vpmovmskb %ymm5,        %edx
> -       test    %edx,   %edx
> -       jz      L(align_four_vec_loop)
> -
> -       vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
> -       sub     $(VEC_SIZE * 5),        %rax
> -       vpmovmskb %ymm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_second_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -       vpmovmskb %ymm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_third_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -       vpmovmskb %ymm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_null_on_fourth_vector)
> -
> -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -       vpmovmskb %ymm3, %edx
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 4), %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit):
> -       sub     %rdi, %rax
> -L(exit_null_on_first_vector):
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_second_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $VEC_SIZE, %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_third_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 2), %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_fourth_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 3), %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_null_on_fifth_vector):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $(VEC_SIZE * 4), %rax
> -
> -       .p2align 4
> -L(StartStrcpyPart):
> -       lea     (%r9, %rax), %rdi
> -       mov     %rsi, %rcx
> -       mov     %r9, %rax      /* save result */
> -
> -# ifdef USE_AS_STRNCAT
> -       test    %r8, %r8
> -       jz      L(ExitZero)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-avx2.S"
> +#ifndef STRCAT
> +# define STRCAT        __strcat_avx2
>  #endif
> +
> +#define USE_AS_STRCAT
> +#define STRCPY STRCAT
> +#include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
> new file mode 100644
> index 0000000000..128a45b6ff
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S

Missing copyright notice.

> @@ -0,0 +1,76 @@
> +    /* Simple strlen implementation that ends at L(strcat_strlen_done).  */
> +       movq    %rdi, %r8
> +       andq    $(VEC_SIZE * -1), %r8
> +       VPCMPEQ (%r8), %VZERO, %VMM(0)


> +       vpmovmskb %VMM(0), %ecx
> +       shrxl   %edi, %ecx, %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v0)
> +
> +       VPCMPEQ VEC_SIZE(%r8), %VZERO, %VMM(0)
> +       vpmovmskb %VMM(0), %ecx
> +       leaq    (VEC_SIZE)(%r8), %rdi
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v0)
> +
> +       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0)
> +       vpmovmskb %VMM(0), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v1)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0)
> +       vpmovmskb %VMM(0), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v2)
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0)
> +       vpmovmskb %VMM(0), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v3)
> +
> +       orq     $(VEC_SIZE * 4 - 1), %rdi
> +       .p2align 4,, 8
> +L(loop_2x_vec):
> +       VMOVA   (VEC_SIZE * 0 + 1)(%rdi), %VMM(0)
> +       VPMIN   (VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 1)(%rdi), %VMM(2)
> +       VPMIN   (VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3)
> +       VPMIN   %VMM(1), %VMM(3), %VMM(3)
> +       VPCMPEQ %VMM(3), %VZERO, %VMM(3)
> +       vpmovmskb %VMM(3), %r8d
> +       subq    $(VEC_SIZE * -4), %rdi
> +       testl   %r8d, %r8d
> +       jz      L(loop_2x_vec)
> +
> +       addq    $(VEC_SIZE * -4 + 1), %rdi
> +
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(0)
> +       vpmovmskb %VMM(0), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v0)
> +
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(1)
> +       vpmovmskb %VMM(1), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v1)
> +
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(2)
> +       vpmovmskb %VMM(2), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(bsf_and_done_v2)
> +
> +       movl    %r8d, %ecx
> +L(bsf_and_done_v3):
> +       addq    $VEC_SIZE, %rdi
> +L(bsf_and_done_v2):
> +       bsfl    %ecx, %ecx
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx), %rdi
> +       jmp     L(strcat_strlen_done)
> +
> +       .p2align 4,, 4
> +L(bsf_and_done_v1):
> +       addq    $VEC_SIZE, %rdi
> +L(bsf_and_done_v0):
> +       bsfl    %ecx, %ecx
> +       addq    %rcx, %rdi
> +L(strcat_strlen_done):
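
For context on the structure of this new fragment: strcat-strlen-avx2.S
only finds the end of dst, and the shared strcpy body included afterwards
does the actual copy.  A reference view in C (a sketch of the split, not
the optimized code):

#include <string.h>

static char *
strcat_ref (char *dst, const char *src)
{
  char *end = dst + strlen (dst);	/* strcat-strlen-*.S part */
  strcpy (end, src);			/* shared strcpy-*.S part */
  return dst;
}
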
> diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> index c2c581ecf7..fe80ffd265 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> @@ -1,12 +1,3 @@
> -#ifndef STRCPY
> -# define STRCPY __strcpy_avx2_rtm
> -#endif
> -
> -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> -  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> -
> -#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
> -
> -#define SECTION(p) p##.avx.rtm
> -
> +#define STRCPY __strcpy_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
>  #include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
> index c725834929..b87a1722d5 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
> @@ -20,984 +20,378 @@
>
>  #if ISA_SHOULD_BUILD (3)
>
> +# include <sysdep.h>
>
> -# ifndef USE_AS_STRCAT
> -#  include <sysdep.h>
> -
> -#  ifndef STRCPY
> -#   define STRCPY  __strcpy_avx2
> -#  endif
> -
> -# endif
> -
> -/* Number of bytes in a vector register */
>  # ifndef VEC_SIZE
> -#  define VEC_SIZE     32
> -# endif
> -
> -# ifndef VZEROUPPER
> -#  define VZEROUPPER   vzeroupper
> -# endif
> -
> -# ifndef SECTION
> -#  define SECTION(p)   p##.avx
> -# endif
> -
> -/* zero register */
> -#define xmmZ   xmm0
> -#define ymmZ   ymm0
> -
> -/* mask register */
> -#define ymmM   ymm1
> -
> -# ifndef USE_AS_STRCAT
> -
> -       .section SECTION(.text),"ax",@progbits
> -ENTRY (STRCPY)
> -#  ifdef USE_AS_STRNCPY
> -       mov     %RDX_LP, %R8_LP
> -       test    %R8_LP, %R8_LP
> -       jz      L(ExitZero)
> -#  endif
> -       mov     %rsi, %rcx
> -#  ifndef USE_AS_STPCPY
> -       mov     %rdi, %rax      /* save result */
> -#  endif
> -
> +#  include "x86-avx2-vecs.h"
>  # endif
>
> -       vpxor   %xmmZ, %xmmZ, %xmmZ
> -
> -       and     $((VEC_SIZE * 4) - 1), %ecx
> -       cmp     $(VEC_SIZE * 2), %ecx
> -       jbe     L(SourceStringAlignmentLessTwoVecSize)
> -
> -       and     $-VEC_SIZE, %rsi
> -       and     $(VEC_SIZE - 1), %ecx
> -
> -       vpcmpeqb (%rsi), %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       shr     %cl, %rdx
> -
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -       mov     $VEC_SIZE, %r10
> -       sub     %rcx, %r10
> -       cmp     %r10, %r8
> -#  else
> -       mov     $(VEC_SIZE + 1), %r10
> -       sub     %rcx, %r10
> -       cmp     %r10, %r8
> -#  endif
> -       jbe     L(CopyVecSizeTailCase2OrCase3)
> +# ifndef STRCPY
> +#  define STRCPY       __strcpy_avx2
>  # endif
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeTail)
>
> -       vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
> -       vpmovmskb %ymm2, %edx
> +       /* Use movsb in page cross case to save code size.  */
> +# define USE_MOVSB_IN_PAGE_CROSS       1
>
> -# ifdef USE_AS_STRNCPY
> -       add     $VEC_SIZE, %r10
> -       cmp     %r10, %r8
> -       jbe     L(CopyTwoVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyTwoVecSize)
> -
> -       vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
> -       vmovdqu %ymm2, (%rdi)
> -
> -/* If source address alignment != destination address alignment */
> -       .p2align 4
> -L(UnalignVecSizeBoth):
> -       sub     %rcx, %rdi
> -# ifdef USE_AS_STRNCPY
> -       add     %rcx, %r8
> -       sbb     %rcx, %rcx
> -       or      %rcx, %r8
> -# endif
> -       mov     $VEC_SIZE, %rcx
> -       vmovdqa (%rsi, %rcx), %ymm2
> -       vmovdqu %ymm2, (%rdi, %rcx)
> -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> -       vpcmpeqb %ymm2, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $(VEC_SIZE * 3), %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec2)
> +# ifdef USE_AS_WCSCPY
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPMIN        vpminud
> +#  define CHAR_SIZE    4
>  # else
> -       jnz     L(CopyVecSize)
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPMIN        vpminub
> +#  define CHAR_SIZE    1
>  # endif
>
> -       vmovdqu %ymm2, (%rdi, %rcx)
> -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
> -       vpcmpeqb %ymm3, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec3)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
> +# define PAGE_SIZE     4096
>
> -       vmovdqu %ymm3, (%rdi, %rcx)
> -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
> -       vpcmpeqb %ymm4, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec4)
> +# ifdef USE_AS_STPCPY
> +#  define END_REG      rax
>  # else
> -       jnz     L(CopyVecSize)
> +#  define END_REG      rdi, %rdx
>  # endif
>
> -       vmovdqu %ymm4, (%rdi, %rcx)
> -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> -       vpcmpeqb %ymm2, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec2)
> +# ifdef USE_AS_STRCAT
> +#  define PAGE_ALIGN_REG       ecx
>  # else
> -       jnz     L(CopyVecSize)
> +#  define PAGE_ALIGN_REG       eax
>  # endif
>
> -       vmovdqu %ymm2, (%rdi, %rcx)
> -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> -       vpcmpeqb %ymm2, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec2)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
> +# define VZERO VMM(7)
> +# define VZERO_128     VMM_128(7)
>
> -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
> -       vmovdqu %ymm2, (%rdi, %rcx)
> -       vpcmpeqb %ymm3, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec3)
> -# else
> -       jnz     L(CopyVecSize)
> -# endif
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY(STRCPY)
> +       vpxor   %VZERO_128, %VZERO_128, %VZERO_128
>
> -       vmovdqu %ymm3, (%rdi, %rcx)
> -       mov     %rsi, %rdx
> -       lea     VEC_SIZE(%rsi, %rcx), %rsi
> -       and     $-(VEC_SIZE * 4), %rsi
> -       sub     %rsi, %rdx
> -       sub     %rdx, %rdi
> -# ifdef USE_AS_STRNCPY
> -       lea     (VEC_SIZE * 8)(%r8, %rdx), %r8
> -# endif
> -L(UnalignedFourVecSizeLoop):
> -       vmovdqa (%rsi), %ymm4
> -       vmovdqa VEC_SIZE(%rsi), %ymm5
> -       vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
> -       vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
> -       vpminub %ymm5, %ymm4, %ymm2
> -       vpminub %ymm7, %ymm6, %ymm3
> -       vpminub %ymm2, %ymm3, %ymm3
> -       vpcmpeqb %ymmM, %ymm3, %ymm3
> -       vpmovmskb %ymm3, %edx
> -# ifdef USE_AS_STRNCPY
> -       sub     $(VEC_SIZE * 4), %r8
> -       jbe     L(UnalignedLeaveCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(UnalignedFourVecSizeLeave)
> -
> -L(UnalignedFourVecSizeLoop_start):
> -       add     $(VEC_SIZE * 4), %rdi
> -       add     $(VEC_SIZE * 4), %rsi
> -       vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
> -       vmovdqa (%rsi), %ymm4
> -       vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
> -       vmovdqa VEC_SIZE(%rsi), %ymm5
> -       vpminub %ymm5, %ymm4, %ymm2
> -       vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
> -       vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
> -       vmovdqu %ymm7, -VEC_SIZE(%rdi)
> -       vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
> -       vpminub %ymm7, %ymm6, %ymm3
> -       vpminub %ymm2, %ymm3, %ymm3
> -       vpcmpeqb %ymmM, %ymm3, %ymm3
> -       vpmovmskb %ymm3, %edx
> -# ifdef USE_AS_STRNCPY
> -       sub     $(VEC_SIZE * 4), %r8
> -       jbe     L(UnalignedLeaveCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jz      L(UnalignedFourVecSizeLoop_start)
> -
> -L(UnalignedFourVecSizeLeave):
> -       vpcmpeqb %ymm4, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeUnaligned_0)
> -
> -       vpcmpeqb %ymm5, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %ecx
> -       test    %ecx, %ecx
> -       jnz     L(CopyVecSizeUnaligned_16)
> -
> -       vpcmpeqb %ymm6, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeUnaligned_32)
> -
> -       vpcmpeqb %ymm7, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %ecx
> -       bsf     %ecx, %edx
> -       vmovdqu %ymm4, (%rdi)
> -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> -       vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     (VEC_SIZE * 3)(%rdi, %rdx), %rax
> -# endif
> -       vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
> -       add     $(VEC_SIZE - 1), %r8
> -       sub     %rdx, %r8
> -       lea     ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> -# else
> -       add     $(VEC_SIZE * 3), %rsi
> -       add     $(VEC_SIZE * 3), %rdi
> -       jmp     L(CopyVecSizeExit)
> +# ifdef USE_AS_STRCAT
> +       movq    %rdi, %rax
> +#  include "strcat-strlen-avx2.S"
>  # endif
>
> -/* If source address alignment == destination address alignment */
> -
> -L(SourceStringAlignmentLessTwoVecSize):
> -       vmovdqu (%rsi), %ymm3
> -       vmovdqu VEC_SIZE(%rsi), %ymm2
> -       vpcmpeqb %ymm3, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -       cmp     $VEC_SIZE, %r8
> -#  else
> -       cmp     $(VEC_SIZE + 1), %r8
> -#  endif
> -       jbe     L(CopyVecSizeTail1Case2OrCase3)
> +       movl    %esi, %PAGE_ALIGN_REG
> +       andl    $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> +       ja      L(page_cross)
> +L(page_cross_continue):
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +       movq    %rdi, %rax
>  # endif
> -       test    %edx, %edx
> -       jnz     L(CopyVecSizeTail1)
> -
> -       vmovdqu %ymm3, (%rdi)
> -       vpcmpeqb %ymm2, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -       cmp     $(VEC_SIZE * 2), %r8
> -#  else
> -       cmp     $((VEC_SIZE * 2) + 1), %r8
> -#  endif
> -       jbe     L(CopyTwoVecSize1Case2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyTwoVecSize1)
> -
> -       and     $-VEC_SIZE, %rsi
> -       and     $(VEC_SIZE - 1), %ecx
> -       jmp     L(UnalignVecSizeBoth)
> +       VMOVU   (%rsi), %VMM(0)
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
>
> -/*------End of main part with loops---------------------*/
> +       testl   %ecx, %ecx
> +       jz      L(more_1x_vec)
>
> -/* Case1 */
> +       /* The ymm registers are no longer needed, so vzeroupper here
> +          rather than duplicating it at each return statement.  */
> +       COND_VZEROUPPER
>
> -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> -       .p2align 4
> -L(CopyVecSize):
> -       add     %rcx, %rdi
> -# endif
> -L(CopyVecSizeTail):
> -       add     %rcx, %rsi
> -L(CopyVecSizeTail1):
> -       bsf     %edx, %edx
> -L(CopyVecSizeExit):
> -       cmp     $32, %edx
> -       jae     L(Exit32_63)
> -       cmp     $16, %edx
> -       jae     L(Exit16_31)
> -       cmp     $8, %edx
> -       jae     L(Exit8_15)
> -       cmp     $4, %edx
> -       jae     L(Exit4_7)
> -       cmp     $3, %edx
> -       je      L(Exit3)
> -       cmp     $1, %edx
> -       ja      L(Exit2)
> -       je      L(Exit1)
> -       movb    $0, (%rdi)
> +       xorl    %edx, %edx
> +       bsfl    %ecx, %edx
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $1, %r8
> -       lea     1(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> -# endif
> -L(return_vzeroupper):
> -       ZERO_UPPER_VEC_REGISTERS_RETURN
> -
> -       .p2align 4
> -L(CopyTwoVecSize1):
> -       add     $VEC_SIZE, %rsi
> -       add     $VEC_SIZE, %rdi
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $VEC_SIZE, %r8
> -# endif
> -       jmp     L(CopyVecSizeTail1)
> -
> -       .p2align 4
> -L(CopyTwoVecSize):
> -       bsf     %edx, %edx
> -       add     %rcx, %rsi
> -       add     $VEC_SIZE, %edx
> -       sub     %ecx, %edx
> -       jmp     L(CopyVecSizeExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnaligned_0):
> -       bsf     %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -       vmovdqu %ymm4, (%rdi)
> -       add     $((VEC_SIZE * 4) - 1), %r8
> -       sub     %rdx, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> -# else
> -       jmp     L(CopyVecSizeExit)
> -# endif
> -
> -       .p2align 4
> -L(CopyVecSizeUnaligned_16):
> -       bsf     %ecx, %edx
> -       vmovdqu %ymm4, (%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     VEC_SIZE(%rdi, %rdx), %rax
> -# endif
> -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> -       add     $((VEC_SIZE * 3) - 1), %r8
> -       sub     %rdx, %r8
> -       lea     (VEC_SIZE + 1)(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> +       leaq    (%rdi, %rdx), %rax
> +# endif
> +
> +       /* Use the mask bits in rcx to detect which copy we need. If the
> +          low half of the mask is zero then there must be a bit set in
> +          the upper half. I.e. if ecx != 0 and cx == 0, the match must
> +          be in the upper 16 bits, so we use L(copy_16_31).  */
> +       testw   %cx, %cx
> +       jz      L(copy_16_31)
> +
> +       testb   %cl, %cl
> +       jz      L(copy_8_15)
> +# ifdef USE_AS_WCSCPY
> +       vmovd   %xmm0, (%rdi)
> +       movl    $0, (%END_REG)
> +       ret
>  # else
> -       add     $VEC_SIZE, %rsi
> -       add     $VEC_SIZE, %rdi
> -       jmp     L(CopyVecSizeExit)
> -# endif
> -
> -       .p2align 4
> -L(CopyVecSizeUnaligned_32):
> -       bsf     %edx, %edx
> -       vmovdqu %ymm4, (%rdi)
> -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -       lea     (VEC_SIZE * 2)(%rdi, %rdx), %rax
> -# endif
> -       vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> -       add     $((VEC_SIZE * 2) - 1), %r8
> -       sub     %rdx, %r8
> -       lea     ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> -       jmp     L(StrncpyFillTailWithZero)
> +       testb   $0x7, %cl
> +       jz      L(copy_4_7)
> +
> +       testl   %edx, %edx
> +       jz      L(set_null_term)
> +       vmovd   %xmm0, %ecx
> +       movw    %cx, (%rdi)
> +
> +       .p2align 4,, 2
> +L(set_null_term):
> +       movb    $0, (%END_REG)
> +       ret
> +
> +       .p2align 4,, 12
> +L(copy_4_7):
> +       movl    -3(%rsi, %rdx), %ecx
> +       vmovd   %xmm0, (%rdi)
> +       movl    %ecx, -3(%END_REG)
> +       ret
> +# endif
> +
> +       .p2align 4,, 10
> +L(copy_16_31):
> +       VMOVU   -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
> +       VMOVU   %xmm0, (%rdi)
> +       VMOVU   %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> +       ret
> +
> +       .p2align 4,, 10
> +L(copy_8_15):
> +# ifdef USE_AS_WCSCPY
> +       movl    -(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
>  # else
> -       add     $(VEC_SIZE * 2), %rsi
> -       add     $(VEC_SIZE * 2), %rdi
> -       jmp     L(CopyVecSizeExit)
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> -#  ifndef USE_AS_STRCAT
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec6):
> -       vmovdqu %ymm6, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec5):
> -       vmovdqu %ymm5, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec4):
> -       vmovdqu %ymm4, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec3):
> -       vmovdqu %ymm3, (%rdi, %rcx)
> -       jmp     L(CopyVecSizeVecExit)
> -#  endif
> -
> -/* Case2 */
> -
> -       .p2align 4
> -L(CopyVecSizeCase2):
> -       add     $VEC_SIZE, %r8
> -       add     %rcx, %rdi
> -       add     %rcx, %rsi
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyTwoVecSizeCase2):
> -       add     %rcx, %rsi
> -       bsf     %edx, %edx
> -       add     $VEC_SIZE, %edx
> -       sub     %ecx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -L(CopyVecSizeTailCase2):
> -       add     %rcx, %rsi
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -L(CopyVecSizeTail1Case2):
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -       jmp     L(StrncpyExit)
> -
> -/* Case2 or Case3,  Case3 */
> -
> -       .p2align 4
> -L(CopyVecSizeCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyVecSizeCase2)
> -L(CopyVecSizeCase3):
> -       add     $VEC_SIZE, %r8
> -       add     %rcx, %rdi
> -       add     %rcx, %rsi
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyTwoVecSizeCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyTwoVecSizeCase2)
> -       add     %rcx, %rsi
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyVecSizeTailCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyVecSizeTailCase2)
> -       add     %rcx, %rsi
> -       jmp     L(StrncpyExit)
> -
> -       .p2align 4
> -L(CopyTwoVecSize1Case2OrCase3):
> -       add     $VEC_SIZE, %rdi
> -       add     $VEC_SIZE, %rsi
> -       sub     $VEC_SIZE, %r8
> -L(CopyVecSizeTail1Case2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(CopyVecSizeTail1Case2)
> -       jmp     L(StrncpyExit)
> -# endif
> -
> -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
> -
> -       .p2align 4
> -L(Exit1):
> -       movzwl  (%rsi), %edx
> -       mov     %dx, (%rdi)
> -# ifdef USE_AS_STPCPY
> -       lea     1(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $2, %r8
> -       lea     2(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> -# endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Exit2):
> -       movzwl  (%rsi), %ecx
> -       mov     %cx, (%rdi)
> -       movb    $0, 2(%rdi)
> -# ifdef USE_AS_STPCPY
> -       lea     2(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $3, %r8
> -       lea     3(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> -# endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Exit3):
> -       mov     (%rsi), %edx
> -       mov     %edx, (%rdi)
> +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
> +# endif
> +       vmovq   %xmm0, (%rdi)
> +       movq    %rcx, -(8 - CHAR_SIZE)(%END_REG)
> +       ret
> +
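
For readers following the mask logic, here is a rough C model of the
small-copy dispatch above (illustrative only; the helper name and the
`idx` parameter are made up — `idx` is the tzcnt of the byte match mask,
i.e. the offset of the null terminator in the byte-string build):

  #include <string.h>

  /* Copy idx + 1 bytes (the string plus its null) with overlapping
     loads, mirroring L(copy_16_31) / L(copy_8_15) / L(copy_4_7).  */
  static void
  copy_upto_one_vec (char *dst, const char *src, unsigned int idx)
  {
    if (idx >= 16)            /* testw %cx, %cx: no match in low 16 bits.  */
      {
        memcpy (dst, src, 16);
        memcpy (dst + idx - 15, src + idx - 15, 16);
      }
    else if (idx >= 8)        /* testb %cl, %cl: no match in low 8 bits.  */
      {
        memcpy (dst, src, 8);
        memcpy (dst + idx - 7, src + idx - 7, 8);
      }
    else if (idx >= 3)        /* testb $0x7, %cl: null at offset 3..7.  */
      {
        memcpy (dst, src, 4);
        memcpy (dst + idx - 3, src + idx - 3, 4);
      }
    else                      /* 1-3 bytes total, including the null.  */
      {
        if (idx != 0)
          memcpy (dst, src, 2);
        dst[idx] = '\0';
      }
  }
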
> +
> +       .p2align 4,, 8
> +L(more_1x_vec):
> +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> +       VMOVU   %VMM(0), (%rdi)
> +# endif
> +       subq    %rsi, %rdi
> +       orq     $(VEC_SIZE - 1), %rsi
> +       addq    %rsi, %rdi
> +       VMOVA   1(%rsi), %VMM(1)
> +
> +       /* Try to order stores after as many loads as is reasonable to
> +          avoid potential false dependencies.  */
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +       VMOVU   %VMM(0), (%rax)
> +# endif
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x1)
> +
> +       VMOVA   (VEC_SIZE + 1)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), 1(%rdi)
> +
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x2)
> +
> +       VMOVA   (VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
> +       VMOVU   %VMM(2), (VEC_SIZE + 1)(%rdi)
> +
> +       VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x3)
> +
> +       VMOVA   (VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
> +       VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %edx
> +       testl   %edx, %edx
> +       jnz     L(ret_vec_x4)
> +
> +       VMOVU   %VMM(4), (VEC_SIZE * 3 + 1)(%rdi)
> +
> +       /* Subtract rsi from rdi before aligning, so that adding the
> +          aligned rsi back yields the rdi (dst) that matches the new
> +          src.  */
> +       subq    %rsi, %rdi
> +       incq    %rsi
> +       orq     $(VEC_SIZE * 4 - 1), %rsi
> +
> +       /* Do first half of loop ahead of time so loop can just start by
> +          storing.  */
> +       VMOVA   (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %edx
> +       addq    %rsi, %rdi
> +
> +       testl   %edx, %edx
> +       jnz     L(loop_4x_done)
> +
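
As an aside, the pointer adjustment above can be modeled in C as below
(a sketch only; the helper name is made up and not glibc code):

  #include <stddef.h>
  #include <stdint.h>

  /* Round src up to the next 4 * VEC_SIZE block minus one (the asm then
     addresses everything with a constant +1 displacement, so the actual
     loads are aligned) while keeping dst at the same distance from src.  */
  static void
  align_src_keep_offset (const char **src, char **dst, size_t vec_size)
  {
    ptrdiff_t off = *dst - *src;                             /* subq %rsi, %rdi */
    uintptr_t s = ((uintptr_t) *src + 1) | (vec_size * 4 - 1); /* incq; orq     */
    *src = (const char *) s;
    *dst = (char *) s + off;                                 /* addq %rsi, %rdi */
  }

The VPMIN/VPCMPEQ pair that follows relies on the unsigned per-byte
minimum of the four vectors containing a zero byte iff at least one of
them does, so a single compare-against-zero plus vpmovmskb checks the
whole 4-VEC block.
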
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
> +       subq    $(VEC_SIZE * -4), %rsi
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 1)(%rdi)
> +
> +
> +       VMOVA   (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +
> +       vpmovmskb %VMM(6), %edx
> +       subq    $(VEC_SIZE * -4), %rdi
> +       testl   %edx, %edx
> +       jz      L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x1)
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
> +
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x2)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
> +
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x3)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
> +L(ret_vec_x4):
> +       bsfl    %edx, %edx
> +       VMOVU   ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
>  # ifdef USE_AS_STPCPY
> -       lea     3(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     $4, %r8
> -       lea     4(%rdi), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    (VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
>  # endif
> +L(return_end):
>         VZEROUPPER_RETURN
>
> -       .p2align 4
> -L(Exit4_7):
> -       mov     (%rsi), %ecx
> -       mov     %ecx, (%rdi)
> -       mov     -3(%rsi, %rdx), %ecx
> -       mov     %ecx, -3(%rdi, %rdx)
> +       .p2align 4,, 8
> +L(ret_vec_x1):
> +       bsfl    %ecx, %ecx
> +       VMOVU   (1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> +       VMOVU   %VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     %rdx, %r8
> -       sub     $1, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    1(%rcx, %rdi), %rax
>  # endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Exit8_15):
> -       mov     (%rsi), %rcx
> -       mov     -7(%rsi, %rdx), %r9
> -       mov     %rcx, (%rdi)
> -       mov     %r9, -7(%rdi, %rdx)
> -# ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     %rdx, %r8
> -       sub     $1, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> -# endif
> -       VZEROUPPER_RETURN
> +L(return_vzeroupper):
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
>
> -       .p2align 4
> -L(Exit16_31):
> -       vmovdqu (%rsi), %xmm2
> -       vmovdqu -15(%rsi, %rdx), %xmm3
> -       vmovdqu %xmm2, (%rdi)
> -       vmovdqu %xmm3, -15(%rdi, %rdx)
> +       .p2align 4,, 8
> +L(ret_vec_x2):
> +       bsfl    %ecx, %ecx
> +       VMOVU   ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub %rdx, %r8
> -       sub $1, %r8
> -       lea 1(%rdi, %rdx), %rdi
> -       jnz L(StrncpyFillTailWithZero)
> +       leaq    (VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
>  # endif
>         VZEROUPPER_RETURN
>
> -       .p2align 4
> -L(Exit32_63):
> -       vmovdqu (%rsi), %ymm2
> -       vmovdqu -31(%rsi, %rdx), %ymm3
> -       vmovdqu %ymm2, (%rdi)
> -       vmovdqu %ymm3, -31(%rdi, %rdx)
> +       .p2align 4,, 8
> +L(ret_vec_x3):
> +       bsfl    %ecx, %ecx
> +       VMOVU   ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
>  # ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -       sub     %rdx, %r8
> -       sub     $1, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -       jnz     L(StrncpyFillTailWithZero)
> +       leaq    (VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
>  # endif
>         VZEROUPPER_RETURN
>
> -# ifdef USE_AS_STRNCPY
>
> -       .p2align 4
> -L(StrncpyExit1):
> -       movzbl  (%rsi), %edx
> -       mov     %dl, (%rdi)
> +       .p2align 4,, 4
> +L(page_cross):
> +       movq    %rsi, %rcx
> +       andq    $(VEC_SIZE * -1), %rcx
> +
> +       VPCMPEQ (%rcx), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       shrxl   %esi, %ecx, %ecx
> +# if USE_MOVSB_IN_PAGE_CROSS
> +       /* Optimizing more aggressively for space as this is very cold
> +          code. This saves 2x cache lines.  */
> +
> +       /* This adds CHAR_SIZE to the later bsf result, giving the
> +          correct copy bound (the byte count including the null
> +          terminator). NB: this can never zero out a non-zero RCX
> +          because, in the page cross case, rsi cannot be aligned and
> +          rcx has already been right-shifted by the misalignment.  */
> +       shll    $CHAR_SIZE, %ecx
> +       jz      L(page_cross_continue)
> +       bsfl    %ecx, %ecx
> +#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +       movq    %rdi, %rax
> +#  endif
> +       rep     movsb
>  #  ifdef USE_AS_STPCPY
> -       lea     1(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, 1(%rdi)
> +       leaq    -CHAR_SIZE(%rdi), %rax
>  #  endif
> -       VZEROUPPER_RETURN
>
> -       .p2align 4
> -L(StrncpyExit2):
> -       movzwl  (%rsi), %edx
> -       mov     %dx, (%rdi)
> -#  ifdef USE_AS_STPCPY
> -       lea     2(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, 2(%rdi)
> -#  endif
>         VZEROUPPER_RETURN
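
What the movsb page-cross path computes, as a C sketch for the
byte-string case (CHAR_SIZE == 1; the function name is made up, and
`mask` is the zero-byte mask already shifted right by the misalignment):

  #include <stddef.h>
  #include <string.h>

  /* Returns the stpcpy-style end pointer, or NULL to signal "no null
     before the next aligned vector" (the asm jumps back to
     L(page_cross_continue) in that case).  */
  static char *
  page_cross_copy (char *dst, const char *src, unsigned int mask)
  {
    unsigned int m = mask << 1;          /* shll $CHAR_SIZE, %ecx */
    if (m == 0)
      return NULL;
    unsigned int n = __builtin_ctz (m);  /* bsfl: null index + 1  */
    memcpy (dst, src, n);                /* rep movsb             */
    return dst + n - 1;                  /* points at the copied null */
  }
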
>
> -       .p2align 4
> -L(StrncpyExit3_4):
> -       movzwl  (%rsi), %ecx
> -       movzwl  -2(%rsi, %r8), %edx
> -       mov     %cx, (%rdi)
> -       mov     %dx, -2(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(StrncpyExit5_8):
> -       mov     (%rsi), %ecx
> -       mov     -4(%rsi, %r8), %edx
> -       mov     %ecx, (%rdi)
> -       mov     %edx, -4(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(StrncpyExit9_16):
> -       mov     (%rsi), %rcx
> -       mov     -8(%rsi, %r8), %rdx
> -       mov     %rcx, (%rdi)
> -       mov     %rdx, -8(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(StrncpyExit17_32):
> -       vmovdqu (%rsi), %xmm2
> -       vmovdqu -16(%rsi, %r8), %xmm3
> -       vmovdqu %xmm2, (%rdi)
> -       vmovdqu %xmm3, -16(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(StrncpyExit33_64):
> -       /*  0/32, 31/16 */
> -       vmovdqu (%rsi), %ymm2
> -       vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
> -       vmovdqu %ymm2, (%rdi)
> -       vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi, %r8)
> -#  endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(StrncpyExit65):
> -       /* 0/32, 32/32, 64/1 */
> -       vmovdqu (%rsi), %ymm2
> -       vmovdqu 32(%rsi), %ymm3
> -       mov     64(%rsi), %cl
> -       vmovdqu %ymm2, (%rdi)
> -       vmovdqu %ymm3, 32(%rdi)
> -       mov     %cl, 64(%rdi)
> -#  ifdef USE_AS_STPCPY
> -       lea     65(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, 65(%rdi)
> -#  endif
> -       VZEROUPPER_RETURN
> +# else
> +       testl   %ecx, %ecx
> +       jz      L(page_cross_continue)
>
> +       /* Traditional copy case, essentially the same as the
> +          non-page-cross case, but since we can't reuse VMM(0) we need
> +          twice as many loads from rsi.  */
>  #  ifndef USE_AS_STRCAT
> -
> -       .p2align 4
> -L(Fill1):
> -       mov     %dl, (%rdi)
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Fill2):
> -       mov     %dx, (%rdi)
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Fill3_4):
> -       mov     %dx, (%rdi)
> -       mov     %dx, -2(%rdi, %r8)
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Fill5_8):
> -       mov     %edx, (%rdi)
> -       mov     %edx, -4(%rdi, %r8)
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Fill9_16):
> -       mov     %rdx, (%rdi)
> -       mov     %rdx, -8(%rdi, %r8)
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(Fill17_32):
> -       vmovdqu %xmmZ, (%rdi)
> -       vmovdqu %xmmZ, -16(%rdi, %r8)
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(CopyVecSizeUnalignedVec2):
> -       vmovdqu %ymm2, (%rdi, %rcx)
> -
> -       .p2align 4
> -L(CopyVecSizeVecExit):
> -       bsf     %edx, %edx
> -       add     $(VEC_SIZE - 1), %r8
> -       add     %rcx, %rdi
> -#   ifdef USE_AS_STPCPY
> -       lea     (%rdi, %rdx), %rax
> -#   endif
> -       sub     %rdx, %r8
> -       lea     1(%rdi, %rdx), %rdi
> -
> -       .p2align 4
> -L(StrncpyFillTailWithZero):
> -       xor     %edx, %edx
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(StrncpyFillExit)
> -
> -       vmovdqu %ymmZ, (%rdi)
> -       add     $VEC_SIZE, %rdi
> -
> -       mov     %rdi, %rsi
> -       and     $(VEC_SIZE - 1), %esi
> -       sub     %rsi, %rdi
> -       add     %rsi, %r8
> -       sub     $(VEC_SIZE * 4), %r8
> -       jb      L(StrncpyFillLessFourVecSize)
> -
> -L(StrncpyFillLoopVmovdqa):
> -       vmovdqa %ymmZ, (%rdi)
> -       vmovdqa %ymmZ, VEC_SIZE(%rdi)
> -       vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
> -       vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
> -       add     $(VEC_SIZE * 4), %rdi
> -       sub     $(VEC_SIZE * 4), %r8
> -       jae     L(StrncpyFillLoopVmovdqa)
> -
> -L(StrncpyFillLessFourVecSize):
> -       add     $(VEC_SIZE * 2), %r8
> -       jl      L(StrncpyFillLessTwoVecSize)
> -       vmovdqa %ymmZ, (%rdi)
> -       vmovdqa %ymmZ, VEC_SIZE(%rdi)
> -       add     $(VEC_SIZE * 2), %rdi
> -       sub     $VEC_SIZE, %r8
> -       jl      L(StrncpyFillExit)
> -       vmovdqa %ymmZ, (%rdi)
> -       add     $VEC_SIZE, %rdi
> -       jmp     L(Fill)
> -
> -       .p2align 4
> -L(StrncpyFillLessTwoVecSize):
> -       add     $VEC_SIZE, %r8
> -       jl      L(StrncpyFillExit)
> -       vmovdqa %ymmZ, (%rdi)
> -       add     $VEC_SIZE, %rdi
> -       jmp     L(Fill)
> -
> -       .p2align 4
> -L(StrncpyFillExit):
> -       add     $VEC_SIZE, %r8
> -L(Fill):
> -       cmp     $17, %r8d
> -       jae     L(Fill17_32)
> -       cmp     $9, %r8d
> -       jae     L(Fill9_16)
> -       cmp     $5, %r8d
> -       jae     L(Fill5_8)
> -       cmp     $3, %r8d
> -       jae     L(Fill3_4)
> -       cmp     $1, %r8d
> -       ja      L(Fill2)
> -       je      L(Fill1)
> -       VZEROUPPER_RETURN
> -
> -/* end of ifndef USE_AS_STRCAT */
> +       xorl    %edx, %edx
>  #  endif
> -
> -       .p2align 4
> -L(UnalignedLeaveCase2OrCase3):
> -       test    %rdx, %rdx
> -       jnz     L(UnalignedFourVecSizeLeaveCase2)
> -L(UnalignedFourVecSizeLeaveCase3):
> -       lea     (VEC_SIZE * 4)(%r8), %rcx
> -       and     $-VEC_SIZE, %rcx
> -       add     $(VEC_SIZE * 3), %r8
> -       jl      L(CopyVecSizeCase3)
> -       vmovdqu %ymm4, (%rdi)
> -       sub     $VEC_SIZE, %r8
> -       jb      L(CopyVecSizeCase3)
> -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> -       sub     $VEC_SIZE, %r8
> -       jb      L(CopyVecSizeCase3)
> -       vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> -       sub     $VEC_SIZE, %r8
> -       jb      L(CopyVecSizeCase3)
> -       vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
> +       bsfl    %ecx, %edx
>  #  ifdef USE_AS_STPCPY
> -       lea     (VEC_SIZE * 4)(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (VEC_SIZE * 4)(%rdi)
> +       leaq    (%rdi, %rdx), %rax
> +#  elif !defined USE_AS_STRCAT
> +       movq    %rdi, %rax
>  #  endif
> -       VZEROUPPER_RETURN
>
> -       .p2align 4
> -L(UnalignedFourVecSizeLeaveCase2):
> -       xor     %ecx, %ecx
> -       vpcmpeqb %ymm4, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       add     $(VEC_SIZE * 3), %r8
> -       jle     L(CopyVecSizeCase2OrCase3)
> -       test    %edx, %edx
> -#  ifndef USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec4)
> -#  else
> -       jnz     L(CopyVecSize)
> -#  endif
> -       vpcmpeqb %ymm5, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       vmovdqu %ymm4, (%rdi)
> -       add     $VEC_SIZE, %rcx
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -       test    %edx, %edx
> -#  ifndef USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec5)
> -#  else
> -       jnz     L(CopyVecSize)
> -#  endif
> +       /* vzeroupper early to avoid duplicating at each return.  */
> +       COND_VZEROUPPER
>
> -       vpcmpeqb %ymm6, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> -       add     $VEC_SIZE, %rcx
> -       sub     $VEC_SIZE, %r8
> -       jbe     L(CopyVecSizeCase2OrCase3)
> -       test    %edx, %edx
> -#  ifndef USE_AS_STRCAT
> -       jnz     L(CopyVecSizeUnalignedVec6)
> -#  else
> -       jnz     L(CopyVecSize)
> -#  endif
> +       testw   %cx, %cx
> +       jz      L(page_cross_copy_16_31)
>
> -       vpcmpeqb %ymm7, %ymmZ, %ymmM
> -       vpmovmskb %ymmM, %edx
> -       vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> -       lea     VEC_SIZE(%rdi, %rcx), %rdi
> -       lea     VEC_SIZE(%rsi, %rcx), %rsi
> -       bsf     %edx, %edx
> -       cmp     %r8d, %edx
> -       jb      L(CopyVecSizeExit)
> -L(StrncpyExit):
> -       cmp     $65, %r8d
> -       je      L(StrncpyExit65)
> -       cmp     $33, %r8d
> -       jae     L(StrncpyExit33_64)
> -       cmp     $17, %r8d
> -       jae     L(StrncpyExit17_32)
> -       cmp     $9, %r8d
> -       jae     L(StrncpyExit9_16)
> -       cmp     $5, %r8d
> -       jae     L(StrncpyExit5_8)
> -       cmp     $3, %r8d
> -       jae     L(StrncpyExit3_4)
> -       cmp     $1, %r8d
> -       ja      L(StrncpyExit2)
> -       je      L(StrncpyExit1)
> -#  ifdef USE_AS_STPCPY
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -       movb    $0, (%rdi)
> -#  endif
> -       VZEROUPPER_RETURN
> -
> -       .p2align 4
> -L(ExitZero):
> -#  ifndef USE_AS_STRCAT
> -       mov     %rdi, %rax
> -#  endif
> -       VZEROUPPER_RETURN
> +       testb   %cl, %cl
> +       jz      L(page_cross_copy_8_15)
>
> -# endif
> +       testl   $0x7, %cl
> +       jz      L(page_cross_copy_4_7)
>
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# else
> -END (STRCAT)
> -# endif
> +       testl   %edx, %edx
> +       jz      L(page_cross_set_null_term)
> +       movzwl  (%rsi), %ecx
> +       movw    %cx, (%rdi)
> +L(page_cross_set_null_term):
> +       movb    $0, (%END_REG)
> +       ret
> +
> +       .p2align 4,, 4
> +L(page_cross_copy_4_7):
> +       movl    (%rsi), %ecx
> +       movl    -3(%rsi, %rdx), %esi
> +       movl    %ecx, (%rdi)
> +       movl    %esi, -3(%END_REG)
> +       ret
> +
> +       .p2align 4,, 4
> +L(page_cross_copy_8_15):
> +       movq    (%rsi), %rcx
> +       movq    -7(%rsi, %rdx), %rsi
> +       movq    %rcx, (%rdi)
> +       movq    %rsi, -7(%END_REG)
> +       ret
> +
> +
> +       .p2align 4,, 3
> +L(page_cross_copy_16_31):
> +       VMOVU   (%rsi), %xmm0
> +       VMOVU   -15(%rsi, %rdx), %xmm1
> +       VMOVU   %xmm0, (%rdi)
> +       VMOVU   %xmm1, -15(%END_REG)
> +       ret
> +# endif
> +
> +END(STRCPY)
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> index 0dcea18dbb..2bbdbb91ab 100644
> --- a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> @@ -1,3 +1,3 @@
> -#define USE_AS_STRNCAT
> -#define STRCAT __strncat_avx2_rtm
> -#include "strcat-avx2-rtm.S"
> +#define STRNCAT        __strncat_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "strncat-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
> index 52ecbca943..547cef9486 100644
> --- a/sysdeps/x86_64/multiarch/strncat-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
> @@ -1,7 +1,419 @@
> -#ifndef STRNCAT
> -# define STRNCAT       __strncat_avx2
> -#endif
> +/* strncat with AVX2
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (3)
> +
> +# include <sysdep.h>
> +
> +# ifndef VEC_SIZE
> +#  include "x86-avx2-vecs.h"
> +# endif
> +
> +# ifndef STRNCAT
> +#  define STRNCAT      __strncat_avx2
> +# endif
> +
> +# ifdef USE_AS_WCSCPY
> +#  define movNULL      movl
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPMIN        vpminud
> +#  define CHAR_SIZE    4
> +# else
> +#  define movNULL      movb
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPMIN        vpminub
> +#  define CHAR_SIZE    1
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# define PAGE_SIZE     4096
> +
> +# define VZERO VMM(7)
> +# define VZERO_128     VMM_128(7)
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCAT)
> +       /* Filter zero-length strings and very long strings.  Zero-length
> +          strings just return; very long strings are handled by the
> +          non-length variant {wcs|str}cat.  */
> +       movq    %rdi, %rax
> +# ifdef USE_AS_WCSCPY
> +       leaq    -1(%rdx), %rcx
> +       shr     $56, %rcx
> +       jnz     L(zero_len)
> +       salq    $2, %rdx
> +# else
> +       test    %rdx, %rdx
> +       jl      L(zero_len)
> +# endif
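
The wide-character length filter above, in C for reference (the byte
variant just uses `test %rdx, %rdx; jl`; the helper name is made up):

  #include <stdbool.h>
  #include <stddef.h>

  /* (n - 1) >> 56 is nonzero both when n == 0 (the subtraction wraps to
     SIZE_MAX) and when n is far larger than any possible object, so one
     branch funnels both cases to L(zero_len), which either returns or
     falls back to the plain {wcs|str}cat overflow path.  */
  static bool
  wcsncat_len_needs_filtering (size_t n)
  {
    return ((n - 1) >> 56) != 0;
  }
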
> +       vpxor   %VZERO_128, %VZERO_128, %VZERO_128
> +
> +# include "strcat-strlen-avx2.S"
> +
> +       movl    %esi, %ecx
> +       andl    $(PAGE_SIZE - 1), %ecx
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> +       ja      L(page_cross)
> +L(page_cross_continue):
> +       VMOVU   (%rsi), %VMM(0)
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +
> +       tzcnt   %ecx, %r8d
> +       cmpq    %r8, %rdx
> +       jbe     L(less_1x_vec)
> +
> +       testl   %ecx, %ecx
> +       jz      L(more_1x_vec)
> +
> +       /* Hoist this to save code size.  */
> +
> +       movl    %r8d, %edx
> +
> +L(less_1x_vec):
> +       COND_VZEROUPPER
> +
> +       cmpl    $16, %edx
> +       jae     L(copy_16_31)
> +       cmpl    $8, %edx
> +       jae     L(copy_8_15)
> +
> +
> +# ifdef USE_AS_WCSCPY
> +       vmovd   %VMM_128(0), (%rdi)
> +       movNULL $0, (%rdi, %rdx)
> +       ret
> +# else
> +       cmpl    $4, %edx
> +       jae     L(copy_4_7)
> +
> +       movzbl  (%rsi), %ecx
> +       cmpl    $1, %edx
> +       jbe     L(set_null_term)
> +
> +       /* NB: make this `vmovw` if support for AVX512-FP16 is added.
> +        */
> +       movzwl  1(%rsi), %esi
> +       movw    %si, 1(%rdi)
> +
> +       .p2align 4,, 1
> +L(set_null_term):
> +       movb    %cl, (%rdi)
> +       movNULL $0, (%rdi, %rdx)
> +       ret
> +
> +       .p2align 4,, 11
> +L(copy_4_7):
> +       movl    -(4)(%rsi, %rdx), %ecx
> +       vmovd   %xmm0, (%rdi)
> +       movl    %ecx, -(4)(%rdi, %rdx)
> +       movNULL $0, (%rdi, %rdx)
> +       ret
> +# endif
> +
> +
> +       .p2align 4,, 10
> +L(copy_16_31):
> +       VMOVU   -(16)(%rsi, %rdx), %xmm1
> +       VMOVU   %xmm0, (%rdi)
> +       VMOVU   %xmm1, -(16)(%rdi, %rdx)
> +       movNULL $0, (%rdi, %rdx)
> +       ret
> +
> +       .p2align 4,, 10
> +L(copy_8_15):
> +       movq    -(8)(%rsi, %rdx), %rcx
> +       vmovq   %xmm0, (%rdi)
> +       movq    %rcx, -(8)(%rdi, %rdx)
> +       movNULL $0, (%rdi, %rdx)
> +       ret
> +
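
The clamp behind the `tzcnt`/`cmpq %r8, %rdx` sequence above, as a C
sketch for the byte variant (illustrative only; rdx here is the byte
count n):

  /* Characters to append on the short path: min(n, index of the first
     null in the loaded vector).  tzcnt yields 32 (the vector width)
     when the mask is zero; if n is also larger than that, the asm
     instead continues to L(more_1x_vec).  */
  static unsigned int
  strncat_append_len (unsigned int mask, unsigned int n)
  {
    unsigned int first_nul = mask != 0 ? __builtin_ctz (mask) : 32;
    return n <= first_nul ? n : first_nul;
  }
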
> +       .p2align 4,, 8
> +       .p2align 6,, 14
> +L(more_1x_vec):
> +       VMOVU   %VMM(0), (%rdi)
> +
> +       /* Align rsi (src) and adjust rdx/rdi (length/dst).  */
> +       addq    %rsi, %rdx
> +       subq    %rsi, %rdi
> +       orq     $(VEC_SIZE - 1), %rsi
> +       incq    %rsi
> +       addq    %rsi, %rdi
> +L(loop_last_4x_vec):
> +       subq    %rsi, %rdx
> +       VMOVA   0(%rsi), %VMM(1)
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       cmpq    $(VEC_SIZE * 2), %rdx
> +       ja      L(more_2x_vec)
> +L(last_2x_vec):
> +       tzcnt   %ecx, %ecx
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x1_len)
> +
> +       cmpl    $VEC_SIZE, %ecx
> +       jnz     L(ret_vec_x1)
> +
> +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), (%rdi)
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       addl    $-VEC_SIZE, %edx
> +       bzhil   %edx, %ecx, %r8d
> +       jz      L(ret_vec_x2_len)
> +L(ret_vec_x2):
> +       bsfl    %ecx, %edx
> +L(ret_vec_x2_len):
> +       VMOVU   (%rsi, %rdx), %VMM(0)
> +       movNULL $0, (VEC_SIZE)(%rdi, %rdx)
> +       VMOVU   %VMM(0), (%rdi, %rdx)
> +L(return_vzeroupper):
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
> +
> +
> +       .p2align 4,, 12
> +L(ret_vec_x1_len):
> +       movl    %edx, %ecx
> +L(ret_vec_x1):
> +       VMOVU   -(VEC_SIZE)(%rsi, %rcx), %VMM(1)
> +       movNULL $0, (%rdi, %rcx)
> +       VMOVU   %VMM(1), -VEC_SIZE(%rdi, %rcx)
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4,, 8
> +L(last_4x_vec):
> +       subq    $-(VEC_SIZE * 4), %rsi
> +       VMOVA   0(%rsi), %VMM(1)
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       addl    $-(VEC_SIZE * 4), %edx
> +       cmpl    $(VEC_SIZE * 2), %edx
> +       jbe     L(last_2x_vec)
> +       .p2align 4,, 8
> +L(more_2x_vec):
> +       /* L(ret_vec_x1) expects ecx to hold the position of the first
> +          match, so test with bsf.  */
> +       bsfl    %ecx, %ecx
> +       jnz     L(ret_vec_x1)
> +
> +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(2)
> +       VMOVU   %VMM(1), (%rdi)
> +
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x2)
> +
>
> -#define USE_AS_STRNCAT
> -#define STRCAT STRNCAT
> -#include "strcat-avx2.S"
> +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(3)
> +       VMOVU   %VMM(2), (VEC_SIZE * 1)(%rdi)
> +
> +       VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +
> +       /* Check if length is greater than 4x VEC.  */
> +       cmpq    $(VEC_SIZE * 4), %rdx
> +       ja      L(more_4x_vec)
> +
> +       addl    $(VEC_SIZE * -2), %edx
> +
> +       tzcnt   %ecx, %ecx
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x3_len)
> +
> +       cmpl    $VEC_SIZE, %ecx
> +       jnz     L(ret_vec_x3)
> +
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
> +       VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       addl    $-VEC_SIZE, %edx
> +       bzhil   %edx, %ecx, %r8d
> +       jz      L(ret_vec_x4_len)
> +L(ret_vec_x4):
> +       bsfl    %ecx, %edx
> +L(ret_vec_x4_len):
> +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
> +       movNULL $0, (VEC_SIZE * 3)(%rdi, %rdx)
> +       VMOVU   %VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4,, 4
> +L(ret_vec_x3_len):
> +       movl    %edx, %ecx
> +L(ret_vec_x3):
> +       VMOVU   (VEC_SIZE)(%rsi, %rcx), %VMM(0)
> +       movNULL $0, (VEC_SIZE * 2)(%rdi, %rcx)
> +       VMOVU   %VMM(0), (VEC_SIZE)(%rdi, %rcx)
> +       VZEROUPPER_RETURN
> +
> +
> +       .p2align 4,, 8
> +L(more_4x_vec):
> +       bsfl    %ecx, %ecx
> +       jnz     L(ret_vec_x3)
> +
> +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(4)
> +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi)
> +       VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x4)
> +
> +       VMOVU   %VMM(4), (VEC_SIZE * 3)(%rdi)
> +
> +
> +       /* Recheck length before aligning.  */
> +       cmpq    $(VEC_SIZE * 8), %rdx
> +       jbe     L(last_4x_vec)
> +
> +       /* Align rsi (src) and adjust rdx/rdi (length/dst).  */
> +       addq    %rsi, %rdx
> +       subq    %rsi, %rdi
> +       subq    $-(VEC_SIZE * 4), %rsi
> +       andq    $(VEC_SIZE * -4), %rsi
> +
> +       /* Do first half of loop ahead of time so loop can just start by
> +          storing.  */
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %r8d
> +       addq    %rsi, %rdi
> +       testl   %r8d, %r8d
> +       jnz     L(loop_4x_done)
> +
> +       /* Use r9 for end of region before handling last 4x VEC
> +          specially.  */
> +       leaq    -(VEC_SIZE * 4)(%rdx), %r9
> +
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +       subq    $(VEC_SIZE * -4), %rsi
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> +
> +       subq    $(VEC_SIZE * -4), %rdi
> +       cmpq    %rsi, %r9
> +       jbe     L(loop_last_4x_vec)
> +
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +
> +       vpmovmskb %VMM(6), %r8d
> +
> +       testl   %r8d, %r8d
> +       jz      L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       /* L(ret_vec_x1) expects ecx to hold the position of the first
> +          match, so test with bsf.  */
> +       bsfl    %ecx, %ecx
> +       jnz     L(ret_vec_x1)
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x2)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       bsfl    %ecx, %ecx
> +       jnz     L(ret_vec_x3)
> +
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +       bsfl    %r8d, %r8d
> +       VMOVU   (VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
> +       VMOVU   %VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
> +       VZEROUPPER_RETURN
> +
> +
> +
> +       .p2align 4,, 4
> +L(page_cross):
> +       movq    %rsi, %r8
> +       andq    $(VEC_SIZE * -1), %r8
> +
> +       VPCMPEQ (%r8), %VZERO, %VMM(6)
> +
> +       vpmovmskb %VMM(6), %ecx
> +       shrxl   %esi, %ecx, %ecx
> +
> +       subl    %esi, %r8d
> +       andl    $(VEC_SIZE - 1), %r8d
> +       cmpq    %r8, %rdx
> +       jb      L(page_cross_small)
> +
> +       /* Optimizing more aggressively for space as this is very cold
> +          code. This saves 2x cache lines.  */
> +
> +       /* This adds CHAR_SIZE to the later bsf result, giving the
> +          correct copy bound (the byte count including the null
> +          terminator). NB: this can never zero out a non-zero RCX
> +          because, in the page cross case, rsi cannot be aligned and
> +          rcx has already been right-shifted by the misalignment.  */
> +       shll    $CHAR_SIZE, %ecx
> +       jz      L(page_cross_continue)
> +       bsfl    %ecx, %ecx
> +       rep     movsb
> +       VZEROUPPER_RETURN
> +
> +L(page_cross_small):
> +       tzcntl  %ecx, %ecx
> +       jz      L(page_cross_setz)
> +       cmpl    %edx, %ecx
> +       cmova   %edx, %ecx
> +       rep     movsb
> +L(page_cross_setz):
> +       movNULL $0, (%rdi)
> +       VZEROUPPER_RETURN
> +L(zero_len):
> +# ifdef USE_AS_WCSCPY
> +       test    %rdx, %rdx
> +# endif
> +       jnz     OVERFLOW_STRCAT
> +       ret
> +
> +
> +END(STRNCAT)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> index 79e7083299..b582a4a7a1 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> @@ -1,3 +1,3 @@
> -#define USE_AS_STRNCPY
> -#define STRCPY __strncpy_avx2_rtm
> -#include "strcpy-avx2-rtm.S"
> +#define STRNCPY        __strncpy_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
> index ce634e94fa..d1b25b7a42 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
> @@ -1,7 +1,735 @@
> -#ifndef STRNCPY
> -# define STRNCPY       __strncpy_avx2
> -#endif
> +/* strncpy with AVX2
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (3)
> +
> +# include <sysdep.h>
> +
> +
> +# ifndef VEC_SIZE
> +#  include "x86-avx2-vecs.h"
> +# endif
> +
> +# ifndef STRNCPY
> +#  define STRNCPY      __strncpy_avx2
> +# endif
> +
> +
> +# ifdef USE_AS_WCSCPY
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPMIN        vpminud
> +#  define CHAR_SIZE    4
> +# else
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPMIN        vpminub
> +#  define CHAR_SIZE    1
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# define PAGE_SIZE     4096
> +
> +# define VZERO VMM(7)
> +# define VZERO_128     VMM_128(7)
> +
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCPY)
> +       /* Filter zero-length strings and very long strings.  Zero-length
> +          strings just return; very long strings are handled by running
> +          rep stos{b|l} to zero-fill the destination (which will almost
> +          certainly segfault) and, if that somehow succeeds, calling
> +          OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
> +# ifdef USE_AS_WCSCPY
> +       decq    %rdx
> +       movq    %rdx, %rax
> +       /* Bit 56 is past the end of the supported address space, so the
> +          shift is nonzero for an absurdly large n or for n == 0 (which
> +          wrapped in the decrement).  */
> +       shr     $56, %rax
> +       jnz     L(zero_len)
> +       salq    $2, %rdx
> +# else
> +       decq    %rdx
> +       /* `dec` can macro-fuse with `jl`. If the branch ever needs to
> +          become `jb`, replace `dec` with `sub` (dec does not set CF).  */
> +       jl      L(zero_len)
> +# endif
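
For orientation, the reference semantics the body below implements
(plain C, not the optimized path; this is just the standard strncpy
contract the "copy then zfill" structure mirrors):

  #include <string.h>

  /* Copy up to n characters, stopping at the null, and zero-fill
     whatever is left of the n-character destination.  */
  static char *
  strncpy_ref (char *dst, const char *src, size_t n)
  {
    size_t len = strnlen (src, n);
    memcpy (dst, src, len);
    memset (dst + len, 0, n - len);
    return dst;                 /* stpncpy would return dst + len.  */
  }
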
> +
> +       vpxor   %VZERO_128, %VZERO_128, %VZERO_128
> +       movl    %esi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(page_cross)
> +
> +L(page_cross_continue):
> +       VMOVU   (%rsi), %VMM(0)
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +
> +       /* If not STPCPY, the return value is simply dst, so set it up
> +          ahead of time.  */
> +# ifndef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# elif defined USE_AS_WCSCPY
> +       /* Zero rax up front (this also breaks the dependency): nearly
> +          all of the wcpncpy return paths build the result with
> +          `setc %al`, which only writes the low byte.  */
> +       xorl    %eax, %eax
> +# endif
> +
> +       cmpq    $(VEC_SIZE - CHAR_SIZE), %rdx
> +       /* `jb` because length rdx is now length - CHAR_SIZE.  */
> +       jbe     L(less_1x_vec)
> +
> +       /* This may over-copy, but that's fine because we still need to
> +          zero-fill.  */
> +       VMOVU   %VMM(0), (%rdi)
> +
> +       testl   %ecx, %ecx
> +       jnz     L(zfill)
> +
> +       /* Align.  */
> +       addq    %rsi, %rdx
> +       subq    %rsi, %rdi
> +       orq     $(VEC_SIZE - 1), %rsi
> +       incq    %rsi
> +L(last_4x_vec):
> +       addq    %rsi, %rdi
> +L(loop_last_4x_vec):
> +       subq    %rsi, %rdx
> +
> +
> +       VMOVA   0(%rsi), %VMM(1)
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +
> +       cmpq    $(VEC_SIZE * 2), %rdx
> +       jae     L(more_2x_vec)
> +
> +       cmpl    $(VEC_SIZE), %edx
> +       jb      L(ret_vec_x1_len)
> +
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x1)
> +
> +       VPCMPEQ VEC_SIZE(%rsi), %VZERO, %VMM(6)
> +       VMOVU   %VMM(1), (%rdi)
> +       vpmovmskb %VMM(6), %ecx
> +       shlq    $VEC_SIZE, %rcx
> +L(ret_vec_x1_len):
> +       tzcntq  %rcx, %rcx
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x1_len_no_zfill)
> +       /* The expected fall-through case is copy len < buffer len.  */
> +       VMOVU   %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +L(ret_vec_x1_len_no_zfill_mov):
> +       movl    %ecx, %edx
> +# ifdef USE_AS_STPCPY
> +       /* clear flags.  */
> +       xorl    %ecx, %ecx
> +# endif
> +L(ret_vec_x1_len_no_zfill):
> +       VMOVU   ((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> +       VMOVU   %VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       setc    %al
> +       addq    %rdx, %rdi
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +L(return_vzeroupper):
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
> +
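
A note on the `adc` return-value trick above for the stpncpy build:
after `cmpl %ecx, %edx`, the carry flag is set exactly when the buffer
limit (rdx = n - 1) is below the null index, i.e. no null fits, so
`movl %edx, %eax; adcq %rdi, %rax` yields dst + (n - 1) + 1 in that case
and dst + null_index otherwise; the `xorl %ecx, %ecx` on the other path
only exists to clear CF.  In C, the value being computed is simply:

  #include <string.h>

  /* Reference for the stpncpy return value: a pointer to the copied
     null if one fits within n, else dst + n -- i.e. dst + strnlen.  */
  static char *
  stpncpy_result (char *dst, const char *src, size_t n)
  {
    return dst + strnlen (src, n);
  }
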
> +       .p2align 4,, 6
> +L(ret_vec_x1):
> +       bsfl    %ecx, %ecx
> +       VMOVU   %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +       subl    %ecx, %edx
> +       /* Check if we need to reload/store.  */
> +       cmpl    $VEC_SIZE, %edx
> +       jb      L(ret_vec_x1_len_no_zfill_mov)
> +       /* Otherwise safe to just store directly.  */
> +       VMOVU   %VMM(1), (%rdi)
> +       VMOVU   %VZERO, (%rdi, %rcx)
> +# ifdef USE_AS_STPCPY
> +       leaq    (%rdi, %rcx), %rax
> +# endif
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4,, 12
> +L(more_2x_vec):
> +       VMOVU   %VMM(1), (%rdi)
> +       testl   %ecx, %ecx
> +       /* Must fill at least 2x VEC.  */
> +       jnz     L(zfill_vec1)
> +
> +       VMOVA   VEC_SIZE(%rsi), %VMM(2)
> +       VMOVU   %VMM(2), VEC_SIZE(%rdi)
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       /* Must fill at least 1x VEC.  */
> +       jnz     L(zfill_vec2)
> +
> +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(3)
> +       VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +
> +       /* Check if len is more than 4x VEC.  -CHAR_SIZE because rdx is
> +          len - CHAR_SIZE.  */
> +       cmpq    $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
> +       ja      L(more_4x_vec)
> +
> +       subl    $(VEC_SIZE * 3), %edx
> +       jb      L(ret_vec_x3_len)
> +
> +       testl   %ecx, %ecx
> +       jnz     L(ret_vec_x3)
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
> +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi)
> +       vpmovmskb %VMM(6), %ecx
> +       tzcntl  %ecx, %ecx
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x4_len_no_zfill)
> +       /* Fall through (expectation) is copy len < buffer len.  */
> +       VMOVU   %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +       movl    %ecx, %edx
> +L(ret_vec_x4_len_no_zfill):
> +       VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       setc    %al
> +       addq    %rdx, %rdi
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       leal    (VEC_SIZE * 3 + 0)(%edx), %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +       VZEROUPPER_RETURN
> +
> +
> +L(ret_vec_x3_len):
> +       addl    $(VEC_SIZE * 1), %edx
> +       tzcntl  %ecx, %ecx
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_vec_x3_len_no_zfill)
> +       /* Fall through (expectation) is copy len < buffer len.  */
> +       VMOVU   %VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +L(ret_vec_x3_len_no_zfill_mov):
> +       movl    %ecx, %edx
> +# ifdef USE_AS_STPCPY
> +       /* clear flags.  */
> +       xorl    %ecx, %ecx
> +# endif
> +       .p2align 4,, 4
> +L(ret_vec_x3_len_no_zfill):
> +       VMOVU   ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> +       VMOVU   %VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       setc    %al
> +       addq    %rdx, %rdi
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       leal    (VEC_SIZE * 2 + 0)(%rdx), %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +       VZEROUPPER_RETURN
> +
> +
> +       .p2align 4,, 8
> +L(ret_vec_x3):
> +       bsfl    %ecx, %ecx
> +       VMOVU   %VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
> +       subl    %ecx, %edx
> +       jl      L(ret_vec_x3_len_no_zfill_mov)
> +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
> +# ifdef USE_AS_STPCPY
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx), %rax
> +# endif
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4,, 8
> +L(more_4x_vec):
> +
> +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi)
> +       testl   %ecx, %ecx
> +       jnz     L(zfill_vec3)
> +
> +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(4)
> +       VMOVU   %VMM(4), (VEC_SIZE * 3)(%rdi)
> +       VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(zfill_vec4)
> +
> +       movq    %rdx, %rcx
> +       addq    %rsi, %rdx
> +       subq    %rsi, %rdi
> +       subq    $-(VEC_SIZE * 4), %rsi
> +       /* Recheck length before aligning.  */
> +       cmpq    $(VEC_SIZE * 8 - CHAR_SIZE), %rcx
> +       jbe     L(last_4x_vec)
> +
> +       andq    $(VEC_SIZE * -4), %rsi
> +
> +       /* Do first half of loop ahead of time so loop can just start by
> +          storing.  */
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %r8d
> +       addq    %rsi, %rdi
> +       testl   %r8d, %r8d
> +       jnz     L(loop_4x_done)
> +
> +       /* Use r9 as end register.  */
> +       leaq    -(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
>
> -#define USE_AS_STRNCPY
> -#define STRCPY STRNCPY
> -#include "strcpy-avx2.S"
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +       subq    $(VEC_SIZE * -4), %rsi
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> +
> +       subq    $(VEC_SIZE * -4), %rdi
> +       cmpq    %rsi, %r9
> +       jbe     L(loop_last_4x_vec)
> +
> +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +
> +       vpmovmskb %VMM(6), %r8d
> +
> +       testl   %r8d, %r8d
> +       jz      L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +       subq    %rsi, %rdx
> +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(zfill_vec1)
> +
> +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(zfill_vec2)
> +
> +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> +       vpmovmskb %VMM(6), %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(zfill_vec3)
> +
> +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> +       movl    %r8d, %ecx
> +
> +       /* Zfill more.  */
> +
> +       .p2align 4,, 4
> +L(zfill_vec4):
> +       addq    $(VEC_SIZE * 2), %rdi
> +       subq    $(VEC_SIZE * 2), %rdx
> +L(zfill_vec2):
> +       shlq    $VEC_SIZE, %rcx
> +L(zfill):
> +       bsfq    %rcx, %rcx
> +       subq    %rcx, %rdx
> +       addq    %rcx, %rdi
> +# ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +L(zfill_from_page_cross):
> +       cmpq    $VEC_SIZE, %rdx
> +       jb      L(zfill_less_vec_vzeroupper)
> +
> +L(zfill_more_1x_vec):
> +       VMOVU   %VZERO, CHAR_SIZE(%rdi)
> +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
> +       cmpq    $(VEC_SIZE * 2), %rdx
> +       jae     L(zfill_more_2x_vec)
> +L(zfill_done0):
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4,, 8
> +L(zfill_vec3):
> +       addq    $(VEC_SIZE * 2), %rdi
> +       subq    $(VEC_SIZE * 2), %rdx
> +       .p2align 4,, 2
> +L(zfill_vec1):
> +       bsfl    %ecx, %ecx
> +       addq    %rcx, %rdi
> +       subq    %rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +       /* zfill from vec1/vec3 must set at least 2x VECs.  */
> +
> +       VMOVU   %VZERO, CHAR_SIZE(%rdi)
> +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
> +       cmpq    $(VEC_SIZE * 2), %rdx
> +       jb      L(zfill_done0)
> +L(zfill_more_2x_vec):
> +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
> +       subq    $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
> +       jbe     L(zfill_done)
> +
> +       addq    %rdi, %rdx
> +       VMOVU   %VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
> +       VMOVU   %VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
> +
> +
> +       VMOVU   %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> +       VMOVU   %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> +
> +       subq    $-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
> +       cmpq    %rdi, %rdx
> +       jbe     L(zfill_done)
> +
> +       andq    $-(VEC_SIZE), %rdi
> +       .p2align 4,, 12
> +L(zfill_loop_4x_vec):
> +       VMOVA   %VZERO, (VEC_SIZE * 0)(%rdi)
> +       VMOVA   %VZERO, (VEC_SIZE * 1)(%rdi)
> +       VMOVA   %VZERO, (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VZERO, (VEC_SIZE * 3)(%rdi)
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       cmpq    %rdi, %rdx
> +       ja      L(zfill_loop_4x_vec)
> +L(zfill_done):
> +       VZEROUPPER_RETURN
> +
> +
> +       .p2align 4,, 8
> +L(copy_1x):
> +       VMOVU   %VMM(0), (%rdi)
> +       testl   %ecx, %ecx
> +       jz      L(ret_32_32)
> +L(zfill_less_vec):
> +       bsfl    %ecx, %ecx
> +L(zfill_less_vec_no_bsf):
> +       subq    %rcx, %rdx
> +       addq    %rcx, %rdi
> +# ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +L(zfill_less_vec_vzeroupper):
> +       COND_VZEROUPPER
> +       /* We are taking advantage of the fact that to get here we must
> +          be writing the null terminator at (%rdi, %rcx), so we have a
> +          byte of leeway for overwriting.  */
> +       cmpl    $16, %edx
> +       jb      L(zfill_less_16)
> +       VMOVU   %VZERO_128, (%rdi)
> +       VMOVU   %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
> +       ret
> +# ifdef USE_AS_STPCPY
> +L(ret_32_32):
> +       leaq    CHAR_SIZE(%rdi, %rdx), %rax
> +       VZEROUPPER_RETURN
> +# endif
> +
> +       .p2align 4,, 4
> +L(copy_16_31):
> +       /* Overfill to avoid branches.  */
> +       vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
> +       vmovdqu %xmm0, (%rdi)
> +       vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
> +       cmpl    %ecx, %edx
> +       ja      L(zfill_less_vec_no_bsf)
> +# ifndef USE_AS_STPCPY
> +L(ret_32_32):
> +# else
> +#  ifdef USE_AS_WCSCPY
> +       setc    %al
> +       addq    %rdx, %rdi
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# endif
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4,, 4
> +L(copy_8_15):
> +       /* Overfill to avoid branches.  */
> +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
> +       vmovq   %xmm0, (%rdi)
> +       movq    %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_8_15)
> +       subq    %rcx, %rdx
> +       addq    %rcx, %rdi
> +# ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +# endif
> +       .p2align 4,, 8
> +L(zfill_less_16):
> +       xorl    %ecx, %ecx
> +       cmpl    $8, %edx
> +       jb      L(zfill_less_8)
> +       movq    %rcx, (%rdi)
> +       movq    %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
> +# ifndef USE_AS_STPCPY
> +L(ret_8_15):
> +# endif
> +       ret
> +
> +
> +       .p2align 4,, 8
> +L(less_1x_vec):
> +       /* Reuse the flags from `cmp $VEC_SIZE, %rdx`.  The idea is that
> +          many buffer sizes are conventionally aligned.  */
> +       je      L(copy_1x)
> +
> +       tzcntl  %ecx, %ecx
> +       cmpl    $16, %edx
> +       jae     L(copy_16_31)
> +
> +       COND_VZEROUPPER
> +       cmpl    $8, %edx
> +       jae     L(copy_8_15)
> +# ifdef USE_AS_WCSCPY
> +       testl   %ecx, %ecx
> +       jz      L(zfill_less_8_set_ret)
> +
> +       movl    (%rsi, %rdx), %esi
> +       vmovd   %xmm0, (%rdi)
> +       movl    %esi, (%rdi, %rdx)
> +
> +#  ifdef USE_AS_STPCPY
> +       cmpl    %ecx, %edx
> +L(ret_8_15):
> +       setc    %al
> +       addq    %rdx, %rdi
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  endif
> +       ret
> +L(zfill_less_8_set_ret):
> +       xorl    %ecx, %ecx
> +#  ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#  endif
> +L(zfill_less_8):
> +       movl    %ecx, (%rdi)
> +       movl    %ecx, (%rdi, %rdx)
> +       ret
> +
> +# else
> +       cmpl    $3, %edx
> +       jb      L(copy_0_3)
> +       /* Overfill to avoid branches.  */
> +       movl    -3(%rsi, %rdx), %esi
> +       vmovd   %xmm0, (%rdi)
> +       movl    %esi, -3(%rdi, %rdx)
> +       cmpl    %ecx, %edx
> +       jbe     L(ret_4_7)
> +       subq    %rcx, %rdx
> +       addq    %rcx, %rdi
> +#  ifdef USE_AS_STPCPY
> +       movq    %rdi, %rax
> +#  endif
> +       xorl    %ecx, %ecx
> +       .p2align 4,, 8
> +L(zfill_less_8):
> +       cmpl    $3, %edx
> +       jb      L(zfill_less_3)
> +       movl    %ecx, (%rdi)
> +       movl    %ecx, -3(%rdi, %rdx)
> +#  ifdef USE_AS_STPCPY
> +       ret
> +#  endif
> +
> +L(ret_4_7):
> +#  ifdef USE_AS_STPCPY
> +L(ret_8_15):
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#  endif
> +       ret
> +
> +       .p2align 4,, 4
> +L(zfill_less_3):
> +       testl   %edx, %edx
> +       jz      L(zfill_1)
> +       movw    %cx, (%rdi)
> +L(zfill_1):
> +       movb    %cl, (%rdi, %rdx)
> +       ret
> +
> +       .p2align 4,, 8
> +L(copy_0_3):
> +       vmovd   %xmm0, %r8d
> +       testl   %edx, %edx
> +       jz      L(copy_1)
> +       movw    %r8w, (%rdi)
> +       cmpl    %ecx, %edx
> +       ja      L(zfill_from_1)
> +       movzbl  (%rsi, %rdx), %r8d
> +#  ifdef USE_AS_STPCPY
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +       movb    %r8b, (%rdi, %rdx)
> +       ret
> +#  endif
> +
> +L(copy_1):
> +#  ifdef USE_AS_STPCPY
> +       movl    %edx, %eax
> +       cmpl    %ecx, %edx
> +       adcq    %rdi, %rax
> +#  endif
> +#  ifdef USE_AS_WCSCPY
> +       vmovd   %xmm0, (%rdi)
> +#  else
> +       movb    %r8b, (%rdi, %rdx)
> +#  endif
> +       ret
> +# endif
> +
> +       .p2align 4,, 2
> +L(zero_len):
> +       movq    %rdi, %rax
> +       ret
> +# ifndef USE_AS_WCSCPY
> +       .p2align 4,, 8
> +L(zfill_from_1):
> +#  ifdef USE_AS_STPCPY
> +       leaq    (%rdi, %rcx), %rax
> +#  endif
> +       movw    $0, -1(%rdi, %rdx)
> +       ret
> +# endif
> +
> +       .p2align 4,, 4
> +       .p2align 6,, 8
> +L(page_cross):
> +       movq    %rsi, %rax
> +       andq    $(VEC_SIZE * -1), %rax
> +
> +       VPCMPEQ (%rax), %VZERO, %VMM(6)
> +
> +       vpmovmskb %VMM(6), %ecx
> +       shrxl   %esi, %ecx, %ecx
> +
> +       subl    %esi, %eax
> +       andl    $(VEC_SIZE - 1), %eax
> +       cmpq    %rax, %rdx
> +       jb      L(page_cross_small)
> +       /* Optimizing more aggressively for space as this is very cold
> +          code. This saves 2x cache lines.  */
> +
> +       /* If rcx is non-zero then continue.  */
> +       shl     $CHAR_SIZE, %ecx
> +       jz      L(page_cross_continue)
> +       bsf     %ecx, %ecx
> +
> +       subq    %rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +       leaq    -CHAR_SIZE(%rdi, %rcx), %rax
> +# else
> +       movq    %rdi, %rax
> +# endif
> +
> +       rep     movsb
> +# ifdef USE_AS_WCSCPY
> +       movl    $0, (%rdi)
> +# else
> +       movb    $0, (%rdi)
> +# endif
> +       jmp     L(zfill_from_page_cross)
> +
> +L(page_cross_small):
> +       tzcntl  %ecx, %ecx
> +       xorl    %eax, %eax
> +       cmpl    %ecx, %edx
> +       jbe     L(page_cross_copy_only)
> +
> +       /* Do a zfill of the tail before copying.  */
> +       movq    %rdi, %r9
> +       movl    %ecx, %r8d
> +
> +       subl    %ecx, %edx
> +       leaq    CHAR_SIZE(%rdi, %rcx), %rdi
> +       movl    %edx, %ecx
> +       rep     stosb
> +       movq    %r9, %rdi
> +       movl    %r8d, %edx
> +L(page_cross_copy_only):
> +       leal    CHAR_SIZE(%rdx), %ecx
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +       setc    %al
> +       addq    %rdi, %rdx
> +       leaq    (%rdx, %rax, CHAR_SIZE), %rax
> +#  else
> +       movl    %edx, %eax
> +       adcq    %rdi, %rax
> +#  endif
> +# else
> +       movq    %rdi, %rax
> +# endif
> +       rep     movsb
> +       ret
> +
> +
> +L(best_effort_strncpy):
> +       movq    %rdx, %rcx
> +       xorl    %eax, %eax
> +       movq    %rdi, %r8
> +       /* The length is >= 2^63.  We very much expect to segfault at
> +          the rep stos.  If that doesn't happen, then just strcpy to
> +          finish.  */
> +# ifdef USE_AS_WCSCPY
> +       rep     stosl
> +# else
> +       rep     stosb
> +# endif
> +       movq    %r8, %rdi
> +       jmp     OVERFLOW_STRCPY
> +END(STRNCPY)
> +#endif
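
For readers skimming the assembly, the L(best_effort_strncpy) path above is
only reached when the length is >= 2^63, which cannot be a valid object
size, and is roughly equivalent to the C sketch below.  The function name
is illustrative; OVERFLOW_STRCPY stands for the plain strcpy/stpcpy/
wcscpy/wcpcpy variant, as the comment in the assembly says.

    #include <string.h>

    /* Rough C model of the L(best_effort_strncpy) fallback: zero the
       whole (impossibly large) buffer first -- the rep stos is expected
       to fault -- and if it somehow succeeds, finish with an unbounded
       copy, which yields the same bytes strncpy would have produced.  */
    static char *
    best_effort_strncpy (char *dst, const char *src, size_t n)
    {
      memset (dst, 0, n);       /* rep stos{b,l} in the assembly.  */
      strcpy (dst, src);        /* tail-jump to OVERFLOW_STRCPY.  */
      return dst;
    }
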
> diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> index dca1089060..01bead1435 100644
> --- a/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> +++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> @@ -27,7 +27,10 @@
>  #define VEC_SIZE                       32
>  #include "x86-vec-macros.h"
>
> -#define USE_WITH_AVX           1
> +#ifndef USE_WITH_AVX2
> +# define USE_WITH_AVX          1

Is this checked somewhere?

> +#endif
> +
>  #define SECTION(p)                     p##.avx
>
>  /* 4-byte mov instructions with AVX2.  */
> diff --git a/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
> new file mode 100644
> index 0000000000..a5966701ec
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
> @@ -0,0 +1,26 @@
> +/* Common config for AVX2-RTM VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_AVX2_RTM_VECS_H
> +#define _X86_AVX2_RTM_VECS_H                   1
> +
> +#define USE_WITH_AVX2          1
> +#include "x86-avx-rtm-vecs.h"
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-avx2-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
> new file mode 100644
> index 0000000000..16d7ae5147
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
> @@ -0,0 +1,27 @@
> +/* Common config for AVX2 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_AVX2_VECS_H
> +#define _X86_AVX2_VECS_H                       1
> +
> +#define USE_WITH_AVX2          1

Is this checked somewhere?

> +
> +#include "x86-avx-vecs.h"
> +
> +#endif
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 4/4] x86: Add optimized functions for the wide-character strcpy family
  2022-11-04  8:20   ` [PATCH v2 4/4] x86: Add optimized functions for the wide-character strcpy family Noah Goldstein
@ 2022-11-04 16:47     ` H.J. Lu
  2022-11-04 20:22       ` Noah Goldstein
  0 siblings, 1 reply; 42+ messages in thread
From: H.J. Lu @ 2022-11-04 16:47 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Nov 4, 2022 at 1:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Implemented:
>     wcscat-avx2{+rtm}
>     wcscpy-avx2{+rtm}
>     wcpcpy-avx2{+rtm}
>     wcsncpy-avx2{+rtm}
>     wcpncpy-avx2{+rtm}
>     wcsncat-avx2{+rtm}
>     wcscat-evex
>     wcscpy-evex
>     wcpcpy-evex
>     wcsncpy-evex
>     wcpncpy-evex
>     wcsncat-evex
>
> Performance Changes:
>     Times are from N = 10 runs of the benchmark suite and are reported
>     as the geometric mean of all ratios of New Implementation / Best
>     Old Implementation, where Best Old Implementation is the existing
>     implementation at the highest supported ISA level.  (A short
>     sketch of this computation follows the list below.)
>
>     wcscat-avx2     -> 0.975
>     wcscpy-avx2     -> 0.591
>     wcpcpy-avx2     -> 0.698
>     wcsncpy-avx2    -> 0.730
>     wcpncpy-avx2    -> 0.711
>     wcsncat-avx2    -> 0.954
>     wcscat-evex     -> 0.991
>     wcscpy-evex     -> 0.587
>     wcpcpy-evex     -> 0.695
>     wcsncpy-evex    -> 0.719
>     wcpncpy-evex    -> 0.694
>     wcsncat-evex    -> 0.979
>
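As a sketch of how such a summary number can be computed from raw timings
(the values below are made-up placeholders, not the measurements behind the
list above):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Hypothetical per-benchmark timings; a ratio < 1.0 means the new
         implementation is faster on that benchmark.  */
      double new_time[]      = { 10.0, 20.0, 30.0 };
      double best_old_time[] = { 17.0, 27.0, 42.0 };
      size_t n = sizeof (new_time) / sizeof (new_time[0]);

      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (new_time[i] / best_old_time[i]);

      /* Geometric mean of the ratios, as reported above.  */
      printf ("geometric mean ratio: %f\n", exp (log_sum / n));
      return 0;
    }
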
> Code Size Changes:
>     This change (compared with the last two commits without it)
>     increases the size of libc.so by ~19kb.  For reference, this
>     entire patchset increases libc.so by ~2.5kb (so without the
>     wide-character functions libc.so would decrease by ~16.5kb).
>
> Full check passes on x86-64 and build succeeds for all ISA levels w/
> and w/o multiarch.

Please separate AVX2 and EVEX to reduce the patch size.

> ---
>  sysdeps/x86_64/Makefile                     |   5 +
>  sysdeps/x86_64/multiarch/Makefile           |  26 +++-
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c  | 135 +++++++++++++++++++-
>  sysdeps/x86_64/multiarch/ifunc-wcs.h        |  60 +++++++++
>  sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S  |   3 +
>  sysdeps/x86_64/multiarch/wcpcpy-avx2.S      |   8 ++
>  sysdeps/x86_64/multiarch/wcpcpy-evex.S      |   8 ++
>  sysdeps/x86_64/multiarch/wcpcpy-generic.c   |  27 ++++
>  sysdeps/x86_64/multiarch/wcpcpy.c           |  37 ++++++
>  sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S |   3 +
>  sysdeps/x86_64/multiarch/wcpncpy-avx2.S     |   8 ++
>  sysdeps/x86_64/multiarch/wcpncpy-evex.S     |   8 ++
>  sysdeps/x86_64/multiarch/wcpncpy-generic.c  |  27 ++++
>  sysdeps/x86_64/multiarch/wcpncpy.c          |  37 ++++++
>  sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S  |   3 +
>  sysdeps/x86_64/multiarch/wcscat-avx2.S      |  10 ++
>  sysdeps/x86_64/multiarch/wcscat-evex.S      |   9 ++
>  sysdeps/x86_64/multiarch/wcscat-generic.c   |  27 ++++
>  sysdeps/x86_64/multiarch/wcscat.c           |  37 ++++++
>  sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S  |   3 +
>  sysdeps/x86_64/multiarch/wcscpy-avx2.S      |   7 +
>  sysdeps/x86_64/multiarch/wcscpy-evex.S      |   7 +
>  sysdeps/x86_64/multiarch/wcscpy-generic.c   |   3 +-
>  sysdeps/x86_64/multiarch/wcscpy.c           |  21 +++
>  sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S |   3 +
>  sysdeps/x86_64/multiarch/wcsncat-avx2.S     |   9 ++
>  sysdeps/x86_64/multiarch/wcsncat-evex.S     |   9 ++
>  sysdeps/x86_64/multiarch/wcsncat-generic.c  |  27 ++++
>  sysdeps/x86_64/multiarch/wcsncat.c          |  34 +++++
>  sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S |   3 +
>  sysdeps/x86_64/multiarch/wcsncpy-avx2.S     |   7 +
>  sysdeps/x86_64/multiarch/wcsncpy-evex.S     |   7 +
>  sysdeps/x86_64/multiarch/wcsncpy-generic.c  |  27 ++++
>  sysdeps/x86_64/multiarch/wcsncpy.c          |  37 ++++++
>  sysdeps/x86_64/wcpcpy-generic.c             |  31 +++++
>  sysdeps/x86_64/wcpcpy.S                     |  41 ++++++
>  sysdeps/x86_64/wcpncpy-generic.c            |  31 +++++
>  sysdeps/x86_64/wcpncpy.S                    |  41 ++++++
>  sysdeps/x86_64/wcscat-generic.c             |  31 +++++
>  sysdeps/x86_64/wcscat.S                     |  41 ++++++
>  sysdeps/x86_64/wcscpy.S                     |   2 +
>  sysdeps/x86_64/wcsncat-generic.c            |  31 +++++
>  sysdeps/x86_64/wcsncat.S                    |  39 ++++++
>  sysdeps/x86_64/wcsncpy-generic.c            |  31 +++++
>  sysdeps/x86_64/wcsncpy.S                    |  41 ++++++
>  45 files changed, 1036 insertions(+), 6 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcs.h
>  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcscat-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcscat-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcscat.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcscpy-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncat-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncat-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncat.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy.c
>  create mode 100644 sysdeps/x86_64/wcpcpy-generic.c
>  create mode 100644 sysdeps/x86_64/wcpcpy.S
>  create mode 100644 sysdeps/x86_64/wcpncpy-generic.c
>  create mode 100644 sysdeps/x86_64/wcpncpy.S
>  create mode 100644 sysdeps/x86_64/wcscat-generic.c
>  create mode 100644 sysdeps/x86_64/wcscat.S
>  create mode 100644 sysdeps/x86_64/wcsncat-generic.c
>  create mode 100644 sysdeps/x86_64/wcsncat.S
>  create mode 100644 sysdeps/x86_64/wcsncpy-generic.c
>  create mode 100644 sysdeps/x86_64/wcsncpy.S
>
> diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
> index 3627c5659f..688eb2d7c4 100644
> --- a/sysdeps/x86_64/Makefile
> +++ b/sysdeps/x86_64/Makefile
> @@ -188,8 +188,13 @@ endif
>  ifeq ($(subdir),wcsmbs)
>
>  sysdep_routines += \
> +  wcpcpy-generic \
> +  wcpncpy-generic \
> +  wcscat-generic \
>    wcscpy-generic \
> +  wcsncat-generic \
>    wcsncmp-generic \
> +  wcsncpy-generic \
>    wcsnlen-generic \
>  # sysdep_routines
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 066bfa48d9..f848fc0e28 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -131,6 +131,18 @@ endif
>
>  ifeq ($(subdir),wcsmbs)
>  sysdep_routines += \
> +  wcpcpy-avx2 \
> +  wcpcpy-avx2-rtm \
> +  wcpcpy-evex \
> +  wcpcpy-generic \
> +  wcpncpy-avx2 \
> +  wcpncpy-avx2-rtm \
> +  wcpncpy-evex \
> +  wcpncpy-generic \
> +  wcscat-avx2 \
> +  wcscat-avx2-rtm \
> +  wcscat-evex \
> +  wcscat-generic \
>    wcschr-avx2 \
>    wcschr-avx2-rtm \
>    wcschr-evex \
> @@ -140,6 +152,10 @@ sysdep_routines += \
>    wcscmp-avx2-rtm \
>    wcscmp-evex \
>    wcscmp-sse2 \
> +  wcscpy-avx2 \
> +  wcscpy-avx2-rtm \
> +  wcscpy-evex \
> +  wcscpy-generic \
>    wcscpy-ssse3 \
>    wcslen-avx2 \
>    wcslen-avx2-rtm \
> @@ -147,9 +163,17 @@ sysdep_routines += \
>    wcslen-evex512 \
>    wcslen-sse2 \
>    wcslen-sse4_1 \
> +  wcsncat-avx2 \
> +  wcsncat-avx2-rtm \
> +  wcsncat-evex \
> +  wcsncat-generic \
>    wcsncmp-avx2 \
>    wcsncmp-avx2-rtm \
>    wcsncmp-evex \
> +  wcsncpy-avx2 \
> +  wcsncpy-avx2-rtm \
> +  wcsncpy-evex \
> +  wcsncpy-generic \
>    wcsnlen-avx2 \
>    wcsnlen-avx2-rtm \
>    wcsnlen-evex \
> @@ -163,8 +187,8 @@ sysdep_routines += \
>    wmemchr-avx2 \
>    wmemchr-avx2-rtm \
>    wmemchr-evex \
> -  wmemchr-evex512 \
>    wmemchr-evex-rtm \
> +  wmemchr-evex512 \
>    wmemchr-sse2 \
>    wmemcmp-avx2-movbe \
>    wmemcmp-avx2-movbe-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 7cebee7ec7..71e8953e91 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -901,16 +901,145 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/wcscpy.c.  */
>    IFUNC_IMPL (i, name, wcscpy,
> -             /* ISA V4 wrapper for SSSE3 implementation because
> -                the SSSE3 implementation is also used at ISA
> -                level 3/4.  */
>               X86_IFUNC_IMPL_ADD_V4 (array, i, wcscpy,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wcscpy_evex)
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
> +                                    (CPU_FEATURE_USABLE (AVX2)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wcscpy_avx2)
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
> +                                    (CPU_FEATURE_USABLE (AVX2)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)
> +                                     && CPU_FEATURE_USABLE (RTM)),
> +                                    __wcscpy_avx2_rtm)
> +             X86_IFUNC_IMPL_ADD_V2 (array, i, wcscpy,
>                                      CPU_FEATURE_USABLE (SSSE3),
>                                      __wcscpy_ssse3)
>               X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
>                                      1,
>                                      __wcscpy_generic))
>
> +  /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
> +  IFUNC_IMPL (i, name, wcsncpy,
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wcsncpy_evex)
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
> +                                    (CPU_FEATURE_USABLE (AVX2)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wcsncpy_avx2)
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
> +                                    (CPU_FEATURE_USABLE (AVX2)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)
> +                                     && CPU_FEATURE_USABLE (RTM)),
> +                                    __wcsncpy_avx2_rtm)
> +             X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncpy,
> +                                    1,
> +                                    __wcsncpy_generic))
> +
> +  /* Support sysdeps/x86_64/multiarch/wcpcpy.c.  */
> +  IFUNC_IMPL (i, name, wcpcpy,
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcpcpy,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wcpcpy_evex)
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
> +                                    (CPU_FEATURE_USABLE (AVX2)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wcpcpy_avx2)
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
> +                                    (CPU_FEATURE_USABLE (AVX2)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)
> +                                     && CPU_FEATURE_USABLE (RTM)),
> +                                    __wcpcpy_avx2_rtm)
> +             X86_IFUNC_IMPL_ADD_V1 (array, i, wcpcpy,
> +                                    1,
> +                                    __wcpcpy_generic))
> +
> +  /* Support sysdeps/x86_64/multiarch/wcpncpy.c.  */
> +  IFUNC_IMPL (i, name, wcpncpy,
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcpncpy,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wcpncpy_evex)
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
> +                                    (CPU_FEATURE_USABLE (AVX2)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wcpncpy_avx2)
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
> +                                    (CPU_FEATURE_USABLE (AVX2)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)
> +                                     && CPU_FEATURE_USABLE (RTM)),
> +                                    __wcpncpy_avx2_rtm)
> +             X86_IFUNC_IMPL_ADD_V1 (array, i, wcpncpy,
> +                                    1,
> +                                    __wcpncpy_generic))
> +
> +  /* Support sysdeps/x86_64/multiarch/wcscat.c.  */
> +  IFUNC_IMPL (i, name, wcscat,
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcscat,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wcscat_evex)
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
> +                                    (CPU_FEATURE_USABLE (AVX2)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wcscat_avx2)
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
> +                                    (CPU_FEATURE_USABLE (AVX2)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)
> +                                     && CPU_FEATURE_USABLE (RTM)),
> +                                    __wcscat_avx2_rtm)
> +             X86_IFUNC_IMPL_ADD_V1 (array, i, wcscat,
> +                                    1,
> +                                    __wcscat_generic))
> +
> +  /* Support sysdeps/x86_64/multiarch/wcsncat.c.  */
> +  IFUNC_IMPL (i, name, wcsncat,
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncat,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wcsncat_evex)
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
> +                                    (CPU_FEATURE_USABLE (AVX2)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wcsncat_avx2)
> +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
> +                                    (CPU_FEATURE_USABLE (AVX2)
> +                                     && CPU_FEATURE_USABLE (BMI1)
> +                                     && CPU_FEATURE_USABLE (BMI2)
> +                                     && CPU_FEATURE_USABLE (RTM)),
> +                                    __wcsncat_avx2_rtm)
> +             X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncat,
> +                                    1,
> +                                    __wcsncat_generic))
> +
>    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
>    IFUNC_IMPL (i, name, wcslen,
>               X86_IFUNC_IMPL_ADD_V4 (array, i, wcslen,
> diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
> new file mode 100644
> index 0000000000..cda633d8fb
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
> @@ -0,0 +1,60 @@
> +/* Common definition for ifunc selection of optimized wide-character
> +   string copy functions.
> +
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <init-arch.h>
> +
> +#ifndef GENERIC
> +# define GENERIC generic
> +#endif
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features *cpu_features = __get_cpu_features ();
> +
> +  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> +      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1)
> +      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> +      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> +                                     AVX_Fast_Unaligned_Load, ))
> +    {
> +      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +         && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> +       return OPTIMIZE (evex);
> +
> +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> +       return OPTIMIZE (avx2_rtm);
> +
> +      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> +                                      Prefer_No_VZEROUPPER, !))
> +       return OPTIMIZE (avx2);
> +
> +    }
> +
> +  return OPTIMIZE (GENERIC);
> +}
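
The selection above is transparent to callers: ordinary code keeps calling
the standard function, and the resolver runs once at first use.  A minimal,
purely illustrative caller looks like:

    #include <stdio.h>
    #include <wchar.h>

    int
    main (void)
    {
      wchar_t dst[16];
      /* Dispatches to __wcscpy_evex, __wcscpy_avx2{_rtm}, or the generic
         implementation depending on the CPU features checked in
         IFUNC_SELECTOR above.  */
      wcscpy (dst, L"hello");
      printf ("%ls\n", dst);
      return 0;
    }
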
> diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
> new file mode 100644
> index 0000000000..756280a3ab
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
> @@ -0,0 +1,3 @@
> +#define WCPCPY __wcpcpy_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "wcpcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
> new file mode 100644
> index 0000000000..0fffd912d3
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
> @@ -0,0 +1,8 @@
> +#ifndef WCPCPY
> +# define WCPCPY        __wcpcpy_avx2
> +#endif
> +
> +#define USE_AS_STPCPY
> +#define USE_AS_WCSCPY
> +#define STRCPY WCPCPY
> +#include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcpcpy-evex.S b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
> new file mode 100644
> index 0000000000..ac6429cc07
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
> @@ -0,0 +1,8 @@
> +#ifndef WCPCPY
> +# define WCPCPY        __wcpcpy_evex
> +#endif
> +
> +#define USE_AS_STPCPY
> +#define USE_AS_WCSCPY
> +#define STRCPY WCPCPY
> +#include "strcpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
> new file mode 100644
> index 0000000000..0ba29b081f
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
> @@ -0,0 +1,27 @@
> +/* wcpcpy.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* We always need to build this implementation as strspn-sse4 needs to
> +   be able to fall back to it.  */
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (2)
> +
> +# define WCPCPY __wcpcpy_generic
> +# include <wcsmbs/wcpcpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcpcpy.c b/sysdeps/x86_64/multiarch/wcpcpy.c
> new file mode 100644
> index 0000000000..8f96ddbc99
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpcpy.c
> @@ -0,0 +1,37 @@
> +/* Multiple versions of wcpcpy.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __wcpcpy __redirect_wcpcpy
> +# include <wchar.h>
> +# undef __wcpcpy
> +
> +# define SYMBOL_NAME wcpcpy
> +# include <init-arch.h>
> +
> +# include "ifunc-wcs.h"
> +
> +libc_ifunc_redirected (__redirect_wcpcpy, __wcpcpy, IFUNC_SELECTOR ());
> +weak_alias (__wcpcpy, wcpcpy)
> +# ifdef SHARED
> +__hidden_ver1 (__wcpcpy, __GI___wcpcpy, __redirect_wcpcpy)
> +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpcpy);
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
> new file mode 100644
> index 0000000000..80600d6b01
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
> @@ -0,0 +1,3 @@
> +#define WCPNCPY        __wcpncpy_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "wcpncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
> new file mode 100644
> index 0000000000..b7e594f7b7
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
> @@ -0,0 +1,8 @@
> +#ifndef WCPNCPY
> +# define WCPNCPY       __wcpncpy_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STPCPY
> +#define STRNCPY        WCPNCPY
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcpncpy-evex.S b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
> new file mode 100644
> index 0000000000..62ddb694fe
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
> @@ -0,0 +1,8 @@
> +#ifndef WCPNCPY
> +# define WCPNCPY       __wcpncpy_evex
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STPCPY
> +#define STRNCPY        WCPNCPY
> +#include "strncpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
> new file mode 100644
> index 0000000000..4aab4ecdd2
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
> @@ -0,0 +1,27 @@
> +/* wcpncpy.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* We always need to build this implementation as strspn-sse4 needs to
> +   be able to fall back to it.  */
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (2)
> +
> +# define WCPNCPY __wcpncpy_generic
> +# include <wcsmbs/wcpncpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcpncpy.c b/sysdeps/x86_64/multiarch/wcpncpy.c
> new file mode 100644
> index 0000000000..ed8f307e07
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpncpy.c
> @@ -0,0 +1,37 @@
> +/* Multiple versions of wcpncpy.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __wcpncpy __redirect_wcpncpy
> +# include <wchar.h>
> +# undef __wcpncpy
> +
> +# define SYMBOL_NAME wcpncpy
> +# include <init-arch.h>
> +
> +# include "ifunc-wcs.h"
> +
> +libc_ifunc_redirected (__redirect_wcpncpy, __wcpncpy, IFUNC_SELECTOR ());
> +weak_alias (__wcpncpy, wcpncpy)
> +# ifdef SHARED
> +__hidden_ver1 (__wcpncpy, __GI___wcpncpy, __redirect_wcpncpy)
> +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpncpy);
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
> new file mode 100644
> index 0000000000..e99449a2dc
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
> @@ -0,0 +1,3 @@
> +#define WCSCAT __wcscat_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "wcscat-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2.S b/sysdeps/x86_64/multiarch/wcscat-avx2.S
> new file mode 100644
> index 0000000000..a20f23c09d
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscat-avx2.S
> @@ -0,0 +1,10 @@
> +#ifndef WCSCAT
> +# define WCSCAT        __wcscat_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STRCAT
> +
> +#define STRCPY WCSCAT
> +
> +#include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcscat-evex.S b/sysdeps/x86_64/multiarch/wcscat-evex.S
> new file mode 100644
> index 0000000000..1d017e4899
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscat-evex.S
> @@ -0,0 +1,9 @@
> +#ifndef WCSCAT
> +# define WCSCAT        __wcscat_evex
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STRCAT
> +
> +#define STRCPY WCSCAT
> +#include "strcpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
> new file mode 100644
> index 0000000000..6476f85bbb
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
> @@ -0,0 +1,27 @@
> +/* wcscat.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* We always need to build this implementation as strspn-sse4 needs to
> +   be able to fall back to it.  */
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (2)
> +
> +# define WCSCAT __wcscat_generic
> +# include <wcsmbs/wcscat.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcscat.c b/sysdeps/x86_64/multiarch/wcscat.c
> new file mode 100644
> index 0000000000..3277c44561
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscat.c
> @@ -0,0 +1,37 @@
> +/* Multiple versions of wcscat.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __wcscat __redirect_wcscat
> +# include <wchar.h>
> +# undef __wcscat
> +
> +# define SYMBOL_NAME wcscat
> +# include <init-arch.h>
> +
> +# include "ifunc-wcs.h"
> +
> +libc_ifunc_redirected (__redirect_wcscat, __wcscat, IFUNC_SELECTOR ());
> +weak_alias (__wcscat, wcscat)
> +# ifdef SHARED
> +__hidden_ver1 (__wcscat, __GI___wcscat, __redirect_wcscat)
> +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcscat);
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
> new file mode 100644
> index 0000000000..2f800c8d3e
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
> @@ -0,0 +1,3 @@
> +#define WCSCPY __wcscpy_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "wcscpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2.S b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
> new file mode 100644
> index 0000000000..6bc509da07
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
> @@ -0,0 +1,7 @@
> +#ifndef WCSCPY
> +# define WCSCPY        __wcscpy_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define STRCPY WCSCPY
> +#include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcscpy-evex.S b/sysdeps/x86_64/multiarch/wcscpy-evex.S
> new file mode 100644
> index 0000000000..1069a8e224
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscpy-evex.S
> @@ -0,0 +1,7 @@
> +#ifndef WCSCPY
> +# define WCSCPY        __wcscpy_evex
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define STRCPY WCSCPY
> +#include "strcpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
> index 93d314aaad..600d606c45 100644
> --- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
> @@ -18,8 +18,7 @@
>
>
>  #include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (1)
> +#if ISA_SHOULD_BUILD (2)
>
>  # define WCSCPY  __wcscpy_generic
>  # include <wcsmbs/wcscpy.c>
> diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
> index 92c917b6b4..7f6387817b 100644
> --- a/sysdeps/x86_64/multiarch/wcscpy.c
> +++ b/sysdeps/x86_64/multiarch/wcscpy.c
> @@ -26,6 +26,11 @@
>  # define SYMBOL_NAME wcscpy
>  # include <init-arch.h>
>
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> +
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
>
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
> @@ -35,6 +40,22 @@ IFUNC_SELECTOR (void)
>  {
>    const struct cpu_features* cpu_features = __get_cpu_features ();
>
> +  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> +      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1)
> +      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> +      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load, ))
> +    {
> +      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +         && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> +       return OPTIMIZE (evex);
> +
> +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> +       return OPTIMIZE (avx2_rtm);
> +
> +      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !))
> +       return OPTIMIZE (avx2);
> +    }
> +
>    if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
>      return OPTIMIZE (ssse3);
>
> diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
> new file mode 100644
> index 0000000000..609d6e69c0
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
> @@ -0,0 +1,3 @@
> +#define WCSNCAT        __wcsncat_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "wcsncat-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2.S b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
> new file mode 100644
> index 0000000000..a72105b7e9
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
> @@ -0,0 +1,9 @@
> +#ifndef WCSNCAT
> +# define WCSNCAT       __wcsncat_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STRCAT
> +
> +#define STRNCAT        WCSNCAT
> +#include "strncat-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S
> new file mode 100644
> index 0000000000..392215950a
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S
> @@ -0,0 +1,9 @@
> +#ifndef WCSNCAT
> +# define WCSNCAT       __wcsncat_evex
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STRCAT
> +
> +#define STRNCAT        WCSCAT
> +#include "strncat-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
> new file mode 100644
> index 0000000000..9ced02b35e
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
> @@ -0,0 +1,27 @@
> +/* wcsncat.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* We always need to build this implementation as strspn-sse4 needs to
> +   be able to fallback to it.  */
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (2)
> +
> +# define WCSNCAT __wcsncat_generic
> +# include <wcsmbs/wcsncat.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcsncat.c b/sysdeps/x86_64/multiarch/wcsncat.c
> new file mode 100644
> index 0000000000..49c46aef08
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncat.c
> @@ -0,0 +1,34 @@
> +/* Multiple versions of wcsncat.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define wcsncat __redirect_wcsncat
> +# include <wchar.h>
> +# undef wcsncat
> +
> +# define SYMBOL_NAME wcsncat
> +# include "ifunc-wcs.h"
> +
> +libc_ifunc_redirected (__redirect_wcsncat, wcsncat, IFUNC_SELECTOR ());
> +# ifdef SHARED
> +__hidden_ver1 (wcsncat, __GI_wcsncat, __redirect_wcsncat)
> +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncat);
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
> new file mode 100644
> index 0000000000..cab5a6b820
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
> @@ -0,0 +1,3 @@
> +#define WCSNCPY        __wcsncpy_avx2_rtm
> +#include "x86-avx2-rtm-vecs.h"
> +#include "wcsncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
> new file mode 100644
> index 0000000000..3a1a8a372c
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
> @@ -0,0 +1,7 @@
> +#ifndef WCSNCPY
> +# define WCSNCPY       __wcsncpy_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define STRNCPY        WCSNCPY
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsncpy-evex.S b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
> new file mode 100644
> index 0000000000..2debb8fd6b
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
> @@ -0,0 +1,7 @@
> +#ifndef WCSNCPY
> +# define WCSNCPY       __wcsncpy_evex
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define STRNCPY        WCSNCPY
> +#include "strncpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
> new file mode 100644
> index 0000000000..693521713b
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
> @@ -0,0 +1,27 @@
> +/* wcsncpy.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* We always need to build this implementation as strspn-sse4 needs to
> +   be able to fallback to it.  */
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (2)
> +
> +# define WCSNCPY __wcsncpy_generic
> +# include <wcsmbs/wcsncpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcsncpy.c b/sysdeps/x86_64/multiarch/wcsncpy.c
> new file mode 100644
> index 0000000000..5b89dd4d27
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncpy.c
> @@ -0,0 +1,37 @@
> +/* Multiple versions of wcsncpy.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __wcsncpy __redirect_wcsncpy
> +# include <wchar.h>
> +# undef __wcsncpy
> +
> +# define SYMBOL_NAME wcsncpy
> +# include <init-arch.h>
> +
> +# include "ifunc-wcs.h"
> +
> +libc_ifunc_redirected (__redirect_wcsncpy, __wcsncpy, IFUNC_SELECTOR ());
> +weak_alias (__wcsncpy, wcsncpy)
> +# ifdef SHARED
> +__hidden_ver1 (__wcsncpy, __GI___wcsncpy, __redirect_wcsncpy)
> +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncpy);
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
> new file mode 100644
> index 0000000000..d52525f288
> --- /dev/null
> +++ b/sysdeps/x86_64/wcpcpy-generic.c
> @@ -0,0 +1,31 @@
> +/* wcpcpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcpcpy non-multiarch build is split into two files,
> +   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcpcpy-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL <= 3
> +
> +# include <wcsmbs/wcpcpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
> new file mode 100644
> index 0000000000..ec32dc070a
> --- /dev/null
> +++ b/sysdeps/x86_64/wcpcpy.S
> @@ -0,0 +1,41 @@
> +/* wcpcpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcpcpy non-multiarch build is split into two files,
> +   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcpcpy-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL >= 4
> +
> +# define WCPCPY        __wcpcpy
> +
> +# define DEFAULT_IMPL_V4       "multiarch/wcpcpy-evex.S"
> +# define DEFAULT_IMPL_V3       "multiarch/wcpcpy-avx2.S"
> +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> +   should never be used from here.  */
> +# define DEFAULT_IMPL_V1       "ERROR -- Invalid ISA IMPL"
> +
> +# include "isa-default-impl.h"
> +
> +weak_alias (__wcpcpy, wcpcpy)
> +libc_hidden_def (__wcpcpy)
> +#endif
> diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
> new file mode 100644
> index 0000000000..871219a445
> --- /dev/null
> +++ b/sysdeps/x86_64/wcpncpy-generic.c
> @@ -0,0 +1,31 @@
> +/* wcpncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcpncpy non-multiarch build is split into two files,
> +   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcpncpy-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL <= 3
> +
> +# include <wcsmbs/wcpncpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
> new file mode 100644
> index 0000000000..68e6ff1836
> --- /dev/null
> +++ b/sysdeps/x86_64/wcpncpy.S
> @@ -0,0 +1,41 @@
> +/* wcpncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcpncpy non-multiarch build is split into two files,
> +   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcpncpy-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL >= 4
> +
> +# define WCPNCPY       __wcpncpy
> +
> +# define DEFAULT_IMPL_V4       "multiarch/wcpncpy-evex.S"
> +# define DEFAULT_IMPL_V3       "multiarch/wcpncpy-avx2.S"
> +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> +   should never be used from here.  */
> +# define DEFAULT_IMPL_V1       "ERROR -- Invalid ISA IMPL"
> +
> +# include "isa-default-impl.h"
> +
> +weak_alias (__wcpncpy, wcpncpy)
> +libc_hidden_def (__wcpncpy)
> +#endif
> diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
> new file mode 100644
> index 0000000000..85f981a81f
> --- /dev/null
> +++ b/sysdeps/x86_64/wcscat-generic.c
> @@ -0,0 +1,31 @@
> +/* wcscat dispatch for RTLD and non-multiarch .c ISA level 1 build.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcscat non-multiarch build is split into two files,
> +   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcscat-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL <= 3
> +
> +# include <wcsmbs/wcscat.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
> new file mode 100644
> index 0000000000..007de3c40c
> --- /dev/null
> +++ b/sysdeps/x86_64/wcscat.S
> @@ -0,0 +1,41 @@
> +/* wcscat dispatch for RTLD and non-multiarch .c ISA level 1 build.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcscat non-multiarch build is split into two files,
> +   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcscat-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL >= 4
> +
> +# define WCSCAT        __wcscat
> +
> +# define DEFAULT_IMPL_V4       "multiarch/wcscat-evex.S"
> +# define DEFAULT_IMPL_V3       "multiarch/wcscat-avx2.S"
> +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> +   should never be used from here.  */
> +# define DEFAULT_IMPL_V1       "ERROR -- Invalid ISA IMPL"
> +
> +# include "isa-default-impl.h"
> +
> +weak_alias (__wcscat, wcscat)
> +libc_hidden_def (__wcscat)
> +#endif
> diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
> index 11d0bb4bab..ab9288ed74 100644
> --- a/sysdeps/x86_64/wcscpy.S
> +++ b/sysdeps/x86_64/wcscpy.S
> @@ -28,6 +28,8 @@
>
>  # define WCSCPY        __wcscpy
>
> +# define DEFAULT_IMPL_V4       "multiarch/wcscpy-evex.S"
> +# define DEFAULT_IMPL_V3       "multiarch/wcscpy-avx2.S"
>  # define DEFAULT_IMPL_V2       "multiarch/wcscpy-ssse3.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
> diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
> new file mode 100644
> index 0000000000..2cc0f7b11a
> --- /dev/null
> +++ b/sysdeps/x86_64/wcsncat-generic.c
> @@ -0,0 +1,31 @@
> +/* wcsncat dispatch for RTLD and non-multiarch .c ISA level 1 build.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcsncat non-multiarch build is split into two files,
> +   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcsncat-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL <= 3
> +
> +# include <wcsmbs/wcsncat.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
> new file mode 100644
> index 0000000000..3f4c7948db
> --- /dev/null
> +++ b/sysdeps/x86_64/wcsncat.S
> @@ -0,0 +1,39 @@
> +/* wcsncat dispatch for RTLD and non-multiarch .c ISA level 1 build.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcsncat non-multiarch build is split into two files,
> +   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcsncat-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL >= 4
> +
> +# define WCSNCAT       wcsncat
> +
> +# define DEFAULT_IMPL_V4       "multiarch/wcsncat-evex.S"
> +# define DEFAULT_IMPL_V3       "multiarch/wcsncat-avx2.S"
> +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> +   should never be used from here.  */
> +# define DEFAULT_IMPL_V1       "ERROR -- Invalid ISA IMPL"
> +
> +# include "isa-default-impl.h"
> +
> +#endif
> diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
> new file mode 100644
> index 0000000000..49d06b8ae8
> --- /dev/null
> +++ b/sysdeps/x86_64/wcsncpy-generic.c
> @@ -0,0 +1,31 @@
> +/* wcsncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcsncpy non-multiarch build is split into two files,
> +   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcsncpy-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL <= 3
> +
> +# include <wcsmbs/wcsncpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
> new file mode 100644
> index 0000000000..e1428fd4c1
> --- /dev/null
> +++ b/sysdeps/x86_64/wcsncpy.S
> @@ -0,0 +1,41 @@
> +/* wcsncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcsncpy non-multiarch build is split into two files,
> +   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcsncpy-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL >= 4
> +
> +# define WCSNCPY       __wcsncpy
> +
> +# define DEFAULT_IMPL_V4       "multiarch/wcsncpy-evex.S"
> +# define DEFAULT_IMPL_V3       "multiarch/wcsncpy-avx2.S"
> +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> +   should never be used from here.  */
> +# define DEFAULT_IMPL_V1       "ERROR -- Invalid ISA IMPL"
> +
> +# include "isa-default-impl.h"
> +
> +weak_alias (__wcsncpy, wcsncpy)
> +libc_hidden_def (__wcsncpy)
> +#endif
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v3 1/5] benchtests: Make str{n}{cat|cpy} benchmarks output json
  2022-11-03  8:53 [PATCH v1 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json Noah Goldstein
                   ` (4 preceding siblings ...)
  2022-11-04  8:20 ` [PATCH v2 " Noah Goldstein
@ 2022-11-04 20:13 ` Noah Goldstein
  2022-11-04 20:13   ` [PATCH v3 2/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
                     ` (4 more replies)
  5 siblings, 5 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 20:13 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Json output is easier to parse and most other benchmarks already do
the same.
---
 benchtests/bench-strcat.c  | 106 ++++++++++++++++++++++++++-------
 benchtests/bench-strcpy.c  | 118 ++++++++++++++++++++++++-------------
 benchtests/bench-strncat.c |  94 +++++++++++++++++++++--------
 benchtests/bench-strncpy.c |  94 +++++++++++++++++++++--------
 4 files changed, 297 insertions(+), 115 deletions(-)

diff --git a/benchtests/bench-strcat.c b/benchtests/bench-strcat.c
index 749318e37e..890b34b4c1 100644
--- a/benchtests/bench-strcat.c
+++ b/benchtests/bench-strcat.c
@@ -35,6 +35,7 @@
 # define SMALL_CHAR 1273
 #endif /* WIDE */
 
+#include "json-lib.h"
 
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
 
@@ -49,7 +50,7 @@ IMPL (STRCAT, 1)
 IMPL (generic_strcat, 0)
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src)
 {
   size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS_LARGE;
   timing_t start, stop, cur;
@@ -64,7 +65,8 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
 
   if (STRCMP (dst + k, src) != 0)
     {
-      error (0, 0, "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
+      error (0, 0,
+	     "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
 	     impl->name, dst, src);
       ret = 1;
       return;
@@ -80,11 +82,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
+	 size_t len2, int max_char)
 {
   size_t i;
   CHAR *s1, *s2;
@@ -107,53 +110,112 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
   for (i = 0; i < len2; i++)
     s2[i] = 32 + 23 * i % (max_char - 32);
 
-  printf ("Length %4zd/%4zd, alignment %2zd/%2zd:", len1, len2, align1, align2);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "len1", len1);
+  json_attr_uint (json_ctx, "len2", len2);
+  json_attr_uint (json_ctx, "max_char", max_char);
+
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
     {
       s2[len2] = '\0';
-      do_one_test (impl, s2, s1);
+      do_one_test (json_ctx, impl, s2, s1);
     }
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
+  json_ctx_t json_ctx;
   size_t i;
 
   test_init ();
 
-  printf ("%28s", "");
+  test_init ();
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
   for (i = 0; i < 16; ++i)
     {
-      do_test (0, 0, i, i, SMALL_CHAR);
-      do_test (0, 0, i, i, BIG_CHAR);
-      do_test (0, i, i, i, SMALL_CHAR);
-      do_test (i, 0, i, i, BIG_CHAR);
+      do_test (&json_ctx, 0, 0, i, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, i, BIG_CHAR);
+      do_test (&json_ctx, 0, i, i, i, SMALL_CHAR);
+      do_test (&json_ctx, i, 0, i, i, BIG_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 0, 8 << i, 8 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
-      do_test (0, 0, 8 << i, 2 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 8 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 8 << i, 2 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (i, 2 * i, 8 << i, 1, SMALL_CHAR);
-      do_test (2 * i, i, 8 << i, 1, BIG_CHAR);
-      do_test (i, i, 8 << i, 10, SMALL_CHAR);
-      do_test (i, i, 8 << i, 10, BIG_CHAR);
+      do_test (&json_ctx, i, 2 * i, 8 << i, 1, SMALL_CHAR);
+      do_test (&json_ctx, 2 * i, i, 8 << i, 1, BIG_CHAR);
+      do_test (&json_ctx, i, i, 8 << i, 10, SMALL_CHAR);
+      do_test (&json_ctx, i, i, 8 << i, 10, BIG_CHAR);
+    }
+
+  for (i = 32; i < 256; i += 32)
+    {
+      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
     }
 
+  for (; i < 512; i += 64)
+    {
+      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
+    }
+
+  for (; i < 1024; i += 128)
+    {
+      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
+    }
+
+  for (; i < 2048; i += 256)
+    {
+      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
 
diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
index 29deb8a46a..af8673e137 100644
--- a/benchtests/bench-strcpy.c
+++ b/benchtests/bench-strcpy.c
@@ -26,16 +26,18 @@
 # define SMALL_CHAR 127
 #endif
 
+#include "json-lib.h"
+
 #ifndef STRCPY_RESULT
 # define STRCPY_RESULT(dst, len) dst
 # define TEST_MAIN
 # ifndef WIDE
-#  define TEST_NAME "strcpy"
+#   define TEST_NAME "strcpy"
 # else
-#  define TEST_NAME "wcscpy"
-#  define generic_strcpy generic_wcscpy
+#   define TEST_NAME "wcscpy"
+#   define generic_strcpy generic_wcscpy
 # endif
-#include "bench-string.h"
+# include "bench-string.h"
 
 CHAR *
 generic_strcpy (CHAR *dst, const CHAR *src)
@@ -51,8 +53,8 @@ IMPL (generic_strcpy, 0)
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
-	     size_t len __attribute__((unused)))
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
+	     size_t len __attribute__ ((unused)))
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
@@ -77,23 +79,24 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
-	  CALL (impl, dst, src);
+      CALL (impl, dst, src);
     }
   TIMING_NOW (stop);
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t len, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
+	 int max_char)
 {
   size_t i;
   CHAR *s1, *s2;
-/* For wcscpy: align1 and align2 here mean alignment not in bytes,
-   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
-   len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
+  /* For wcscpy: align1 and align2 here mean alignment not in bytes,
+     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
+     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
   align1 &= 7;
   if ((align1 + len) * sizeof (CHAR) >= page_size)
     return;
@@ -109,65 +112,96 @@ do_test (size_t align1, size_t align2, size_t len, int max_char)
     s1[i] = 32 + 23 * i % (max_char - 32);
   s1[len] = 0;
 
-  printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len,
-	  align1 * sizeof (CHAR), align2 * sizeof (CHAR));
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "len", len);
+  json_attr_uint (json_ctx, "max_char", max_char);
+
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s2, s1, len);
+    do_one_test (json_ctx, impl, s2, s1, len);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 test_main (void)
 {
+  json_ctx_t json_ctx;
   size_t i;
 
   test_init ();
 
-  printf ("%23s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
   for (i = 0; i < 16; ++i)
     {
-      do_test (0, 0, i, SMALL_CHAR);
-      do_test (0, 0, i, BIG_CHAR);
-      do_test (0, i, i, SMALL_CHAR);
-      do_test (i, 0, i, BIG_CHAR);
+      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, BIG_CHAR);
+      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
+      do_test (&json_ctx, i, 0, i, BIG_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 0, 8 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, SMALL_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (i, 2 * i, 8 << i, SMALL_CHAR);
-      do_test (2 * i, i, 8 << i, BIG_CHAR);
-      do_test (i, i, 8 << i, SMALL_CHAR);
-      do_test (i, i, 8 << i, BIG_CHAR);
+      do_test (&json_ctx, i, 2 * i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 2 * i, i, 8 << i, BIG_CHAR);
+      do_test (&json_ctx, i, i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, i, i, 8 << i, BIG_CHAR);
     }
 
-  for (i = 16; i <= 512; i+=4)
+  for (i = 16; i <= 512; i += 4)
     {
-      do_test (0, 4, i, SMALL_CHAR);
-      do_test (4, 0, i, BIG_CHAR);
-      do_test (4, 4, i, SMALL_CHAR);
-      do_test (2, 2, i, BIG_CHAR);
-      do_test (2, 6, i, SMALL_CHAR);
-      do_test (6, 2, i, BIG_CHAR);
-      do_test (1, 7, i, SMALL_CHAR);
-      do_test (7, 1, i, BIG_CHAR);
-      do_test (3, 4, i, SMALL_CHAR);
-      do_test (4, 3, i, BIG_CHAR);
-      do_test (5, 7, i, SMALL_CHAR);
-      do_test (7, 5, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 4, i, SMALL_CHAR);
+      do_test (&json_ctx, 4, 0, i, BIG_CHAR);
+      do_test (&json_ctx, 4, 4, i, SMALL_CHAR);
+      do_test (&json_ctx, 2, 2, i, BIG_CHAR);
+      do_test (&json_ctx, 2, 6, i, SMALL_CHAR);
+      do_test (&json_ctx, 6, 2, i, BIG_CHAR);
+      do_test (&json_ctx, 1, 7, i, SMALL_CHAR);
+      do_test (&json_ctx, 7, 1, i, BIG_CHAR);
+      do_test (&json_ctx, 3, 4, i, SMALL_CHAR);
+      do_test (&json_ctx, 4, 3, i, BIG_CHAR);
+      do_test (&json_ctx, 5, 7, i, SMALL_CHAR);
+      do_test (&json_ctx, 7, 5, i, SMALL_CHAR);
     }
 
+  for (i = 1; i < 2048; i += i)
+    {
+      do_test (&json_ctx, 1, 0, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
+      do_test (&json_ctx, i, i, i, SMALL_CHAR);
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
 
diff --git a/benchtests/bench-strncat.c b/benchtests/bench-strncat.c
index b148c55279..5ccc09a4f8 100644
--- a/benchtests/bench-strncat.c
+++ b/benchtests/bench-strncat.c
@@ -33,6 +33,8 @@
 # define SMALL_CHAR 1273
 #endif /* WIDE */
 
+#include "json-lib.h"
+
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
 
 CHAR *
@@ -49,7 +51,8 @@ IMPL (STRNCAT, 2)
 IMPL (generic_strncat, 0)
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
+	     size_t n)
 {
   size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS8;
   timing_t start, stop, cur;
@@ -65,8 +68,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
   size_t len = STRLEN (src);
   if (MEMCMP (dst + k, src, len + 1 > n ? n : len + 1) != 0)
     {
-      error (0, 0, "Incorrect concatenation in function %s",
-	     impl->name);
+      error (0, 0, "Incorrect concatenation in function %s", impl->name);
       ret = 1;
       return;
     }
@@ -88,12 +90,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t len1, size_t len2,
-	 size_t n, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
+	 size_t len2, size_t n, int max_char)
 {
   size_t i;
   CHAR *s1, *s2;
@@ -118,53 +120,93 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
   for (i = 0; i < len2; i++)
     s2[i] = 32 + 23 * i % (max_char - 32);
 
-  printf ("Length %4zd/%4zd, alignment %2zd/%2zd, N %4zd:",
-	  len1, len2, align1, align2, n);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "len1", len1);
+  json_attr_uint (json_ctx, "len2", len2);
+  json_attr_uint (json_ctx, "n", n);
+  json_attr_uint (json_ctx, "max_char", max_char);
+
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
     {
       s2[len2] = '\0';
-      do_one_test (impl, s2, s1, n);
+      do_one_test (json_ctx, impl, s2, s1, n);
     }
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 int
 main (void)
 {
+  json_ctx_t json_ctx;
   size_t i, n;
 
   test_init ();
 
-  printf ("%28s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
-  for (n = 2; n <= 2048; n*=4)
+  for (n = 2; n <= 2048; n *= 4)
     {
-      do_test (0, 2, 2, 2, n, SMALL_CHAR);
-      do_test (0, 0, 4, 4, n, SMALL_CHAR);
-      do_test (4, 0, 4, 4, n, BIG_CHAR);
-      do_test (0, 0, 8, 8, n, SMALL_CHAR);
-      do_test (0, 8, 8, 8, n, SMALL_CHAR);
+      do_test (&json_ctx, 0, 2, 2, 2, n, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 4, 4, n, SMALL_CHAR);
+      do_test (&json_ctx, 4, 0, 4, 4, n, BIG_CHAR);
+      do_test (&json_ctx, 0, 0, 8, 8, n, SMALL_CHAR);
+      do_test (&json_ctx, 0, 8, 8, 8, n, SMALL_CHAR);
 
       for (i = 1; i < 8; ++i)
 	{
-	  do_test (0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
-	  do_test (8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
-	  do_test (0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
-	  do_test (8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
 	}
 
       for (i = 1; i < 8; ++i)
 	{
-	  do_test (i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
-	  do_test (2 * i, i, 8 << i, 1, n, BIG_CHAR);
-	  do_test (i, i, 8 << i, 10, n, SMALL_CHAR);
+	  do_test (&json_ctx, i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
+	  do_test (&json_ctx, 2 * i, i, 8 << i, 1, n, BIG_CHAR);
+	  do_test (&json_ctx, i, i, 8 << i, 10, n, SMALL_CHAR);
 	}
     }
 
+  for (i = 128; i < 2048; i += i)
+    {
+      for (n = i - 64; n <= i + 64; n += 32)
+	{
+	  do_test (&json_ctx, 1, 0, i, i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 0, i, i, i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, i, i, n, SMALL_CHAR);
+	  do_test (&json_ctx, i, i, i, i, n, SMALL_CHAR);
+	  do_test (&json_ctx, 1, 0, i, n, i, SMALL_CHAR);
+	  do_test (&json_ctx, 0, i, i, n, i, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, i, n, i, SMALL_CHAR);
+	  do_test (&json_ctx, i, i, i, n, i, SMALL_CHAR);
+	}
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
diff --git a/benchtests/bench-strncpy.c b/benchtests/bench-strncpy.c
index 8207d99f4d..f621cbfe09 100644
--- a/benchtests/bench-strncpy.c
+++ b/benchtests/bench-strncpy.c
@@ -24,6 +24,8 @@
 # define SMALL_CHAR 127
 #endif /* !WIDE */
 
+#include "json-lib.h"
+
 #ifndef STRNCPY_RESULT
 # define STRNCPY_RESULT(dst, len, n) dst
 # define TEST_MAIN
@@ -52,7 +54,8 @@ IMPL (generic_strncpy, 0)
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
+	     size_t len, size_t n)
 {
   size_t i, iters = INNER_LOOP_ITERS_LARGE * (4 / CHARBYTES);
   timing_t start, stop, cur;
@@ -77,7 +80,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
       size_t i;
 
       for (i = len; i < n; ++i)
-	if (dst [i] != '\0')
+	if (dst[i] != '\0')
 	  {
 	    error (0, 0, "Wrong result in function %s", impl->name);
 	    ret = 1;
@@ -94,17 +97,18 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
 
   TIMING_DIFF (cur, start, stop);
 
-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
 }
 
 static void
-do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
+	 size_t n, int max_char)
 {
   size_t i;
   CHAR *s1, *s2;
 
-/* For wcsncpy: align1 and align2 here mean alignment not in bytes,
-   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
+  /* For wcsncpy: align1 and align2 here mean alignment not in bytes,
+     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
   align1 &= 7;
   if ((align1 + len) * sizeof (CHAR) >= page_size)
     return;
@@ -123,46 +127,86 @@ do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
        ++i)
     s1[i] = 32 + 32 * i % (max_char - 32);
 
-  printf ("Length %4zd, n %4zd, alignment %2zd/%2zd:", len, n, align1, align2);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "align1", align1);
+  json_attr_uint (json_ctx, "align2", align2);
+  json_attr_uint (json_ctx, "len", len);
+  json_attr_uint (json_ctx, "n", n);
+  json_attr_uint (json_ctx, "max_char", max_char);
+
+  json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, s2, s1, len, n);
+    do_one_test (json_ctx, impl, s2, s1, len, n);
 
-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }
 
 static int
 test_main (void)
 {
-  size_t i;
+  json_ctx_t json_ctx;
+  size_t i, j;
 
   test_init ();
 
-  printf ("%28s", "");
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");
+
+  json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (i, i, 16, 16, SMALL_CHAR);
-      do_test (i, i, 16, 16, BIG_CHAR);
-      do_test (i, 2 * i, 16, 16, SMALL_CHAR);
-      do_test (2 * i, i, 16, 16, BIG_CHAR);
-      do_test (8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
-      do_test (2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
-      do_test (2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
+      do_test (&json_ctx, i, i, 16, 16, SMALL_CHAR);
+      do_test (&json_ctx, i, i, 16, 16, BIG_CHAR);
+      do_test (&json_ctx, i, 2 * i, 16, 16, SMALL_CHAR);
+      do_test (&json_ctx, 2 * i, i, 16, 16, BIG_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
+      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
+      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
     }
 
   for (i = 1; i < 8; ++i)
     {
-      do_test (0, 0, 4 << i, 8 << i, SMALL_CHAR);
-      do_test (0, 0, 16 << i, 8 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
-      do_test (8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 4 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 0, 0, 16 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
+      do_test (&json_ctx, 8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
     }
 
+  for (i = 128; i < 2048; i += i)
+    {
+      for (j = i - 64; j <= i + 64; j += 32)
+	{
+	  do_test (&json_ctx, 1, 0, i, j, SMALL_CHAR);
+	  do_test (&json_ctx, 0, i, i, j, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, i, j, SMALL_CHAR);
+	  do_test (&json_ctx, i, i, i, j, SMALL_CHAR);
+	  do_test (&json_ctx, 1, 0, j, i, SMALL_CHAR);
+	  do_test (&json_ctx, 0, i, j, i, SMALL_CHAR);
+	  do_test (&json_ctx, 0, 0, j, i, SMALL_CHAR);
+	  do_test (&json_ctx, i, i, j, i, SMALL_CHAR);
+	}
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
   return ret;
 }
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v3 2/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-04 20:13 ` [PATCH v3 1/5] " Noah Goldstein
@ 2022-11-04 20:13   ` Noah Goldstein
  2022-11-04 21:46     ` H.J. Lu
  2022-11-04 20:13   ` [PATCH v3 3/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 20:13 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
    1. Use more overlapping stores to avoid branches.
    2. Reduce how unrolled the aligning copies are (this is more of a
       code-size save; it's a negative for some sizes in terms of
       perf).
    3. Improve the loop a bit (similar to what we do in strlen with
       2x vpminu + kortest instead of 3x vpminu + kmov + test); a
       sketch of this loop shape follows the list.
    4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
       number that are taken.
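
The sketch referred to in (3): two vpminu chains feeding a single
kortest.  Register names, the VEC_SIZE macro and the exact scheduling
are assumptions for illustration only, not the code in this patch:

        /* %rdi is VEC_SIZE-aligned and points into the string being
           scanned; VEC_SIZE is the vector width in bytes.  */
    L(loop_4x_vec):
        vmovdqa64 (VEC_SIZE * 4)(%rdi), %ymm17
        vpminub   (VEC_SIZE * 5)(%rdi), %ymm17, %ymm18
        vmovdqa64 (VEC_SIZE * 6)(%rdi), %ymm19
        vpminub   (VEC_SIZE * 7)(%rdi), %ymm19, %ymm20
        /* Mask bits are set where a byte of the pairwise min is zero.  */
        vptestnmb %ymm18, %ymm18, %k0
        vptestnmb %ymm20, %ymm20, %k2
        subq      $(VEC_SIZE * -4), %rdi
        /* One mask OR and one branch cover all four vectors.  */
        kortestd  %k0, %k2
        jz        L(loop_4x_vec)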

Performance Changes:

    Times are from N = 10 runs of the benchmark suite and are
    reported as geometric mean of all ratios of
    New Implementation / Old Implementation.

    stpcpy-evex      -> 0.922
    strcat-evex      -> 0.985
    strcpy-evex      -> 0.880

    strncpy-evex     -> 0.831
    stpncpy-evex     -> 0.780

    strncat-evex     -> 0.958

Code Size Changes:
    function         -> Bytes New / Bytes Old -> Ratio

    strcat-evex      ->  819 / 1874 -> 0.437
    strcpy-evex      ->  700 / 1074 -> 0.652
    stpcpy-evex      ->  735 / 1094 -> 0.672

    strncpy-evex     -> 1397 / 2611 -> 0.535
    stpncpy-evex     -> 1489 / 2691 -> 0.553

    strncat-evex     -> 1184 / 2832 -> 0.418

Notes:
    1. Because of the significant differences between the
       implementations, they are split into three files.

           strcpy-evex.S    -> strcpy, stpcpy, strcat
           strncpy-evex.S   -> strncpy
           strncat-evex.S   -> strncat

       I couldn't find a way to merge them without making the
       ifdefs incredibly difficult to follow.

    2. All implementations can be made evex512 by including
       "x86-evex512-vecs.h" at the top.

    3. All implementations have an optional define:
        `USE_EVEX_MASKED_STORE`
       Setting it to one uses evex-masked stores for handling short
       strings.  This saves code size and branches.  It's disabled
       for all implementations at the moment as there are some
       serious drawbacks to masked stores in certain cases, but
       that may be fixed on future architectures.  A sketch of such
       a masked store follows these notes.
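
A hedged sketch of what such a masked store can look like for a copy
of LEN bytes (LEN < VEC_SIZE; LEN assumed to be in %edx, source in
%rsi, destination in %rdi; the instruction choice is illustrative and
not taken from this patch):

        movl      $-1, %eax
        /* BMI2 bzhi: keep only the low LEN bits of the all-ones word.  */
        bzhil     %edx, %eax, %eax
        kmovd     %eax, %k1
        /* The masked load and store touch only the first LEN bytes, so
           the usual tail branches for short strings are not needed.  */
        vmovdqu8  (%rsi), %ymm17{%k1}{z}
        vmovdqu8  %ymm17, (%rdi){%k1}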

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
 sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
 sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
 sysdeps/x86_64/multiarch/strcat-strlen-evex.S |  110 ++
 sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
 sysdeps/x86_64/multiarch/strncat-evex.S       |  525 ++++++-
 sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
 .../multiarch/strncpy-or-cat-overflow-def.h   |   65 +
 7 files changed, 2100 insertions(+), 1173 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h

diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
index 99ea76a372..3693491baa 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
@@ -3,6 +3,5 @@
 #endif
 
 #define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY	STPNCPY
-#include "strcpy-evex.S"
+#define STRNCPY	STPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
index 0e2df947e9..b4207b7889 100644
--- a/sysdeps/x86_64/multiarch/strcat-evex.S
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -1,286 +1,7 @@
-/* strcat with 256-bit EVEX instructions.
-   Copyright (C) 2021-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-#  define STRCAT  __strcat_evex
-# endif
-
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
-/* zero register */
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMM0		ymm17
-# define YMM1		ymm18
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE	32
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRCAT)
-	mov	%rdi, %r9
-# ifdef USE_AS_STRNCAT
-	mov	%rdx, %r8
-# endif
-
-	xor	%eax, %eax
-	mov	%edi, %ecx
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-	cmp	$(VEC_SIZE * 3), %ecx
-	ja	L(fourth_vector_boundary)
-	vpcmpb	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_first_vector)
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	jmp	L(align_vec_size_start)
-L(fourth_vector_boundary):
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	vpcmpb	$0, (%rax), %YMMZERO, %k0
-	mov	$-1, %r10d
-	sub	%rax, %rcx
-	shl	%cl, %r10d
-	kmovd	%k0, %edx
-	and	%r10d, %edx
-	jnz	L(exit)
-
-L(align_vec_size_start):
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 4), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	kmovd	%k4, %edx
-	add	$(VEC_SIZE * 4), %rax
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 4), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 5), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	add	$VEC_SIZE, %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	add	$VEC_SIZE, %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
-	add	$VEC_SIZE, %rax
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	add	$VEC_SIZE, %rax
-
-	.p2align 4
-L(align_four_vec_loop):
-	VMOVA	(%rax), %YMM0
-	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
-	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
-	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
-	vpminub	%YMM0, %YMM1, %YMM0
-	/* If K0 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM0, %YMMZERO, %k0
-	add	$(VEC_SIZE * 4), %rax
-	ktestd	%k0, %k0
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
-	sub	$(VEC_SIZE * 5), %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit):
-	sub	%rdi, %rax
-L(exit_null_on_first_vector):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_second_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$VEC_SIZE, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_third_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 2), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fourth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 3), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fifth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-
-	.p2align 4
-L(StartStrcpyPart):
-	lea	(%r9, %rax), %rdi
-	mov	%rsi, %rcx
-	mov	%r9, %rax      /* save result */
-
-# ifdef USE_AS_STRNCAT
-	test	%r8, %r8
-	jz	L(ExitZero)
-#  define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-evex.S"
+#ifndef STRCAT
+# define STRCAT	__strcat_evex
 #endif
+
+#define USE_AS_STRCAT
+#define STRCPY	STRCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
new file mode 100644
index 0000000000..9530d7b683
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
@@ -0,0 +1,110 @@
+/* strlen used for beginning of str{n}cat using EVEX 256/512.
+   Copyright (C) 2011-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+/* NOTE: This file is meant to be included by strcat-evex or
+   strncat-evex and does not stand alone.  Before including it, %rdi
+   must be saved in %rax.  */
+
+
+/* Simple strlen implementation that ends at
+   L(strcat_strlen_done).  */
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
+	movq	%rdi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+#ifdef USE_AS_WCSCPY
+	subl	%r8d, %edi
+	shrl	$2, %edi
+#endif
+	shrx	%VRDI, %VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+	movq	%rax, %rdi
+#endif
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v0)
+
+
+	VPCMPEQ	VEC_SIZE(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	leaq	(VEC_SIZE)(%r8), %rdi
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v1)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v2)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v3)
+
+	andq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4,, 8
+L(loop_2x_vec):
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(0)
+	VPMIN	(VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(2)
+	VPMIN	(VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	VPTESTN	%VMM(3), %VMM(3), %k3
+	subq	$(VEC_SIZE * -4), %rdi
+	KORTEST	%k1, %k3
+	jz	L(loop_2x_vec)
+
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v0)
+
+	KMOV	%k1, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v1)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v2)
+
+	KMOV	%k3, %VRCX
+L(bsf_and_done_v3):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v2):
+	bsf	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
+	jmp	L(strcat_strlen_done)
+
+	.p2align 4,, 4
+L(bsf_and_done_v1):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v0):
+	bsf	%VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#else
+	addq	%rcx, %rdi
+#endif
+L(strcat_strlen_done):
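
The strlen fragment above only locates the end of the destination; the copy
itself is done by the file that includes it.  As a rough reference for what
the prologue plus the shared strcpy body compute together, here is a minimal
C sketch (hypothetical name, byte strings only, not part of the patch):

/* ref_strcat: find the null of DST, then copy SRC through its null.
   Equivalent in effect to strcat (dst, src).  */
static char *
ref_strcat (char *dst, const char *src)
{
  char *end = dst;
  while (*end != '\0')			/* The strlen prologue.  */
    end++;
  while ((*end++ = *src++) != '\0')	/* The strcpy body.  */
    ;
  return dst;				/* strcat returns the original DST.  */
}
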
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
index 82e45ac675..1ba0195ed2 100644
--- a/sysdeps/x86_64/multiarch/strcpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -1,4 +1,4 @@
-/* strcpy with 256-bit EVEX instructions.
+/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
    Copyright (C) 2021-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,990 +17,526 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <isa-level.h>
-
 #if ISA_SHOULD_BUILD (4)
 
 
-# ifndef USE_AS_STRCAT
-#  include <sysdep.h>
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+	/* Use movsb in page cross case to save code size.  */
+# define USE_MOVSB_IN_PAGE_CROSS	1
 
-#  ifndef STRCPY
-#   define STRCPY  __strcpy_evex
-#  endif
+# include <sysdep.h>
 
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
-/* Number of bytes in a vector register */
-# ifndef VEC_SIZE
-#  define VEC_SIZE	32
+# ifndef STRCPY
+#  define STRCPY	__strcpy_evex
 # endif
 
-# define XMM2		xmm18
-# define XMM3		xmm19
 
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-# define YMM7		ymm23
+# ifdef USE_AS_WCSCPY
+#  define VMOVU_MASK	vmovdqu32
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
 
-# ifndef USE_AS_STRCAT
+#  define REP_MOVS	rep movsd
 
-/* zero register */
-#  define XMMZERO	xmm16
-#  define YMMZERO	ymm16
-#  define YMM1		ymm17
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRCPY)
-#  ifdef USE_AS_STRNCPY
-	mov	%RDX_LP, %R8_LP
-	test	%R8_LP, %R8_LP
-	jz	L(ExitZero)
-#  endif
-	mov	%rsi, %rcx
-#  ifndef USE_AS_STPCPY
-	mov	%rdi, %rax      /* save result */
-#  endif
+#  define USE_WIDE_CHAR
+# else
+#  define VMOVU_MASK	vmovdqu8
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
 
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+#  define REP_MOVS	rep movsb
 # endif
 
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	cmp	$(VEC_SIZE * 2), %ecx
-	jbe	L(SourceStringAlignmentLessTwoVecSize)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-
-	vpcmpb	$0, (%rsi), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	shr	%cl, %rdx
+# include "reg-macros.h"
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	mov	$VEC_SIZE, %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  else
-	mov	$(VEC_SIZE + 1), %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  endif
-	jbe	L(CopyVecSizeTailCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail)
-
-	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
-	kmovd	%k1, %edx
 
-# ifdef USE_AS_STRNCPY
-	add	$VEC_SIZE, %r10
-	cmp	%r10, %r8
-	jbe	L(CopyTwoVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize)
-
-	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
-	VMOVU	%YMM2, (%rdi)
-
-/* If source address alignment != destination address alignment */
-	.p2align 4
-L(UnalignVecSizeBoth):
-	sub	%rcx, %rdi
-# ifdef USE_AS_STRNCPY
-	add	%rcx, %r8
-	sbb	%rcx, %rcx
-	or	%rcx, %r8
-# endif
-	mov	$VEC_SIZE, %rcx
-	VMOVA	(%rsi, %rcx), %YMM2
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 3), %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STPCPY
+#  define END_REG	rax
 # else
-	jnz	L(CopyVecSize)
+#  define END_REG	rdi, %rdx, CHAR_SIZE
 # endif
 
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
+# ifdef USE_AS_STRCAT
+#  define PAGE_ALIGN_REG	edx
+#  define PAGE_ALIGN_REG_64	rdx
 # else
-	jnz	L(CopyVecSize)
+#  define PAGE_ALIGN_REG	eax
+#  define PAGE_ALIGN_REG_64	rax
 # endif
 
-	VMOVU	%YMM3, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
-	vpcmpb	$0, %YMM4, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
 
-	VMOVU	%YMM4, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
 
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
-	VMOVU	%YMM2, (%rdi, %rcx)
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
 
-	VMOVU	%YMM3, (%rdi, %rcx)
-	mov	%rsi, %rdx
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	and	$-(VEC_SIZE * 4), %rsi
-	sub	%rsi, %rdx
-	sub	%rdx, %rdi
-# ifdef USE_AS_STRNCPY
-	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
-	VMOVA	(%rsi), %YMM4
-	VMOVA	VEC_SIZE(%rsi), %YMM5
-	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
-	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
-	vpminub	%YMM5, %YMM4, %YMM2
-	vpminub	%YMM7, %YMM6, %YMM3
-	vpminub	%YMM2, %YMM3, %YMM2
-	/* If K7 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM2, %YMMZERO, %k7
-	kmovd	%k7, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+# ifdef USE_AS_STRCAT
+	movq	%rdi, %rax
+#  include "strcat-strlen-evex.S"
 # endif
-	test	%edx, %edx
-	jnz	L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
-	add	$(VEC_SIZE * 4), %rdi
-	add	$(VEC_SIZE * 4), %rsi
-	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
-	VMOVA	(%rsi), %YMM4
-	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
-	VMOVA	VEC_SIZE(%rsi), %YMM5
-	vpminub	%YMM5, %YMM4, %YMM2
-	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
-	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
-	VMOVU	%YMM7, -VEC_SIZE(%rdi)
-	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
-	vpminub	%YMM7, %YMM6, %YMM3
-	vpminub	%YMM2, %YMM3, %YMM2
-	/* If K7 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM2, %YMMZERO, %k7
-	kmovd	%k7, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
+
+	movl	%esi, %PAGE_ALIGN_REG
+	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 # endif
-	test	%edx, %edx
-	jz	L(UnalignedFourVecSizeLoop_start)
 
-L(UnalignedFourVecSizeLeave):
-	vpcmpb	$0, %YMM4, %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_0)
 
-	vpcmpb	$0, %YMM5, %YMMZERO, %k2
-	kmovd	%k2, %ecx
-	test	%ecx, %ecx
-	jnz	L(CopyVecSizeUnaligned_16)
+	/* Two short string implementations.  One with a traditional
+	   branching approach and one with masked instructions (which
+	   have potential for dramatically bad perf if dst splits a
+	   page and is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	VPTEST	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+#  ifdef USE_AS_WCSCPY
+	subl	$((1 << CHAR_PER_VEC)- 1), %VRCX
+#  else
+	inc	%VRCX
+#  endif
+	jz	L(more_1x_vec)
+	KMOV	%VRCX, %k1
+	KXOR	%k0, %k1, %k1
 
-	vpcmpb	$0, %YMM6, %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_32)
-
-	vpcmpb	$0, %YMM7, %YMMZERO, %k4
-	kmovd	%k4, %ecx
-	bsf	%ecx, %edx
-	VMOVU	%YMM4, (%rdi)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 3), %rsi
-	add	$(VEC_SIZE * 3), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
 
-/* If source address alignment == destination address alignment */
+#  ifdef USE_AS_STPCPY
+	bsf	%VRCX, %VRCX
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
+#  endif
+	ret
 
-L(SourceStringAlignmentLessTwoVecSize):
-	VMOVU	(%rsi), %YMM3
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
+# else
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$VEC_SIZE, %r8
+	xorl	%edx, %edx
+	bsf	%VRCX, %VRDX
+#  ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  endif
+
+	/* Use mask bits in rcx to detect which copy we need. If the low
+	   mask is zero then there must be a bit set in the upper half.
+	   I.e. if rcx != 0 and ecx == 0, then the match must be in the
+	   upper 32 bits, so we use L(copy_32_63).  */
+#  if VEC_SIZE == 64
+#   ifdef USE_AS_WCSCPY
+	testb	%cl, %cl
+#   else
+	testl	%ecx, %ecx
+#   endif
+	jz	L(copy_32_63)
+#  endif
+
+#  ifdef USE_AS_WCSCPY
+	testb	$0xf, %cl
 #  else
-	cmp	$(VEC_SIZE + 1), %r8
+	testw	%cx, %cx
 #  endif
-	jbe	L(CopyVecSizeTail1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail1)
+	jz	L(copy_16_31)
 
-	VMOVU	%YMM3, (%rdi)
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$(VEC_SIZE * 2), %r8
+#  ifdef USE_AS_WCSCPY
+	testb	$0x3, %cl
 #  else
-	cmp	$((VEC_SIZE * 2) + 1), %r8
+	testb	%cl, %cl
 #  endif
-	jbe	L(CopyTwoVecSize1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize1)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-	jmp	L(UnalignVecSizeBoth)
+	jz	L(copy_8_15)
 
-/*------End of main part with loops---------------------*/
 
-/* Case1 */
+#  ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	/* No need to copy, we know it's zero.  */
+	movl	$0, (%END_REG)
 
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
-	.p2align 4
-L(CopyVecSize):
-	add	%rcx, %rdi
-# endif
-L(CopyVecSizeTail):
-	add	%rcx, %rsi
-L(CopyVecSizeTail1):
-	bsf	%edx, %edx
-L(CopyVecSizeExit):
-	cmp	$32, %edx
-	jae	L(Exit32_63)
-	cmp	$16, %edx
-	jae	L(Exit16_31)
-	cmp	$8, %edx
-	jae	L(Exit8_15)
-	cmp	$4, %edx
-	jae	L(Exit4_7)
-	cmp	$3, %edx
-	je	L(Exit3)
-	cmp	$1, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	movb	$0, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$1, %r8
-	lea	1(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
 	ret
+#  else
 
-	.p2align 4
-L(CopyTwoVecSize1):
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$VEC_SIZE, %r8
-# endif
-	jmp	L(CopyVecSizeTail1)
-
-	.p2align 4
-L(CopyTwoVecSize):
-	bsf	%edx, %edx
-	add	%rcx, %rsi
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	jmp	L(CopyVecSizeExit)
-
-	.p2align 4
-L(CopyVecSizeUnaligned_0):
-	bsf	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM4, (%rdi)
-	add	$((VEC_SIZE * 4) - 1), %r8
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	jmp	L(CopyVecSizeExit)
-# endif
+	testb	$0x7, %cl
+	jz	L(copy_4_7)
 
-	.p2align 4
-L(CopyVecSizeUnaligned_16):
-	bsf	%ecx, %edx
-	VMOVU	%YMM4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	VEC_SIZE(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	add	$((VEC_SIZE * 3) - 1), %r8
-	sub	%rdx, %r8
-	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
 
-	.p2align 4
-L(CopyVecSizeUnaligned_32):
-	bsf	%edx, %edx
-	VMOVU	%YMM4, (%rdi)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	add	$((VEC_SIZE * 2) - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 2), %rsi
-	add	$(VEC_SIZE * 2), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
+	test	%edx, %edx
+	jz	L(set_null_term)
 
-# ifdef USE_AS_STRNCPY
-#  ifndef USE_AS_STRCAT
-	.p2align 4
-L(CopyVecSizeUnalignedVec6):
-	VMOVU	%YMM6, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec5):
-	VMOVU	%YMM5, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec4):
-	VMOVU	%YMM4, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec3):
-	VMOVU	%YMM3, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
+	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
+	 */
+	vmovd	%VMM_128(0), %esi
+	movw	%si, (%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	/* No need to copy, we know it's zero.  */
+	movb	$0, (%END_REG)
+	ret
 #  endif
 
-/* Case2 */
-
-	.p2align 4
-L(CopyVecSizeCase2):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTailCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTail1Case2):
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-/* Case2 or Case3,  Case3 */
-
-	.p2align 4
-L(CopyVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeCase2)
-L(CopyVecSizeCase3):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyTwoVecSizeCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyVecSizeTailCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTailCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSize1Case2OrCase3):
-	add	$VEC_SIZE, %rdi
-	add	$VEC_SIZE, %rsi
-	sub	$VEC_SIZE, %r8
-L(CopyVecSizeTail1Case2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTail1Case2)
-	jmp	L(StrncpyExit)
+#  if VEC_SIZE == 64
+	.p2align 4,, 6
+L(copy_32_63):
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
+	ret
+#  endif
+
+
+	.p2align 4,, 6
+L(copy_16_31):
+	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
+	   and will save code size.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
+	ret
+
+	.p2align 4,, 8
+L(copy_8_15):
+#  ifdef USE_AS_WCSCPY
+	movl	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+#  else
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
+#  endif
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
+	ret
 # endif
 
-/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
 
-	.p2align 4
-L(Exit1):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
+# ifndef USE_AS_WCSCPY
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%ecx, -(4 - CHAR_SIZE)(%END_REG)
+	ret
 # endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$2, %r8
-	lea	2(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rdi)
 # endif
-	ret
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+	addq	%rsi, %rdi
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
 
-	.p2align 4
-L(Exit2):
-	movzwl	(%rsi), %ecx
-	mov	%cx, (%rdi)
-	movb	$0, 2(%rdi)
+	/* Ideally we store after moves to minimize impact of potential
+	   false-dependencies.  */
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rax)
+# endif
+
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+
+
+	/* Align for 4x loop.  */
+	subq	%rsi, %rdi
+
+	/* + VEC_SIZE * 5 because we never added the original VEC_SIZE
+	   we covered before aligning.  */
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$-(VEC_SIZE * 4), %rsi
+
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x0_end)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	/* Place L(ret_vec_x4) here to save code size.  We get a
+	   meaningful benefit doing this for stpcpy.  */
+	KMOV	%k4, %VRDX
+L(ret_vec_x3):
+	bsf	%VRDX, %VRDX
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$3, %r8
-	lea	3(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
 # endif
+L(return_end):
 	ret
 
-	.p2align 4
-L(Exit3):
-	mov	(%rsi), %edx
-	mov	%edx, (%rdi)
+	.p2align 4,, 6
+L(ret_vec_x0_end):
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_STPCPY
-	lea	3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$4, %r8
-	lea	4(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
+	inc	%VRCX
+	VMOVU	(-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
 	ret
 
-	.p2align 4
-L(Exit4_7):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	-3(%rsi, %rdx), %ecx
-	mov	%ecx, -3(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX
+	VMOVU	(VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit8_15):
-	mov	(%rsi), %rcx
-	mov	-7(%rsi, %rdx), %r9
-	mov	%rcx, (%rdi)
-	mov	%r9, -7(%rdi, %rdx)
+	.p2align 4,, 4
+L(ret_vec_x2):
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit16_31):
-	VMOVU	(%rsi), %XMM2
-	VMOVU	-15(%rsi, %rdx), %XMM3
-	VMOVU	%XMM2, (%rdi)
-	VMOVU	%XMM3, -15(%rdi, %rdx)
+	/* ret_vec_x3 reuses return code after the loop.  */
+	.p2align 4,, 6
+L(ret_vec_x4):
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub %rdx, %r8
-	sub $1, %r8
-	lea 1(%rdi, %rdx), %rdi
-	jnz L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit32_63):
-	VMOVU	(%rsi), %YMM2
-	VMOVU	-31(%rsi, %rdx), %YMM3
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, -31(%rdi, %rdx)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
+
+	.p2align 4,, 4
+L(page_cross):
+# ifndef USE_AS_STRCAT
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
 # endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	movq	%rsi, %rcx
+	andq	$(VEC_SIZE * -1), %rcx
+
+	VPCMPEQ	(%rcx), %VZERO, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSCPY
+	andl	$(VEC_SIZE - 1), %PAGE_ALIGN_REG
+	shrl	$2, %PAGE_ALIGN_REG
 # endif
-	ret
+	shrx	%VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
 
-# ifdef USE_AS_STRNCPY
+# if USE_MOVSB_IN_PAGE_CROSS
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
 
-	.p2align 4
-L(StrncpyExit1):
-	movzbl	(%rsi), %edx
-	mov	%dl, (%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 1(%rdi)
+	/* This adds one to the later result, which gives the correct
+	   copy bounds.  NB: this can never zero out a non-zero RCX
+	   because, to be in the page cross case, rsi cannot be aligned
+	   and we already right-shifted rcx by the misalignment.  */
+	shl	%VRCX
+	jz	L(page_cross_continue)
+#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-	ret
+	bsf	%VRCX, %VRCX
+	REP_MOVS
 
-	.p2align 4
-L(StrncpyExit2):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
 #  ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 2(%rdi)
+	leaq	-CHAR_SIZE(%rdi), %rax
 #  endif
 	ret
 
-	.p2align 4
-L(StrncpyExit3_4):
-	movzwl	(%rsi), %ecx
-	movzwl	-2(%rsi, %r8), %edx
-	mov	%cx, (%rdi)
-	mov	%dx, -2(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
 
-	.p2align 4
-L(StrncpyExit5_8):
-	mov	(%rsi), %ecx
-	mov	-4(%rsi, %r8), %edx
-	mov	%ecx, (%rdi)
-	mov	%edx, -4(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
+# else
+	/* Check if we found zero-char before end of page.  */
+	test	%VRCX, %VRCX
+	jz	L(page_cross_continue)
 
-	.p2align 4
-L(StrncpyExit9_16):
-	mov	(%rsi), %rcx
-	mov	-8(%rsi, %r8), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
+	/* Traditional copy case, essentially same as used in non-page-
+	   cross case but since we can't reuse VMM(0) we need twice as
+	   many loads from rsi.  */
 
-	.p2align 4
-L(StrncpyExit17_32):
-	VMOVU	(%rsi), %XMM2
-	VMOVU	-16(%rsi, %r8), %XMM3
-	VMOVU	%XMM2, (%rdi)
-	VMOVU	%XMM3, -16(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
+#  ifndef USE_AS_STRCAT
+	xorl	%edx, %edx
 #  endif
-	ret
-
-	.p2align 4
-L(StrncpyExit33_64):
-	/*  0/32, 31/16 */
-	VMOVU	(%rsi), %YMM2
-	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
+	/* Dependency on rdi must already have been satisfied.  */
+	bsf	%VRCX, %VRDX
 #  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  elif !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
 
-	.p2align 4
-L(StrncpyExit65):
-	/* 0/32, 32/32, 64/1 */
-	VMOVU	(%rsi), %YMM2
-	VMOVU	32(%rsi), %YMM3
-	mov	64(%rsi), %cl
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, 32(%rdi)
-	mov	%cl, 64(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	65(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 65(%rdi)
+#  if VEC_SIZE == 64
+#   ifdef USE_AS_WCSCPY
+	testb	%cl, %cl
+#   else
+	test	%ecx, %ecx
+#   endif
+	jz	L(page_cross_copy_32_63)
 #  endif
-	ret
-
-#  ifndef USE_AS_STRCAT
 
-	.p2align 4
-L(Fill1):
-	mov	%dl, (%rdi)
-	ret
+#  ifdef USE_AS_WCSCPY
+	testb	$0xf, %cl
+#  else
+	testw	%cx, %cx
+#  endif
+	jz	L(page_cross_copy_16_31)
 
-	.p2align 4
-L(Fill2):
-	mov	%dx, (%rdi)
-	ret
+#  ifdef USE_AS_WCSCPY
+	testb	$0x3, %cl
+#  else
+	testb	%cl, %cl
+#  endif
+	jz	L(page_cross_copy_8_15)
 
-	.p2align 4
-L(Fill3_4):
-	mov	%dx, (%rdi)
-	mov     %dx, -2(%rdi, %r8)
+#  ifdef USE_AS_WCSCPY
+	movl	(%rsi), %esi
+	movl	%esi, (%rdi)
+	movl	$0, (%END_REG)
 	ret
+#  else
 
-	.p2align 4
-L(Fill5_8):
-	mov	%edx, (%rdi)
-	mov     %edx, -4(%rdi, %r8)
-	ret
+	testb	$0x7, %cl
+	jz	L(page_cross_copy_4_7)
 
-	.p2align 4
-L(Fill9_16):
-	mov	%rdx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
+	test	%edx, %edx
+	jz	L(page_cross_set_null_term)
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+L(page_cross_set_null_term):
+	movb	$0, (%END_REG)
 	ret
 
-	.p2align 4
-L(Fill17_32):
-	VMOVU	%XMMZERO, (%rdi)
-	VMOVU	%XMMZERO, -16(%rdi, %r8)
-	ret
 
-	.p2align 4
-L(CopyVecSizeUnalignedVec2):
-	VMOVU	%YMM2, (%rdi, %rcx)
-
-	.p2align 4
-L(CopyVecSizeVecExit):
-	bsf	%edx, %edx
-	add	$(VEC_SIZE - 1), %r8
-	add	%rcx, %rdi
-#   ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-#   endif
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-
-	.p2align 4
-L(StrncpyFillTailWithZero):
-	xor	%edx, %edx
-	sub	$VEC_SIZE, %r8
-	jbe	L(StrncpyFillExit)
-
-	VMOVU	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-
-	mov	%rdi, %rsi
-	and	$(VEC_SIZE - 1), %esi
-	sub	%rsi, %rdi
-	add	%rsi, %r8
-	sub	$(VEC_SIZE * 4), %r8
-	jb	L(StrncpyFillLessFourVecSize)
-
-L(StrncpyFillLoopVmovdqa):
-	VMOVA	%YMMZERO, (%rdi)
-	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
-	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
-	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE * 4), %rdi
-	sub	$(VEC_SIZE * 4), %r8
-	jae	L(StrncpyFillLoopVmovdqa)
-
-L(StrncpyFillLessFourVecSize):
-	add	$(VEC_SIZE * 2), %r8
-	jl	L(StrncpyFillLessTwoVecSize)
-	VMOVA	%YMMZERO, (%rdi)
-	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
-	add	$(VEC_SIZE * 2), %rdi
-	sub	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	VMOVA	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillLessTwoVecSize):
-	add	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	VMOVA	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillExit):
-	add	$VEC_SIZE, %r8
-L(Fill):
-	cmp	$17, %r8d
-	jae	L(Fill17_32)
-	cmp	$9, %r8d
-	jae	L(Fill9_16)
-	cmp	$5, %r8d
-	jae	L(Fill5_8)
-	cmp	$3, %r8d
-	jae	L(Fill3_4)
-	cmp	$1, %r8d
-	ja	L(Fill2)
-	je	L(Fill1)
+	.p2align 4,, 4
+L(page_cross_copy_4_7):
+	movl	(%rsi), %ecx
+	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, -(4 - CHAR_SIZE)(%END_REG)
 	ret
-
-/* end of ifndef USE_AS_STRCAT */
 #  endif
 
-	.p2align 4
-L(UnalignedLeaveCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(UnalignedFourVecSizeLeaveCase2)
-L(UnalignedFourVecSizeLeaveCase3):
-	lea	(VEC_SIZE * 4)(%r8), %rcx
-	and	$-VEC_SIZE, %rcx
-	add	$(VEC_SIZE * 3), %r8
-	jl	L(CopyVecSizeCase3)
-	VMOVU	%YMM4, (%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 4)(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (VEC_SIZE * 4)(%rdi)
-#  endif
+#  if VEC_SIZE == 64
+	.p2align 4,, 4
+L(page_cross_copy_32_63):
+	VMOVU	(%rsi), %VMM_256(0)
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
 	ret
-
-	.p2align 4
-L(UnalignedFourVecSizeLeaveCase2):
-	xor	%ecx, %ecx
-	vpcmpb	$0, %YMM4, %YMMZERO, %k1
-	kmovd	%k1, %edx
-	add	$(VEC_SIZE * 3), %r8
-	jle	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-	vpcmpb	$0, %YMM5, %YMMZERO, %k2
-	kmovd	%k2, %edx
-	VMOVU	%YMM4, (%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec5)
-#  else
-	jnz	L(CopyVecSize)
 #  endif
 
-	vpcmpb	$0, %YMM6, %YMMZERO, %k3
-	kmovd	%k3, %edx
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec6)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-
-	vpcmpb	$0, %YMM7, %YMMZERO, %k4
-	kmovd	%k4, %edx
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	lea	VEC_SIZE(%rdi, %rcx), %rdi
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-L(StrncpyExit):
-	cmp	$65, %r8d
-	je	L(StrncpyExit65)
-	cmp	$33, %r8d
-	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8d
-	jae	L(StrncpyExit17_32)
-	cmp	$9, %r8d
-	jae	L(StrncpyExit9_16)
-	cmp	$5, %r8d
-	jae	L(StrncpyExit5_8)
-	cmp	$3, %r8d
-	jae	L(StrncpyExit3_4)
-	cmp	$1, %r8d
-	ja	L(StrncpyExit2)
-	je	L(StrncpyExit1)
-#  ifdef USE_AS_STPCPY
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi)
-#  endif
+	.p2align 4,, 4
+L(page_cross_copy_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
 	ret
 
-	.p2align 4
-L(ExitZero):
-#  ifndef USE_AS_STRCAT
-	mov	%rdi, %rax
-#  endif
+	.p2align 4,, 4
+L(page_cross_copy_8_15):
+	movq	(%rsi), %rcx
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+	movq	%rcx, (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%END_REG)
 	ret
-
-# endif
-
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
 # endif
+END(STRCPY)
 #endif
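
For the short-string paths above (L(copy_8_15), L(copy_16_31), ...), the new
code copies one chunk from the start of the string and one equally sized
chunk that ends exactly at the null terminator, so two possibly overlapping
stores cover the whole string without a byte loop.  A minimal C sketch of
that idea for the 8-15 byte case (hypothetical helper, byte strings only;
`len` is the index of the null as computed by `bsf`, not part of the patch):

#include <stddef.h>
#include <string.h>

/* Copy a string whose null terminator sits at index len, 8 <= len <= 15,
   using two overlapping 8-byte moves instead of a byte loop.  */
static void
copy_8_15 (char *dst, const char *src, size_t len)
{
  memcpy (dst, src, 8);				/* Head chunk.  */
  memcpy (dst + len - 7, src + len - 7, 8);	/* Tail chunk ends at the null.  */
}
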
diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
index 203a19bf21..d648ba5cfe 100644
--- a/sysdeps/x86_64/multiarch/strncat-evex.S
+++ b/sysdeps/x86_64/multiarch/strncat-evex.S
@@ -1,7 +1,520 @@
-#ifndef STRNCAT
-# define STRNCAT	__strncat_evex
-#endif
+/* {wcs|str}ncat with 256/512-bit EVEX.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+# ifndef STRNCAT
+#  define STRNCAT	__strncat_evex
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+#  define movNULL	movl
+#  define VMOVU_MASK	vmovdqu32
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+
+#  define REP_MOVS	rep movsd
+
+#  define VMASK_REG	VR10
+#  define FIND_FIRST_ONE(src, dst)	movl $CHAR_PER_VEC, %dst; bsf %src, %dst
+
+#  define USE_WIDE_CHAR
+# else
+#  define movNULL	movb
+#  define VMOVU_MASK	vmovdqu8
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+
+#  define REP_MOVS	rep movsb
+
+#  define VMASK_REG	VRCX
+#  define FIND_FIRST_ONE(src, dst)	tzcnt %src, %dst
+
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# include "reg-macros.h"
+
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+	movq	%rdi, %rax
+
+	/* NB: It's safe to filter out zero-length strings WITHOUT
+	   setting null-term. Destination MUST be a null-terminated
+	   string so essentially the work is already done.  */
+# ifdef USE_AS_WCSCPY
+	leaq	-1(%rdx), %rcx
+	shrq	$56, %rcx
+	jnz	L(zero_len)
+# else
+	test	%rdx, %rdx
+	jle	L(zero_len)
+# endif
+
+# include "strcat-strlen-evex.S"
+
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPTESTN	%VMM(0), %VMM(0), %k0
+
+	/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
+	   <= CHAR_PER_VEC with masked instructions (which have
+	   potential for dramatically bad perf if dst splits a page and
+	   is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	KMOV	%k0, %VRCX
+	FIND_FIRST_ONE (VRCX, VR8)
+	cmpq	%r8, %rdx
+	jbe	L(less_1x_vec)
+
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
+
+	blsmsk	%VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
+	ret
+
+L(less_1x_vec):
+	mov	$-1, %VRCX
+	bzhi	%VRDX, %VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
+
+	ret
+# else
+	KMOV	%k0, %VMASK_REG
+	/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
+	   %VMASK_REG, %VRCX` for wcsncat.  */
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpq	%rcx, %rdx
+	jbe	L(less_1x_vec)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	je	L(more_1x_vec)
+
+	movl	%ecx, %edx
+
+L(less_1x_vec):
+#  if VEC_SIZE == 64
+	cmpl	$(32 / CHAR_SIZE), %edx
+	jae	L(copy_32_63)
+#  endif
+
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(copy_16_31)
+
+
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jae	L(copy_8_15)
+
+#  ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  else
+
+	cmpl	$4, %edx
+	jae	L(copy_4_7)
+
+	movzbl	(%rsi), %ecx
+	cmpl	$1, %edx
+	jbe	L(set_null_term)
+
+	movzwl	1(%rsi), %esi
+	movw	%si, 1(%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	movb	%cl, (%rdi)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+#  endif
+
+#  if VEC_SIZE == 64
+	.p2align 4,, 6
+L(copy_32_63):
+	VMOVU	-(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  endif
+	.p2align 4,, 6
+L(copy_16_31):
+	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
+	   and will save code size.  */
+	vmovdqu	-(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 2
+L(copy_8_15):
+	movq	-(8)(%rsi, %rdx, CHAR_SIZE), %rcx
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+
+#  ifndef USE_AS_WCSCPY
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-(4)(%rsi, %rdx, CHAR_SIZE), %ecx
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  endif
+
+# endif
+	.p2align 4,, 4
+L(zero_len):
+# ifdef USE_AS_WCSCPY
+	test	%rdx, %rdx
+# endif
+	jne	OVERFLOW_STRCAT
+	ret
 
-#define USE_AS_STRNCAT
-#define STRCAT	STRNCAT
-#include "strcat-evex.S"
+	.p2align 4,, 8
+L(more_1x_vec):
+	VMOVU	%VMM(0), (%rdi)
+
+	/* We are going to align rsi here so will need to be able to re-
+	   adjust rdi/rdx afterwards. NB: We filtered out huge lengths
+	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
+
+	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+L(loop_last_4x_vec):
+	addq	%rsi, %rdi
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+
+	/* Will need this regardless.  */
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VMASK_REG
+
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	jne	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	addl	$-CHAR_PER_VEC, %edx
+	bzhi	%VRDX, %VRCX, %VR8
+	jz	L(ret_vec_x2_len)
+L(ret_vec_x2):
+	bsf	%VRCX, %VRDX
+L(ret_vec_x2_len):
+	VMOVU	(VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 4
+L(ret_vec_x1_len):
+	movl	%edx, %ecx
+L(ret_vec_x1):
+	VMOVU	(VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	movNULL	$0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	addl	$-(CHAR_PER_VEC * 4), %edx
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VMASK_REG
+	subq	$-(VEC_SIZE * 4), %rsi
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+# ifdef USE_AS_WCSCPY
+	xorl	%ecx, %ecx
+# endif
+	bsf	%VMASK_REG, %VRCX
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VMASK_REG
+
+	cmpq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(more_4x_vec)
+
+	/* Adjust length before going to L(ret_vec_x3_len) or
+	   L(ret_vec_x3).  */
+	addl	$(CHAR_PER_VEC * -2), %edx
+
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	jne	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	addl	$-CHAR_PER_VEC, %edx
+	bzhi	%VRDX, %VRCX, %VR8
+	jz	L(ret_vec_x4_len)
+L(ret_vec_x4):
+	bsf	%VRCX, %VRDX
+L(ret_vec_x4_len):
+	VMOVU	(VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 4
+L(ret_vec_x3_len):
+	movl	%edx, %ecx
+L(ret_vec_x3):
+	VMOVU	(VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+# ifdef USE_AS_WCSCPY
+	xorl	%ecx, %ecx
+# endif
+	bsf	%VMASK_REG, %VRCX
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+
+	/* Check if we are near the end before aligning.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rdx
+	jbe	L(last_4x_vec)
+
+
+	/* Add rsi to rdx (length) before aligning rsi. NB: Since we
+	   filtered out huge lengths this cannot overflow.  */
+# ifdef USE_AS_WCSCPY
+	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rsi, %rdx
+# endif
+
+	/* Subtract rsi from rdi before aligning (add back will have
+	   correct rdi for aligned rsi).  */
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+
+	/* Offset rsi by VEC_SIZE so that we can jump to
+	   L(loop_last_4x_vec).  */
+	addq	$-(VEC_SIZE), %rsi
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	/* Store loop end in r9.  */
+	leaq	-(VEC_SIZE * 5)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+
+	/* L(ret_vec_x1) expects rcx to hold the position of the
+	   zero-CHAR, so test with bsf.  */
+	bsf	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
+
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	bsf	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+
+	KMOV	%k4, %VRCX
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %k0
+
+# ifdef USE_AS_WCSCPY
+	KMOV	%k0, %VR9
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+	shrx	%VRCX, %VR9, %VRCX
+# else
+	KMOV	%k0, %VRCX
+	shrx	%VRSI, %VRCX, %VRCX
+# endif
+
+	subl	%esi, %r8d
+	andl	$(VEC_SIZE - 1), %r8d
+# ifdef USE_AS_WCSCPY
+	shrl	$2, %r8d
+# endif
+	cmpq	%r8, %rdx
+	jbe	L(page_cross_small)
+	/* Optimizing more for space as this is very cold code. This
+	   saves 2x cache lines.  */
+
+	/* This adds one to the later result, which gives the correct
+	   copy bounds.  NB: this can never zero out a non-zero RCX
+	   because, to be in the page cross case, rsi cannot be aligned
+	   and we already right-shifted rcx by the misalignment.  */
+	shl	%VRCX
+	jz	L(page_cross_continue)
+	bsf	%VRCX, %VRCX
+	REP_MOVS
+	ret
+
+L(page_cross_small):
+	tzcnt	%VRCX, %VRCX
+	jz	L(page_cross_setz)
+	cmpl	%edx, %ecx
+	cmova	%edx, %ecx
+
+# ifdef USE_AS_WCSCPY
+	rep	movsd
+# else
+	rep	movsb
+# endif
+L(page_cross_setz):
+	movNULL	$0, (%rdi)
+	ret
+END(STRNCAT)
+#endif
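
The control flow above ultimately implements the usual strncat contract:
append at most N characters of SRC at the current end of DST and always
write a terminating null, with the zero-length filter at entry relying on
DST already being null-terminated.  A minimal C reference of that contract
(hypothetical name, narrow characters only, not part of the patch):

#include <stddef.h>

static char *
ref_strncat (char *dst, const char *src, size_t n)
{
  char *end = dst;
  while (*end != '\0')			/* Find the current end of DST.  */
    end++;
  while (n-- != 0 && *src != '\0')	/* Append at most N source chars.  */
    *end++ = *src++;
  *end = '\0';				/* Always null-terminate.  */
  return dst;
}
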
diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
index 1b3426d511..49eaf4cbd9 100644
--- a/sysdeps/x86_64/multiarch/strncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
@@ -1,7 +1,990 @@
-#ifndef STRNCPY
-# define STRNCPY	__strncpy_evex
-#endif
+/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+
+
+# include <sysdep.h>
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNCPY
+#  define STRNCPY	__strncpy_evex
+# endif
+
+# ifdef USE_AS_WCSCPY
+#  define VMOVU_MASK	vmovdqu32
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define CHAR_SIZE	4
+
+#  define REP_MOVS	rep movsd
+#  define REP_STOS	rep stosl
+
+#  define USE_WIDE_CHAR
+
+# else
+#  define VMOVU_MASK	vmovdqu8
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define CHAR_SIZE	1
+
+#  define REP_MOVS	rep movsb
+#  define REP_STOS	rep stosb
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+
+# define VZERO	VMM(7)
+# define VZERO_256	VMM_256(7)
+# define VZERO_128	VMM_128(7)
+
+# if VEC_SIZE == 64
+#  define VZERO_HALF	VZERO_256
+# else
+#  define VZERO_HALF	VZERO_128
+# endif
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+	/* Filter zero-length strings and very long strings.  Zero-
+	   length strings just return; very long strings are handled by
+	   first running rep stos{b|l} to zero-fill the destination
+	   (which will almost certainly segfault) and, if that
+	   succeeds, calling OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy,
+	   wcpcpy).  */
+# ifdef USE_AS_WCSCPY
+	decq	%rdx
+	movq	%rdx, %rax
+	/* Any length with bits set at or above bit 56 is past the end
+	   of the maximum supported address space.  */
+	shr	$56, %rax
+	jnz	L(zero_len)
+# else
+	decq	%rdx
+	/* If the branch below needs to become `jb`, replace `dec` with
+	   `sub` (`dec` does not set the carry flag).  */
+	jl	L(zero_len)
+# endif
+
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
+	movl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+
+	/* If no STPCPY just save end ahead of time.  */
+# ifndef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+
+
+	cmpq	$(CHAR_PER_VEC), %rdx
+
+	/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
+	   <= CHAR_PER_VEC with masked instructions (which have
+	   potential for dramatically bad perf if dst splits a page and
+	   is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	/* `jae` because length rdx is now length - 1.  */
+	jae	L(more_1x_vec)
+
+	/* If there were multiple zero-CHAR matches in the first VEC,
+	   VRCX will be overset, but that's fine since any oversets
+	   were at zero positions anyway.  */
+
+#  ifdef USE_AS_STPCPY
+	tzcnt	%VRCX, %VRAX
+	cmpl	%eax, %edx
+	cmovb	%edx, %eax
+#   ifdef USE_AS_WCSCPY
+	adcl	$0, %eax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#   else
+	adcq	%rdi, %rax
+#   endif
+#  endif
+	dec	%VRCX
+
+	/* Zero out all non-zero CHARs after the first zero match.  */
+	KMOV	%VRCX, %k1
+
+	/* Use VZERO as the destination so this can be reused for
+	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
+	   will have zeroed out VZERO).  */
+	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
+L(zfill_less_vec):
+	/* Get mask for what we need to set.  */
+	incl	%edx
+	mov	$-1, %VRCX
+	bzhi	%VRDX, %VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	VMOVU_MASK %VZERO, (%rdi){%k1}
+	ret
+
+	.p2align 4,, 4
+L(zero_len):
+	cmpq	$-1, %rdx
+	jne	L(best_effort_strncpy)
+	movq	%rdi, %rax
+	ret
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# else
+	/* `jb` because length rdx is now length - 1.  */
+	jb	L(less_1x_vec)
+# endif
+
+
+	/* This may overset, but that's fine because we still need to
+	   zero-fill.  */
+	VMOVU	%VMM(0), (%rdi)
+
+
+	/* Length must be >= CHAR_PER_VEC, so a match here means we
+	   must zero-fill.  */
+	test	%VRCX, %VRCX
+	jnz	L(zfill)
+
+
+	/* We are going to align rsi here so will need to be able to re-
+	   adjust rdi/rdx afterwards. NB: We filtered out huge lengths
+	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
+	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+
+L(loop_last_4x_vec):
+	addq	%rsi, %rdi
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+
+	/* -1 because of the `dec %rdx` earlier.  */
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	/* This will need to be computed no matter what.  We do it
+	   ahead of time for CHAR_PER_VEC == 64 because we can't adjust
+	   the value of `tzcnt` with a shift.  */
+# if CHAR_PER_VEC == 64
+	tzcntq	%rcx, %rcx
+# endif
+
+	cmpl	$(CHAR_PER_VEC), %edx
+	jb	L(ret_vec_x1_len)
+
+	/* Separate logic for CHAR_PER_VEC == 64 because we already did
+	   `tzcnt` on VRCX.  */
+# if CHAR_PER_VEC == 64
+	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
+	cmpb	$CHAR_PER_VEC, %cl
+	jnz	L(ret_vec_x1_no_bsf)
+# else
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+# endif
+
+
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %VZERO, %k0
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	KMOV	%k0, %VRCX
+
+# if CHAR_PER_VEC < 64
+	/* This essentially adds CHAR_PER_VEC to the computed result.  */
+	shlq	$CHAR_PER_VEC, %rcx
+# else
+	tzcntq	%rcx, %rcx
+	addl	$CHAR_PER_VEC, %ecx
+# endif
+
+	.p2align 4,, 4
+L(ret_vec_x1_len):
+	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
+	   already been done.  */
+# if CHAR_PER_VEC < 64
+	tzcntq	%rcx, %rcx
+# endif
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x1_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+	VMOVU	((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+	.p2align 4,, 10
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX
+L(ret_vec_x1_no_bsf):
+	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	subl	%ecx, %edx
+	cmpl	$CHAR_PER_VEC, %edx
+	jb	L(ret_vec_x1_len_no_zfill_mov)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
+	   $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
+	   using `movzbl`.  */
+# if CHAR_PER_VEC == 64
+	movzbl	%dl, %edx
+# else
+	andl	$(CHAR_PER_VEC * 4 - 1), %edx
+# endif
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+	subq	$-(VEC_SIZE * 4), %rsi
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2 - 1), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	test	%VRCX, %VRCX
+	/* Must fill at least 2x VEC.  */
+	jnz	L(zfill_vec1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	/* Must fill at least 1x VEC.  */
+	jnz	L(zfill_vec2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRCX
+
+	/* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rdx
+	ja	L(more_4x_vec)
+
+	subl	$(CHAR_PER_VEC * 3), %edx
+	jb	L(ret_vec_x3_len)
+
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rsi), %VZERO, %k0
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	KMOV	%k0, %VRCX
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x4_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	movl	%ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 4 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+L(ret_vec_x3_len):
+	addl	$(CHAR_PER_VEC * 1), %edx
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x3_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+	.p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 3 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsf	%VRCX, %VRCX
+	VMOVU	%VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
+	subl	%ecx, %edx
+	jl	L(ret_vec_x3_len_no_zfill_mov)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec4)
 
-#define USE_AS_STRNCPY
-#define STRCPY	STRNCPY
-#include "strcpy-evex.S"
+	/* Recheck length before aligning.  */
+	cmpq	$(CHAR_PER_VEC * 8 - 1), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
+# ifdef USE_AS_WCSCPY
+	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rsi, %rdx
+# endif
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+
+
+	/* Offset rsi by VEC_SIZE so that we can jump to
+	   L(loop_last_4x_vec).  */
+	addq	$-(VEC_SIZE), %rsi
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	/* Store loop end in r9.  */
+	leaq	-(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	/* Restore rdx (length).  */
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec1)
+
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec2)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec3)
+
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
+	KMOV	%k4, %VRCX
+	/* Fall through into L(zfill_vec4) to zero-fill the rest.  */
+
+	.p2align 4,, 4
+L(zfill_vec4):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -2), %rdx
+L(zfill_vec2):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -1), %rdx
+L(zfill):
+	/* VRCX must be non-zero.  */
+	bsf	%VRCX, %VRCX
+
+	/* Adjust length / dst for zfill.  */
+	subq	%rcx, %rdx
+# ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+	addq	%rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_from_page_cross):
+
+	/* From here on out it's just memset(rdi, 0, rdx).  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jb	L(zfill_less_vec)
+
+L(zfill_more_1x_vec):
+	VMOVU	%VZERO, (%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
+	ja	L(zfill_more_2x_vec)
+L(zfill_done0):
+	ret
+
+	/* Coming from vec1/vec2 we must be able to zfill at least 2x
+	   VEC.  */
+	.p2align 4,, 8
+L(zfill_vec3):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -2), %rdx
+	.p2align 4,, 2
+L(zfill_vec1):
+	bsfq	%rcx, %rcx
+	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
+	 */
+	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+
+
+	VMOVU	%VZERO, (%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jb	L(zfill_done0)
+L(zfill_more_2x_vec):
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VZERO, (VEC_SIZE)(%rdi)
+	subq	$(CHAR_PER_VEC * 4 - 1), %rdx
+	jbe	L(zfill_done)
+
+# ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rdi, %rdx
+# endif
+
+	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi)
+
+
+	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	jbe	L(zfill_done)
+
+	/* Align rdi and zfill loop.  */
+	andq	$-(VEC_SIZE), %rdi
+	.p2align 4,, 12
+L(zfill_loop_4x_vec):
+	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	ja	L(zfill_loop_4x_vec)
+L(zfill_done):
+	ret
+
+
+	/* Less than 1x VEC case if we are not using evex masked store.  */
+# if !USE_EVEX_MASKED_STORE
+	.p2align 4,, 8
+L(copy_1x):
+	/* Special case for copy 1x. It can be handled quickly and many
+	   buffer sizes have convenient alignment.  */
+	VMOVU	%VMM(0), (%rdi)
+	/* If no zeros then we are done.  */
+	testl	%ecx, %ecx
+	jz	L(ret_1x_1x)
+
+	/* Need to zfill, note we know that length <= CHAR_PER_VEC so we
+	   only handle the small case here.  */
+	bsf	%VRCX, %VRCX
+L(zfill_less_vec_no_bsf):
+	/* Adjust length / dst then just zfill less_vec.  */
+	subq	%rcx, %rdx
+#  ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+
+L(zfill_less_vec):
+	cmpl	$((VEC_SIZE / 2) / CHAR_SIZE), %edx
+	jb	L(zfill_less_half)
+
+	VMOVU	%VZERO_HALF, (%rdi)
+	VMOVU	%VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	ret
+#  ifdef USE_AS_STPCPY
+L(ret_1x_1x):
+	leaq	CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
+	ret
+#  endif
+
+
+#  if VEC_SIZE == 64
+	.p2align 4,, 4
+L(copy_32_63):
+	/* Overfill to avoid branches.  */
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+
+	/* We are taking advantage of the fact that to be here we must
+	   be writing the null-term at (%rdi, %rcx) so we have a byte of
+	   leeway for overwriting.  */
+	cmpl	%ecx, %edx
+	ja	L(zfill_less_vec_no_bsf)
+#   ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+#   else
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  endif
+
+	.p2align 4,, 4
+L(copy_16_31):
+	/* Overfill to avoid branches.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpl	%ecx, %edx
+
+	/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
+	   we have a larger copy block for 32-63 so this just falls
+	   through to zfill 16-31. If VEC_SIZE == 32 then we check for
+	   full zfill of less than 1x VEC.  */
+#  if VEC_SIZE == 64
+	jbe	L(ret_16_31)
+	subl	%ecx, %edx
+#   ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#   else
+	addq	%rcx, %rdi
+#   endif
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+L(zfill_less_half):
+L(zfill_less_32):
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jb	L(zfill_less_16)
+	VMOVU	%VZERO_128, (%rdi)
+	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+#   ifdef USE_AS_STPCPY
+	ret
+#   endif
+L(ret_16_31):
+#   ifdef USE_AS_STPCPY
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  else
+	/* VEC_SIZE == 32 begins.  */
+	ja	L(zfill_less_vec_no_bsf)
+#   ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+#   else
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  endif
+
+
+	.p2align 4,, 4
+L(copy_8_15):
+	/* Overfill to avoid branches.  */
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpl	%ecx, %edx
+	jbe	L(ret_8_15)
+	subl	%ecx, %edx
+#  ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+	.p2align 4,, 8
+#  if VEC_SIZE == 32
+L(zfill_less_half):
+#  endif
+L(zfill_less_16):
+	xorl	%ecx, %ecx
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jb	L(zfill_less_8)
+	movq	%rcx, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+#  ifndef USE_AS_STPCPY
+L(ret_8_15):
+#  endif
+	ret
+
+	.p2align 4,, 8
+L(less_1x_vec):
+	je	L(copy_1x)
+
+	/* We will need the `tzcnt` result for all other copy sizes.  */
+	tzcnt	%VRCX, %VRCX
+#  if VEC_SIZE == 64
+	cmpl	$(32 / CHAR_SIZE), %edx
+	jae	L(copy_32_63)
+#  endif
+
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(copy_16_31)
+
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jae	L(copy_8_15)
+#  ifdef USE_AS_WCSCPY
+	testl	%ecx, %ecx
+	jz	L(zfill_less_8_set_ret)
+
+	movl	(%rsi, %rdx, CHAR_SIZE), %esi
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%esi, (%rdi, %rdx, CHAR_SIZE)
+#   ifdef USE_AS_STPCPY
+	cmpl	%ecx, %edx
+L(ret_8_15):
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#   endif
+	ret
+L(zfill_less_8_set_ret):
+	xorl	%ecx, %ecx
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+L(zfill_less_8):
+	movl	%ecx, (%rdi)
+	movl	%ecx, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  else
+	cmpl	$3, %edx
+	jb	L(copy_0_3)
+	/* Overfill to avoid branches.  */
+	movl	-3(%rsi, %rdx), %esi
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%esi, -3(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_4_7)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+	xorl	%ecx, %ecx
+	.p2align 4,, 8
+L(zfill_less_8):
+	cmpl	$3, %edx
+	jb	L(zfill_less_3)
+	movl	%ecx, (%rdi)
+	movl	%ecx, -3(%rdi, %rdx)
+#   ifdef USE_AS_STPCPY
+	ret
+#   endif
+
+L(ret_4_7):
+#   ifdef USE_AS_STPCPY
+L(ret_8_15):
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#   endif
+	ret
+
+	.p2align 4,, 4
+L(zfill_less_3):
+	testl	%edx, %edx
+	jz	L(zfill_1)
+	movw	%cx, (%rdi)
+L(zfill_1):
+	movb	%cl, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_0_3):
+	vmovd	%VMM_128(0), %r8d
+	testl	%edx, %edx
+	jz	L(copy_1)
+	movw	%r8w, (%rdi)
+	cmpl	%ecx, %edx
+	ja	L(zfill_from_1)
+	movzbl	(%rsi, %rdx), %r8d
+#   ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+	movb	%r8b, (%rdi, %rdx)
+	ret
+#   endif
+
+L(copy_1):
+#   ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	cmpl	%ecx, %edx
+	adcq	%rdi, %rax
+#   endif
+#   ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+#   else
+	movb	%r8b, (%rdi, %rdx)
+#   endif
+	ret
+#  endif
+
+
+#  ifndef USE_AS_WCSCPY
+	.p2align 4,, 8
+L(zfill_from_1):
+#   ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+#   endif
+	movw	$0, -1(%rdi, %rdx)
+	ret
+#  endif
+
+	.p2align 4,, 4
+L(zero_len):
+	incq	%rdx
+	jne	L(best_effort_strncpy)
+	movq	%rdi, %rax
+	ret
+# endif
+
+
+	.p2align 4,, 4
+	.p2align 6,, 8
+L(page_cross):
+	movq	%rsi, %rax
+	andq	$(VEC_SIZE * -1), %rax
+	VPCMPEQ	(%rax), %VZERO, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSCPY
+	movl	%esi, %r8d
+	shrl	$2, %r8d
+	andl	$(CHAR_PER_VEC - 1), %r8d
+	shrx	%VR8, %VRCX, %VRCX
+# else
+	shrx	%VRSI, %VRCX, %VRCX
+# endif
+
+	/* Compute the number of bytes we checked.  */
+	subl	%esi, %eax
+	andl	$(VEC_SIZE - 1), %eax
+# ifdef USE_AS_WCSCPY
+	shrl	$2, %eax
+# endif
+
+	/* If rax > rdx then we are finishing the copy at the end of the
+	   page.  */
+	cmpq	%rax, %rdx
+	jb	L(page_cross_small)
+
+
+	/* If no zero-CHAR was found (rcx is zero) then continue.  */
+	test	%VRCX, %VRCX
+	jz	L(page_cross_continue)
+
+	/* We found a zero-CHAR so we need to copy then zfill (we know
+	   we didn't cover the whole length here).  */
+	bsf	%VRCX, %VRCX
+L(movsb_and_zfill):
+	incl	%ecx
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	leaq	-CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
+# else
+	movq	%rdi, %rax
+# endif
+
+	REP_MOVS
+# ifdef USE_AS_WCSCPY
+	movl	$0, (%rdi)
+# else
+	movb	$0, (%rdi)
+# endif
+	jmp	L(zfill_from_page_cross)
+
+L(page_cross_small):
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(page_cross_copy_only)
+
+	/* Do a zfill of the tail before copying.  */
+	movq	%rdi, %r9
+	xorl	%eax, %eax
+
+	movl	%ecx, %r8d
+
+	subl	%ecx, %edx
+	leaq	CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+	movl	%edx, %ecx
+	REP_STOS
+	movq	%r9, %rdi
+	movl	%r8d, %edx
+L(page_cross_copy_only):
+	leal	1(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcl	$0, %edx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# else
+	movq	%rdi, %rax
+# endif
+	REP_MOVS
+	ret
+
+
+L(best_effort_strncpy):
+	movq	%rdx, %rcx
+	xorl	%eax, %eax
+	movq	%rdi, %r8
+	/* The length is >= 2^63. We very much expect to segfault at
+	   rep stos. If that doesn't happen then just strcpy to finish.
+	 */
+	REP_STOS
+	movq	%r8, %rdi
+	jmp	OVERFLOW_STRCPY
+END(STRNCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
new file mode 100644
index 0000000000..d5ff4cbe50
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
@@ -0,0 +1,65 @@
+#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
+#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
+
+#if defined USE_MULTIARCH && IS_IN(libc)
+#  define UNDERSCORES __
+#  ifdef USE_WITH_SSE2
+#    define ISA_EXT _sse2
+#  elif defined USE_WITH_AVX
+#    ifdef USE_WITH_RTM
+#      define ISA_EXT _avx_rtm
+#    else
+#      define ISA_EXT _avx
+#    endif
+#  elif defined USE_WITH_AVX2
+#    ifdef USE_WITH_RTM
+#      define ISA_EXT _avx2_rtm
+#    else
+#      define ISA_EXT _avx2
+#    endif
+
+#  elif defined USE_WITH_EVEX256
+#    define ISA_EXT _evex
+#  elif defined USE_WITH_EVEX512
+#    define ISA_EXT _evex512
+#  endif
+#else
+#  define UNDERSCORES
+#  define ISA_EXT
+#endif
+
+#ifdef USE_AS_WCSCPY
+#  define STRCPY_PREFIX wc
+#  define STRCAT_PREFIX wcs
+#  ifdef USE_AS_STPCPY
+#    define STRCPY_POSTFIX pcpy
+#  else
+#    define STRCPY_POSTFIX scpy
+#  endif
+#else
+#  define STRCPY_PREFIX st
+#  define STRCAT_PREFIX str
+#  ifdef USE_AS_STPCPY
+#    define STRCPY_POSTFIX pcpy
+#  else
+#    define STRCPY_POSTFIX rcpy
+#  endif
+#endif
+#define STRCAT_POSTFIX cat
+
+#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
+  underscores##prefix##postfix##ext
+
+#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
+
+#ifndef OVERFLOW_STRCPY
+#  define OVERFLOW_STRCPY                                                     \
+    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
+#endif
+
+#ifndef OVERFLOW_STRCAT
+#  define OVERFLOW_STRCAT                                                     \
+    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
+#endif
+
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v3 3/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions
  2022-11-04 20:13 ` [PATCH v3 1/5] " Noah Goldstein
  2022-11-04 20:13   ` [PATCH v3 2/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
@ 2022-11-04 20:13   ` Noah Goldstein
  2022-11-04 20:13   ` [PATCH v3 4/5] x86: Add evex optimized functions for the wchar_t strcpy family Noah Goldstein
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 20:13 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
    1. Use more overlapping stores to avoid branches (see the sketch
       after this list).
    2. Reduce how unrolled the aligning copies are (this is more of a
       code-size save; it's a negative for some sizes in terms of
       perf).
    3. For st{r|p}n{cat|cpy} re-order the branches to minimize the
       number that are taken.

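For illustration, a minimal C sketch of the overlapping-stores idea in
item 1 (the function name and the 8-16 byte range are illustrative,
not the exact sizes the assembly handles):

    #include <stdint.h>
    #include <string.h>

    /* Copy n bytes for any 8 <= n <= 16 with exactly two 8-byte
       stores.  The stores may overlap in the middle; that is harmless
       because both write the corresponding source bytes, so no branch
       on the exact length is needed.  */
    void
    copy_8_16 (char *dst, const char *src, size_t n)
    {
      uint64_t head, tail;
      memcpy (&head, src, sizeof head);
      memcpy (&tail, src + n - sizeof tail, sizeof tail);
      memcpy (dst, &head, sizeof head);
      memcpy (dst + n - sizeof tail, &tail, sizeof tail);
    }
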
Performance Changes:

    Times are from N = 10 runs of the benchmark suite and are
    reported as the geometric mean of all ratios of
    New Implementation / Old Implementation (a sketch of this
    aggregation follows the numbers below).

    strcat-avx2      -> 0.998
    strcpy-avx2      -> 0.937
    stpcpy-avx2      -> 0.971

    strncpy-avx2     -> 0.793
    stpncpy-avx2     -> 0.775

    strncat-avx2     -> 0.962
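
A sketch of how that aggregation can be computed (an assumed helper,
not part of the benchmark harness):

    #include <math.h>
    #include <stddef.h>

    /* Geometric mean of new/old timing ratios: exp of the mean of the
       log-ratios.  A result below 1.0 means the new implementation is
       faster on average across the benchmark inputs.  */
    double
    geomean_ratio (const double *new_times, const double *old_times,
                   size_t n)
    {
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (new_times[i] / old_times[i]);
      return exp (log_sum / (double) n);
    }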

Code Size Changes:
    function         -> Bytes New / Bytes Old -> Ratio

    strcat-avx2      ->  685 / 1639 -> 0.418
    strcpy-avx2      ->  560 /  903 -> 0.620
    stpcpy-avx2      ->  592 /  939 -> 0.630

    strncpy-avx2     -> 1176 / 2390 -> 0.492
    stpncpy-avx2     -> 1268 / 2438 -> 0.520

    strncat-avx2     -> 1042 / 2563 -> 0.407

Notes:
    1. Because of the significant difference between the
       implementations they are split into three files.

           strcpy-avx2.S    -> strcpy, stpcpy, strcat
           strncpy-avx2.S   -> strncpy
           strncat-avx2.S   -> strncat

       I couldn't find a way to merge them without making the
       ifdefs incredibly difficult to follow.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
 sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S    |    6 +-
 sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S   |    7 +-
 sysdeps/x86_64/multiarch/stpncpy-avx2.S       |    5 +-
 sysdeps/x86_64/multiarch/strcat-avx2-rtm.S    |   13 +-
 sysdeps/x86_64/multiarch/strcat-avx2.S        |  268 +---
 sysdeps/x86_64/multiarch/strcat-strlen-avx2.S |  101 ++
 sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S    |   13 +-
 sysdeps/x86_64/multiarch/strcpy-avx2.S        | 1236 +++++------------
 sysdeps/x86_64/multiarch/strncat-avx2-rtm.S   |    6 +-
 sysdeps/x86_64/multiarch/strncat-avx2.S       |  424 +++++-
 sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S   |    6 +-
 sysdeps/x86_64/multiarch/strncpy-avx2.S       |  740 +++++++++-
 sysdeps/x86_64/multiarch/x86-avx-vecs.h       |    5 +-
 sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h  |   26 +
 sysdeps/x86_64/multiarch/x86-avx2-vecs.h      |   27 +
 15 files changed, 1649 insertions(+), 1234 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-vecs.h

diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
index 2b9c07a59f..189a288053 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPCPY	__stpcpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "stpcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
index 60a2ccfe53..1b252985e7 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
@@ -1,4 +1,3 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPNCPY	__stpncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "stpncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
index b2f8c19143..a46a8edbe2 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
@@ -3,6 +3,5 @@
 #endif
 
 #define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY	STPNCPY
-#include "strcpy-avx2.S"
+#define STRNCPY	STPNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
index 637fb557c4..94d51d10bd 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
@@ -1,12 +1,3 @@
-#ifndef STRCAT
-# define STRCAT __strcat_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCAT	__strcat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
 #include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
index d9b7fb2a43..3f914fa342 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
@@ -16,266 +16,10 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (3)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-#  define STRCAT  __strcat_avx2
-# endif
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE	32
-
-# ifndef SECTION
-#  define SECTION(p)	p##.avx
-# endif
-
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCAT)
-	mov	%rdi, %r9
-# ifdef USE_AS_STRNCAT
-	mov	%rdx, %r8
-# endif
-
-	xor	%eax, %eax
-	mov	%edi, %ecx
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	vpxor	%xmm6, %xmm6, %xmm6
-	cmp	$(VEC_SIZE * 3), %ecx
-	ja	L(fourth_vector_boundary)
-	vpcmpeqb (%rdi), %ymm6, %ymm0
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_first_vector)
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	jmp	L(align_vec_size_start)
-L(fourth_vector_boundary):
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	vpcmpeqb	(%rax), %ymm6, %ymm0
-	mov	$-1, %r10d
-	sub	%rax, %rcx
-	shl	%cl, %r10d
-	vpmovmskb %ymm0, %edx
-	and	%r10d, %edx
-	jnz	L(exit)
-
-L(align_vec_size_start):
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 5), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	add	$VEC_SIZE, %rax
-
-	.p2align 4
-L(align_four_vec_loop):
-	vmovaps	(%rax),	%ymm4
-	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
-	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
-	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
-	add	$(VEC_SIZE * 4),	%rax
-	vpminub	%ymm4,	%ymm5, %ymm5
-	vpcmpeqb %ymm5,	%ymm6, %ymm5
-	vpmovmskb %ymm5,	%edx
-	test	%edx,	%edx
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
-	sub	$(VEC_SIZE * 5),	%rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit):
-	sub	%rdi, %rax
-L(exit_null_on_first_vector):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_second_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$VEC_SIZE, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_third_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 2), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fourth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 3), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fifth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-
-	.p2align 4
-L(StartStrcpyPart):
-	lea	(%r9, %rax), %rdi
-	mov	%rsi, %rcx
-	mov	%r9, %rax      /* save result */
-
-# ifdef USE_AS_STRNCAT
-	test	%r8, %r8
-	jz	L(ExitZero)
-#  define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-avx2.S"
+#ifndef STRCAT
+# define STRCAT	__strcat_avx2
 #endif
+
+#define USE_AS_STRCAT
+#define STRCPY	STRCAT
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
new file mode 100644
index 0000000000..f50514e07c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
@@ -0,0 +1,101 @@
+/* strlen used for beginning of str{n}cat using AVX2.
+   Copyright (C) 2011-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+/* NOTE: This file is meant to be included by strcat-avx2 or
+   strncat-avx2 and does not stand alone.  Before including, %rdi
+   must be saved in %rax.  */
+
+
+/* Simple strlen implementation that ends at
+   L(strcat_strlen_done).  */
+	movq	%rdi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	VEC_SIZE(%r8), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	leaq	(VEC_SIZE)(%r8), %rdi
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v1)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v2)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v3)
+
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	.p2align 4,, 8
+L(loop_2x_vec):
+	VMOVA	(VEC_SIZE * 0 + 1)(%rdi), %VMM(0)
+	VPMIN	(VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rdi), %VMM(2)
+	VPMIN	(VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3)
+	VPMIN	%VMM(1), %VMM(3), %VMM(3)
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(3)
+	vpmovmskb %VMM(3), %r8d
+	subq	$(VEC_SIZE * -4), %rdi
+	testl	%r8d, %r8d
+	jz	L(loop_2x_vec)
+
+	addq	$(VEC_SIZE * -4 + 1), %rdi
+
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(1)
+	vpmovmskb %VMM(1), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v1)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(2)
+	vpmovmskb %VMM(2), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v2)
+
+	movl	%r8d, %ecx
+L(bsf_and_done_v3):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v2):
+	bsfl	%ecx, %ecx
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rdi
+	jmp	L(strcat_strlen_done)
+
+	.p2align 4,, 4
+L(bsf_and_done_v1):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v0):
+	bsfl	%ecx, %ecx
+	addq	%rcx, %rdi
+L(strcat_strlen_done):
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
index c2c581ecf7..fe80ffd265 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
@@ -1,12 +1,3 @@
-#ifndef STRCPY
-# define STRCPY __strcpy_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCPY	__strcpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
 #include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index c725834929..b87a1722d5 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -20,984 +20,378 @@
 
 #if ISA_SHOULD_BUILD (3)
 
+# include <sysdep.h>
 
-# ifndef USE_AS_STRCAT
-#  include <sysdep.h>
-
-#  ifndef STRCPY
-#   define STRCPY  __strcpy_avx2
-#  endif
-
-# endif
-
-/* Number of bytes in a vector register */
 # ifndef VEC_SIZE
-#  define VEC_SIZE	32
-# endif
-
-# ifndef VZEROUPPER
-#  define VZEROUPPER	vzeroupper
-# endif
-
-# ifndef SECTION
-#  define SECTION(p)	p##.avx
-# endif
-
-/* zero register */
-#define xmmZ	xmm0
-#define ymmZ	ymm0
-
-/* mask register */
-#define ymmM	ymm1
-
-# ifndef USE_AS_STRCAT
-
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCPY)
-#  ifdef USE_AS_STRNCPY
-	mov	%RDX_LP, %R8_LP
-	test	%R8_LP, %R8_LP
-	jz	L(ExitZero)
-#  endif
-	mov	%rsi, %rcx
-#  ifndef USE_AS_STPCPY
-	mov	%rdi, %rax      /* save result */
-#  endif
-
+#  include "x86-avx2-vecs.h"
 # endif
 
-	vpxor	%xmmZ, %xmmZ, %xmmZ
-
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	cmp	$(VEC_SIZE * 2), %ecx
-	jbe	L(SourceStringAlignmentLessTwoVecSize)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-
-	vpcmpeqb (%rsi), %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	shr	%cl, %rdx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	mov	$VEC_SIZE, %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  else
-	mov	$(VEC_SIZE + 1), %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  endif
-	jbe	L(CopyVecSizeTailCase2OrCase3)
+# ifndef STRCPY
+#  define STRCPY	__strcpy_avx2
 # endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail)
 
-	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
-	vpmovmskb %ymm2, %edx
+	/* Use movsb in page cross case to save code size.  */
+# define USE_MOVSB_IN_PAGE_CROSS	1
 
-# ifdef USE_AS_STRNCPY
-	add	$VEC_SIZE, %r10
-	cmp	%r10, %r8
-	jbe	L(CopyTwoVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize)
-
-	vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
-	vmovdqu %ymm2, (%rdi)
-
-/* If source address alignment != destination address alignment */
-	.p2align 4
-L(UnalignVecSizeBoth):
-	sub	%rcx, %rdi
-# ifdef USE_AS_STRNCPY
-	add	%rcx, %r8
-	sbb	%rcx, %rcx
-	or	%rcx, %r8
-# endif
-	mov	$VEC_SIZE, %rcx
-	vmovdqa (%rsi, %rcx), %ymm2
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 3), %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_WCSCPY
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
-	jnz	L(CopyVecSize)
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define PAGE_SIZE	4096
 
-	vmovdqu %ymm3, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
+# ifdef USE_AS_STPCPY
+#  define END_REG	rax
 # else
-	jnz	L(CopyVecSize)
+#  define END_REG	rdi, %rdx
 # endif
 
-	vmovdqu %ymm4, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STRCAT
+#  define PAGE_ALIGN_REG	ecx
 # else
-	jnz	L(CopyVecSize)
+#  define PAGE_ALIGN_REG	eax
 # endif
 
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
 
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
 
-	vmovdqu %ymm3, (%rdi, %rcx)
-	mov	%rsi, %rdx
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	and	$-(VEC_SIZE * 4), %rsi
-	sub	%rsi, %rdx
-	sub	%rdx, %rdi
-# ifdef USE_AS_STRNCPY
-	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
-	vmovdqa (%rsi), %ymm4
-	vmovdqa VEC_SIZE(%rsi), %ymm5
-	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
-	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
-	vpminub %ymm5, %ymm4, %ymm2
-	vpminub %ymm7, %ymm6, %ymm3
-	vpminub %ymm2, %ymm3, %ymm3
-	vpcmpeqb %ymmM, %ymm3, %ymm3
-	vpmovmskb %ymm3, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
-	add	$(VEC_SIZE * 4), %rdi
-	add	$(VEC_SIZE * 4), %rsi
-	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
-	vmovdqa (%rsi), %ymm4
-	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
-	vmovdqa VEC_SIZE(%rsi), %ymm5
-	vpminub %ymm5, %ymm4, %ymm2
-	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
-	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
-	vmovdqu %ymm7, -VEC_SIZE(%rdi)
-	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
-	vpminub %ymm7, %ymm6, %ymm3
-	vpminub %ymm2, %ymm3, %ymm3
-	vpcmpeqb %ymmM, %ymm3, %ymm3
-	vpmovmskb %ymm3, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jz	L(UnalignedFourVecSizeLoop_start)
-
-L(UnalignedFourVecSizeLeave):
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_0)
-
-	vpcmpeqb %ymm5, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %ecx
-	test	%ecx, %ecx
-	jnz	L(CopyVecSizeUnaligned_16)
-
-	vpcmpeqb %ymm6, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_32)
-
-	vpcmpeqb %ymm7, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %ecx
-	bsf	%ecx, %edx
-	vmovdqu %ymm4, (%rdi)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 3), %rsi
-	add	$(VEC_SIZE * 3), %rdi
-	jmp	L(CopyVecSizeExit)
+# ifdef USE_AS_STRCAT
+	movq	%rdi, %rax
+#  include "strcat-strlen-avx2.S"
 # endif
 
-/* If source address alignment == destination address alignment */
-
-L(SourceStringAlignmentLessTwoVecSize):
-	vmovdqu (%rsi), %ymm3
-	vmovdqu VEC_SIZE(%rsi), %ymm2
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$VEC_SIZE, %r8
-#  else
-	cmp	$(VEC_SIZE + 1), %r8
-#  endif
-	jbe	L(CopyVecSizeTail1Case2OrCase3)
+	movl	%esi, %PAGE_ALIGN_REG
+	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
+	ja	L(page_cross)
+L(page_cross_continue):
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 # endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail1)
-
-	vmovdqu %ymm3, (%rdi)
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$(VEC_SIZE * 2), %r8
-#  else
-	cmp	$((VEC_SIZE * 2) + 1), %r8
-#  endif
-	jbe	L(CopyTwoVecSize1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize1)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-	jmp	L(UnalignVecSizeBoth)
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
 
-/*------End of main part with loops---------------------*/
+	testl	%ecx, %ecx
+	jz	L(more_1x_vec)
 
-/* Case1 */
+	/* We no longer need the ymm registers so just vzeroupper so it
+	   doesn't need to be duplicated at each return statement.  */
+	COND_VZEROUPPER
 
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
-	.p2align 4
-L(CopyVecSize):
-	add	%rcx, %rdi
-# endif
-L(CopyVecSizeTail):
-	add	%rcx, %rsi
-L(CopyVecSizeTail1):
-	bsf	%edx, %edx
-L(CopyVecSizeExit):
-	cmp	$32, %edx
-	jae	L(Exit32_63)
-	cmp	$16, %edx
-	jae	L(Exit16_31)
-	cmp	$8, %edx
-	jae	L(Exit8_15)
-	cmp	$4, %edx
-	jae	L(Exit4_7)
-	cmp	$3, %edx
-	je	L(Exit3)
-	cmp	$1, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	movb	$0, (%rdi)
+	xorl	%edx, %edx
+	bsfl	%ecx, %edx
 # ifdef USE_AS_STPCPY
-	lea	(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$1, %r8
-	lea	1(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
-
-	.p2align 4
-L(CopyTwoVecSize1):
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$VEC_SIZE, %r8
-# endif
-	jmp	L(CopyVecSizeTail1)
-
-	.p2align 4
-L(CopyTwoVecSize):
-	bsf	%edx, %edx
-	add	%rcx, %rsi
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	jmp	L(CopyVecSizeExit)
-
-	.p2align 4
-L(CopyVecSizeUnaligned_0):
-	bsf	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm4, (%rdi)
-	add	$((VEC_SIZE * 4) - 1), %r8
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	jmp	L(CopyVecSizeExit)
-# endif
-
-	.p2align 4
-L(CopyVecSizeUnaligned_16):
-	bsf	%ecx, %edx
-	vmovdqu %ymm4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	VEC_SIZE(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	add	$((VEC_SIZE * 3) - 1), %r8
-	sub	%rdx, %r8
-	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
+	leaq	(%rdi, %rdx), %rax
+# endif
+
+	/* Use mask bits in rcx to detect which copy we need. If the low
+	   mask is zero then there must be a bit set in the upper half.
+	   I.e. if ecx != 0 and cx == 0, then the match must be in the
+	   upper 16 bits so we use L(copy_16_31).  */
+	testw	%cx, %cx
+	jz	L(copy_16_31)
+
+	testb	%cl, %cl
+	jz	L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+	vmovd	%xmm0, (%rdi)
+	movl	$0, (%END_REG)
+	ret
 # else
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
-
-	.p2align 4
-L(CopyVecSizeUnaligned_32):
-	bsf	%edx, %edx
-	vmovdqu %ymm4, (%rdi)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	add	$((VEC_SIZE * 2) - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
+	testb	$0x7, %cl
+	jz	L(copy_4_7)
+
+	testl	%edx, %edx
+	jz	L(set_null_term)
+	vmovd	%xmm0, %ecx
+	movw	%cx, (%rdi)
+
+	.p2align 4,, 2
+L(set_null_term):
+	movb	$0, (%END_REG)
+	ret
+
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-3(%rsi, %rdx), %ecx
+	vmovd	%xmm0, (%rdi)
+	movl	%ecx, -3(%END_REG)
+	ret
+# endif
+
+	.p2align 4,, 10
+L(copy_16_31):
+	VMOVU	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
+	ret
+
+	.p2align 4,, 10
+L(copy_8_15):
+# ifdef USE_AS_WCSCPY
+	movl	-(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
 # else
-	add	$(VEC_SIZE * 2), %rsi
-	add	$(VEC_SIZE * 2), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
-
-# ifdef USE_AS_STRNCPY
-#  ifndef USE_AS_STRCAT
-	.p2align 4
-L(CopyVecSizeUnalignedVec6):
-	vmovdqu %ymm6, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec5):
-	vmovdqu %ymm5, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec4):
-	vmovdqu %ymm4, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec3):
-	vmovdqu %ymm3, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-#  endif
-
-/* Case2 */
-
-	.p2align 4
-L(CopyVecSizeCase2):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTailCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTail1Case2):
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-/* Case2 or Case3,  Case3 */
-
-	.p2align 4
-L(CopyVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeCase2)
-L(CopyVecSizeCase3):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyTwoVecSizeCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyVecSizeTailCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTailCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSize1Case2OrCase3):
-	add	$VEC_SIZE, %rdi
-	add	$VEC_SIZE, %rsi
-	sub	$VEC_SIZE, %r8
-L(CopyVecSizeTail1Case2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTail1Case2)
-	jmp	L(StrncpyExit)
-# endif
-
-/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
-
-	.p2align 4
-L(Exit1):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$2, %r8
-	lea	2(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit2):
-	movzwl	(%rsi), %ecx
-	mov	%cx, (%rdi)
-	movb	$0, 2(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$3, %r8
-	lea	3(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit3):
-	mov	(%rsi), %edx
-	mov	%edx, (%rdi)
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
+# endif
+	vmovq	%xmm0, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
+	ret
+
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rdi)
+# endif
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	addq	%rsi, %rdi
+	VMOVA	1(%rsi), %VMM(1)
+
+	/* Try and order stores after as many loads as is reasonable to
+	   avoid potential false dependencies.  */
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rax)
+# endif
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE + 1)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), 1(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE + 1)(%rdi)
+
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %edx
+	testl	%edx, %edx
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 3 + 1)(%rdi)
+
+	/* Subtract rsi from rdi before aligning. Adding back rsi will
+	   give the proper rdi (dst) for the new src.  */
+	subq	%rsi, %rdi
+	incq	%rsi
+	orq	$(VEC_SIZE * 4 - 1), %rsi
+
+	/* Do the first half of the loop ahead of time so the loop can
+	   just start by storing.  */
+	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %edx
+	addq	%rsi, %rdi
+
+	testl	%edx, %edx
+	jnz	L(loop_4x_done)
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 1)(%rdi)
+
+
+	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %edx
+	subq	$(VEC_SIZE * -4), %rdi
+	testl	%edx, %edx
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
+L(ret_vec_x4):
+	bsfl	%edx, %edx
+	VMOVU	((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
 # ifdef USE_AS_STPCPY
-	lea	3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$4, %r8
-	lea	4(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
 # endif
+L(return_end):
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(Exit4_7):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	-3(%rsi, %rdx), %ecx
-	mov	%ecx, -3(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x1):
+	bsfl	%ecx, %ecx
+	VMOVU	(1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	1(%rcx, %rdi), %rax
 # endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit8_15):
-	mov	(%rsi), %rcx
-	mov	-7(%rsi, %rdx), %r9
-	mov	%rcx, (%rdi)
-	mov	%r9, -7(%rdi, %rdx)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(Exit16_31):
-	vmovdqu (%rsi), %xmm2
-	vmovdqu -15(%rsi, %rdx), %xmm3
-	vmovdqu %xmm2, (%rdi)
-	vmovdqu %xmm3, -15(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x2):
+	bsfl	%ecx, %ecx
+	VMOVU	((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub %rdx, %r8
-	sub $1, %r8
-	lea 1(%rdi, %rdx), %rdi
-	jnz L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
 # endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(Exit32_63):
-	vmovdqu (%rsi), %ymm2
-	vmovdqu -31(%rsi, %rdx), %ymm3
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, -31(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsfl	%ecx, %ecx
+	VMOVU	((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
 # endif
 	VZEROUPPER_RETURN
 
-# ifdef USE_AS_STRNCPY
 
-	.p2align 4
-L(StrncpyExit1):
-	movzbl	(%rsi), %edx
-	mov	%dl, (%rdi)
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %rcx
+	andq	$(VEC_SIZE * -1), %rcx
+
+	VPCMPEQ	(%rcx), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+# if USE_MOVSB_IN_PAGE_CROSS
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* This adds one to the later result which will give the correct
+	   copy bounds. NB: this can never zero-out a non-zero RCX as
+	   to be in the page cross case rsi cannot be aligned and we
+	   already right-shift rcx by the misalignment.  */
+	shll	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsfl	%ecx, %ecx
+#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
+#  endif
+	rep	movsb
 #  ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 1(%rdi)
+	leaq	-CHAR_SIZE(%rdi), %rax
 #  endif
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(StrncpyExit2):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 2(%rdi)
-#  endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(StrncpyExit3_4):
-	movzwl	(%rsi), %ecx
-	movzwl	-2(%rsi, %r8), %edx
-	mov	%cx, (%rdi)
-	mov	%dx, -2(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit5_8):
-	mov	(%rsi), %ecx
-	mov	-4(%rsi, %r8), %edx
-	mov	%ecx, (%rdi)
-	mov	%edx, -4(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit9_16):
-	mov	(%rsi), %rcx
-	mov	-8(%rsi, %r8), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit17_32):
-	vmovdqu (%rsi), %xmm2
-	vmovdqu -16(%rsi, %r8), %xmm3
-	vmovdqu %xmm2, (%rdi)
-	vmovdqu %xmm3, -16(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit33_64):
-	/*  0/32, 31/16 */
-	vmovdqu (%rsi), %ymm2
-	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit65):
-	/* 0/32, 32/32, 64/1 */
-	vmovdqu (%rsi), %ymm2
-	vmovdqu 32(%rsi), %ymm3
-	mov	64(%rsi), %cl
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, 32(%rdi)
-	mov	%cl, 64(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	65(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 65(%rdi)
-#  endif
-	VZEROUPPER_RETURN
+# else
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
 
+	/* Traditional copy case, essentially the same as used in the
+	   non-page-cross case, but since we can't reuse VMM(0) we need
+	   twice as many loads from rsi.  */
 #  ifndef USE_AS_STRCAT
-
-	.p2align 4
-L(Fill1):
-	mov	%dl, (%rdi)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill2):
-	mov	%dx, (%rdi)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill3_4):
-	mov	%dx, (%rdi)
-	mov     %dx, -2(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill5_8):
-	mov	%edx, (%rdi)
-	mov     %edx, -4(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill9_16):
-	mov	%rdx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill17_32):
-	vmovdqu %xmmZ, (%rdi)
-	vmovdqu %xmmZ, -16(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec2):
-	vmovdqu %ymm2, (%rdi, %rcx)
-
-	.p2align 4
-L(CopyVecSizeVecExit):
-	bsf	%edx, %edx
-	add	$(VEC_SIZE - 1), %r8
-	add	%rcx, %rdi
-#   ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-#   endif
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-
-	.p2align 4
-L(StrncpyFillTailWithZero):
-	xor	%edx, %edx
-	sub	$VEC_SIZE, %r8
-	jbe	L(StrncpyFillExit)
-
-	vmovdqu %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-
-	mov	%rdi, %rsi
-	and	$(VEC_SIZE - 1), %esi
-	sub	%rsi, %rdi
-	add	%rsi, %r8
-	sub	$(VEC_SIZE * 4), %r8
-	jb	L(StrncpyFillLessFourVecSize)
-
-L(StrncpyFillLoopVmovdqa):
-	vmovdqa %ymmZ, (%rdi)
-	vmovdqa %ymmZ, VEC_SIZE(%rdi)
-	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
-	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE * 4), %rdi
-	sub	$(VEC_SIZE * 4), %r8
-	jae	L(StrncpyFillLoopVmovdqa)
-
-L(StrncpyFillLessFourVecSize):
-	add	$(VEC_SIZE * 2), %r8
-	jl	L(StrncpyFillLessTwoVecSize)
-	vmovdqa %ymmZ, (%rdi)
-	vmovdqa %ymmZ, VEC_SIZE(%rdi)
-	add	$(VEC_SIZE * 2), %rdi
-	sub	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	vmovdqa %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillLessTwoVecSize):
-	add	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	vmovdqa %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillExit):
-	add	$VEC_SIZE, %r8
-L(Fill):
-	cmp	$17, %r8d
-	jae	L(Fill17_32)
-	cmp	$9, %r8d
-	jae	L(Fill9_16)
-	cmp	$5, %r8d
-	jae	L(Fill5_8)
-	cmp	$3, %r8d
-	jae	L(Fill3_4)
-	cmp	$1, %r8d
-	ja	L(Fill2)
-	je	L(Fill1)
-	VZEROUPPER_RETURN
-
-/* end of ifndef USE_AS_STRCAT */
+	xorl	%edx, %edx
 #  endif
-
-	.p2align 4
-L(UnalignedLeaveCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(UnalignedFourVecSizeLeaveCase2)
-L(UnalignedFourVecSizeLeaveCase3):
-	lea	(VEC_SIZE * 4)(%r8), %rcx
-	and	$-VEC_SIZE, %rcx
-	add	$(VEC_SIZE * 3), %r8
-	jl	L(CopyVecSizeCase3)
-	vmovdqu %ymm4, (%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+	bsfl	%ecx, %edx
 #  ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 4)(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (VEC_SIZE * 4)(%rdi)
+	leaq	(%rdi, %rdx), %rax
+#  elif !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(UnalignedFourVecSizeLeaveCase2):
-	xor	%ecx, %ecx
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$(VEC_SIZE * 3), %r8
-	jle	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-	vpcmpeqb %ymm5, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm4, (%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec5)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
+	/* vzeroupper early to avoid duplicating at each return.  */
+	COND_VZEROUPPER
 
-	vpcmpeqb %ymm6, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec6)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
+	testw	%cx, %cx
+	jz	L(page_cross_copy_16_31)
 
-	vpcmpeqb %ymm7, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	lea	VEC_SIZE(%rdi, %rcx), %rdi
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-L(StrncpyExit):
-	cmp	$65, %r8d
-	je	L(StrncpyExit65)
-	cmp	$33, %r8d
-	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8d
-	jae	L(StrncpyExit17_32)
-	cmp	$9, %r8d
-	jae	L(StrncpyExit9_16)
-	cmp	$5, %r8d
-	jae	L(StrncpyExit5_8)
-	cmp	$3, %r8d
-	jae	L(StrncpyExit3_4)
-	cmp	$1, %r8d
-	ja	L(StrncpyExit2)
-	je	L(StrncpyExit1)
-#  ifdef USE_AS_STPCPY
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(ExitZero):
-#  ifndef USE_AS_STRCAT
-	mov	%rdi, %rax
-#  endif
-	VZEROUPPER_RETURN
+	testb	%cl, %cl
+	jz	L(page_cross_copy_8_15)
 
-# endif
+	testl	$0x7, %cl
+	jz	L(page_cross_copy_4_7)
 
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
-# endif
+	testl	%edx, %edx
+	jz	L(page_cross_set_null_term)
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+L(page_cross_set_null_term):
+	movb	$0, (%END_REG)
+	ret
+
+	.p2align 4,, 4
+L(page_cross_copy_4_7):
+	movl	(%rsi), %ecx
+	movl	-3(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, -3(%END_REG)
+	ret
+
+	.p2align 4,, 4
+L(page_cross_copy_8_15):
+	movq	(%rsi), %rcx
+	movq	-7(%rsi, %rdx), %rsi
+	movq	%rcx, (%rdi)
+	movq	%rsi, -7(%END_REG)
+	ret
+
+
+	.p2align 4,, 3
+L(page_cross_copy_16_31):
+	VMOVU	(%rsi), %xmm0
+	VMOVU	-15(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -15(%END_REG)
+	ret
+# endif
+
+END(STRCPY)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
index 0dcea18dbb..2bbdbb91ab 100644
--- a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_avx2_rtm
-#include "strcat-avx2-rtm.S"
+#define STRNCAT	__strncat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "strncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
index 52ecbca943..547cef9486 100644
--- a/sysdeps/x86_64/multiarch/strncat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
@@ -1,7 +1,419 @@
-#ifndef STRNCAT
-# define STRNCAT	__strncat_avx2
-#endif
+/* strncat with AVX2
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-avx2-vecs.h"
+# endif
+
+# ifndef STRNCAT
+#  define STRNCAT	__strncat_avx2
+# endif
+
+# ifdef USE_AS_WCSCPY
+#  define movNULL	movl
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define movNULL	movb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+	/* Filter zero length strings and very long strings.  Zero
+	   length strings just return; very long strings are handled by
+	   using the non-length variant {wcs|str}cat.  */
+	movq	%rdi, %rax
+# ifdef USE_AS_WCSCPY
+	leaq	-1(%rdx), %rcx
+	shr	$56, %rcx
+	jnz	L(zero_len)
+	salq	$2, %rdx
+# else
+	test	%rdx, %rdx
+	jl	L(zero_len)
+# endif
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
+
+# include "strcat-strlen-avx2.S"
+
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	tzcnt	%ecx, %r8d
+	cmpq	%r8, %rdx
+	jbe	L(less_1x_vec)
+
+	testl	%ecx, %ecx
+	jz	L(more_1x_vec)
+
+	/* Hoist this to save code size.  */
+
+	movl	%r8d, %edx
+
+L(less_1x_vec):
+	COND_VZEROUPPER
+
+	cmpl	$16, %edx
+	jae	L(copy_16_31)
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
+
+
+# ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+# else
+	cmpl	$4, %edx
+	jae	L(copy_4_7)
+
+	movzbl	(%rsi), %ecx
+	cmpl	$1, %edx
+	jbe	L(set_null_term)
+
+	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
+	 */
+	movzwl	1(%rsi), %esi
+	movw	%si, 1(%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	movb	%cl, (%rdi)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 11
+L(copy_4_7):
+	movl	-(4)(%rsi, %rdx), %ecx
+	vmovd	%xmm0, (%rdi)
+	movl	%ecx, -(4)(%rdi, %rdx)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+# endif
+
+
+	.p2align 4,, 10
+L(copy_16_31):
+	VMOVU	-(16)(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -(16)(%rdi, %rdx)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 10
+L(copy_8_15):
+	movq	-(8)(%rsi, %rdx), %rcx
+	vmovq	%xmm0, (%rdi)
+	movq	%rcx, -(8)(%rdi, %rdx)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+	.p2align 6,, 14
+L(more_1x_vec):
+	VMOVU	%VMM(0), (%rdi)
+
+	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	incq	%rsi
+	addq	%rsi, %rdi
+L(loop_last_4x_vec):
+	subq	%rsi, %rdx
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	tzcnt	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len)
+
+	cmpl	$VEC_SIZE, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	addl	$-VEC_SIZE, %edx
+	bzhil	%edx, %ecx, %r8d
+	jz	L(ret_vec_x2_len)
+L(ret_vec_x2):
+	bsfl	%ecx, %edx
+L(ret_vec_x2_len):
+	VMOVU	(%rsi, %rdx), %VMM(0)
+	movNULL	$0, (VEC_SIZE)(%rdi, %rdx)
+	VMOVU	%VMM(0), (%rdi, %rdx)
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+
+	.p2align 4,, 12
+L(ret_vec_x1_len):
+	movl	%edx, %ecx
+L(ret_vec_x1):
+	VMOVU	-(VEC_SIZE)(%rsi, %rcx), %VMM(1)
+	movNULL	$0, (%rdi, %rcx)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rcx)
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	subq	$-(VEC_SIZE * 4), %rsi
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	addl	$-(VEC_SIZE * 4), %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+	/* L(ret_vec_x1) expects ecx to have position of first match so
+	   test with bsf.  */
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
 
-#define USE_AS_STRNCAT
-#define STRCAT	STRNCAT
-#include "strcat-avx2.S"
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 1)(%rdi)
+
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* Check if length is greater than 4x VEC.  */
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	addl	$(VEC_SIZE * -2), %edx
+
+	tzcnt	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len)
+
+	cmpl	$VEC_SIZE, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	addl	$-VEC_SIZE, %edx
+	bzhil	%edx, %ecx, %r8d
+	jz	L(ret_vec_x4_len)
+L(ret_vec_x4):
+	bsfl	%ecx, %edx
+L(ret_vec_x4_len):
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 4
+L(ret_vec_x3_len):
+	movl	%edx, %ecx
+L(ret_vec_x3):
+	VMOVU	(VEC_SIZE)(%rsi, %rcx), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 2)(%rdi, %rcx)
+	VMOVU	%VMM(0), (VEC_SIZE)(%rdi, %rcx)
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
+
+
+	/* Recheck length before aligning.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %r8d
+	addq	%rsi, %rdi
+	testl	%r8d, %r8d
+	jnz	L(loop_4x_done)
+
+	/* Use r9 for end of region before handling last 4x VEC
+	   specially.  */
+	leaq	-(VEC_SIZE * 4)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %r8d
+
+	testl	%r8d, %r8d
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	/* L(ret_vec_x1) expects ecx to have position of first match so
+	   test with bsf.  */
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	bsfl	%r8d, %r8d
+	VMOVU	(VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
+	VZEROUPPER_RETURN
+
+
+
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+
+	VPCMPEQ	(%r8), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+
+	subl	%esi, %r8d
+	andl	$(VEC_SIZE - 1), %r8d
+	cmpq	%r8, %rdx
+	jb	L(page_cross_small)
+
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* This shift adds one to the later result, which gives the
+	   correct copy bounds.  NB: this can never zero out a non-zero
+	   RCX because in the page-cross case rsi cannot be aligned and
+	   we have already right-shifted rcx by the misalignment.  */
+	shll	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsfl	%ecx, %ecx
+	rep	movsb
+	VZEROUPPER_RETURN
+
+L(page_cross_small):
+	tzcntl	%ecx, %ecx
+	jz	L(page_cross_setz)
+	cmpl	%edx, %ecx
+	cmova	%edx, %ecx
+	rep	movsb
+L(page_cross_setz):
+	movNULL	$0, (%rdi)
+	VZEROUPPER_RETURN
+L(zero_len):
+# ifdef USE_AS_WCSCPY
+	test	%rdx, %rdx
+# endif
+	jnz	OVERFLOW_STRCAT
+	ret
+
+
+END(STRNCAT)
+#endif
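
For reference, the length-limited append plus unconditional null
termination that the strncat-avx2.S code above implements can be
sketched in plain C as follows (an illustration of the required
behavior only, not the generic glibc implementation; the helper name
is made up for this sketch):

  #include <stddef.h>

  /* Reference sketch of strncat semantics: append at most N bytes of
     SRC to the end of DST and always write a null terminator.  */
  char *
  strncat_ref (char *dst, const char *src, size_t n)
  {
    char *end = dst;
    while (*end != '\0')	/* Find the current end of DST.  */
      end++;
    size_t i = 0;
    for (; i < n && src[i] != '\0'; i++)
      end[i] = src[i];
    end[i] = '\0';		/* Null-terminate unconditionally.  */
    return dst;
  }
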
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
index 79e7083299..b582a4a7a1 100644
--- a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STRNCPY	__strncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
index ce634e94fa..d1b25b7a42 100644
--- a/sysdeps/x86_64/multiarch/strncpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
@@ -1,7 +1,735 @@
-#ifndef STRNCPY
-# define STRNCPY	__strncpy_avx2
-#endif
+/* strncpy with AVX2
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+
+# ifndef VEC_SIZE
+#  include "x86-avx2-vecs.h"
+# endif
+
+# ifndef STRNCPY
+#  define STRNCPY	__strncpy_avx2
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+	/* Filter zero length strings and very long strings.  Zero
+	   length strings just return; very long strings are handled by
+	   running rep stos{b|l} to zero-fill the destination (which
+	   will almost certainly segfault), and if that succeeds by
+	   calling OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
+# ifdef USE_AS_WCSCPY
+	decq	%rdx
+	movq	%rdx, %rax
+	/* 56 is end of max supported address space.  */
+	shr	$56, %rax
+	jnz	L(zero_len)
+	salq	$2, %rdx
+# else
+	decq	%rdx
+	/* `dec` can macro-fuse with `jl`.  If the branch needs to
+	   become `jb`, replace `dec` with `sub` (`dec` does not set the
+	   carry flag).  */
+	jl	L(zero_len)
+# endif
+
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
+	movl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* If not STPCPY, just save the return value (dst) ahead of
+	   time.  */
+# ifndef USE_AS_STPCPY
+	movq	%rdi, %rax
+# elif defined USE_AS_WCSCPY
+	/* Clear the %rax dependency as nearly all return paths for
+	   wcpncpy use `setc %al`.  */
+	xorl	%eax, %eax
+# endif
+
+	cmpq	$(VEC_SIZE - CHAR_SIZE), %rdx
+	/* `jb` because length rdx is now length - CHAR_SIZE.  */
+	jbe	L(less_1x_vec)
+
+	/* This may store more than is needed, but that's fine because
+	   we still need to zero fill.  */
+	VMOVU	%VMM(0), (%rdi)
+
+	testl	%ecx, %ecx
+	jnz	L(zfill)
+
+	/* Align.  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	incq	%rsi
+L(last_4x_vec):
+	addq	%rsi, %rdi
+L(loop_last_4x_vec):
+	subq	%rsi, %rdx
+
+
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jae	L(more_2x_vec)
+
+	cmpl	$(VEC_SIZE), %edx
+	jb	L(ret_vec_x1_len)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VPCMPEQ	VEC_SIZE(%rsi), %VZERO, %VMM(6)
+	VMOVU	%VMM(1), (%rdi)
+	vpmovmskb %VMM(6), %ecx
+	shlq	$VEC_SIZE, %rcx
+L(ret_vec_x1_len):
+	tzcntq	%rcx, %rcx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x1_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+	VMOVU	((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	.p2align 4,, 6
+L(ret_vec_x1):
+	bsfl	%ecx, %ecx
+	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+	subl	%ecx, %edx
+	/* Check if we need to reload/store.  */
+	cmpl	$VEC_SIZE, %edx
+	jb	L(ret_vec_x1_len_no_zfill_mov)
+	/* Otherwise safe to just store directly.  */
+	VMOVU	%VMM(1), (%rdi)
+	VMOVU	%VZERO, (%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 12
+L(more_2x_vec):
+	VMOVU	%VMM(1), (%rdi)
+	testl	%ecx, %ecx
+	/* Must fill at least 2x VEC.  */
+	jnz	L(zfill_vec1)
+
+	VMOVA	VEC_SIZE(%rsi), %VMM(2)
+	VMOVU	%VMM(2), VEC_SIZE(%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	/* Must fill at least 1x VEC.  */
+	jnz	L(zfill_vec2)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* Check if len is more than 4x VEC.  -CHAR_SIZE because rdx is
+	   len - CHAR_SIZE.  */
+	cmpq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+	ja	L(more_4x_vec)
+
+	subl	$(VEC_SIZE * 3), %edx
+	jb	L(ret_vec_x3_len)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	vpmovmskb %VMM(6), %ecx
+	tzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x4_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+	movl	%ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 3 + 0)(%edx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+
+L(ret_vec_x3_len):
+	addl	$(VEC_SIZE * 1), %edx
+	tzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x3_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+	.p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 2 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsfl	%ecx, %ecx
+	VMOVU	%VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
+	subl	%ecx, %edx
+	jl	L(ret_vec_x3_len_no_zfill_mov)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(more_4x_vec):
+
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec3)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec4)
+
+	movq	%rdx, %rcx
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	/* Recheck length before aligning.  */
+	cmpq	$(VEC_SIZE * 8 - CHAR_SIZE), %rcx
+	jbe	L(last_4x_vec)
+
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %r8d
+	addq	%rsi, %rdi
+	testl	%r8d, %r8d
+	jnz	L(loop_4x_done)
+
+	/* Use r9 as end register.  */
+	leaq	-(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
 
-#define USE_AS_STRNCPY
-#define STRCPY	STRNCPY
-#include "strcpy-avx2.S"
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %r8d
+
+	testl	%r8d, %r8d
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	subq	%rsi, %rdx
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec1)
+
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec2)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec3)
+
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+	movl	%r8d, %ecx
+
+	/* Zfill more.  */
+
+	.p2align 4,, 4
+L(zfill_vec4):
+	addq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2), %rdx
+L(zfill_vec2):
+	shlq	$VEC_SIZE, %rcx
+L(zfill):
+	bsfq	%rcx, %rcx
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_from_page_cross):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(zfill_less_vec_vzeroupper)
+
+L(zfill_more_1x_vec):
+	VMOVU	%VZERO, CHAR_SIZE(%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jae	L(zfill_more_2x_vec)
+L(zfill_done0):
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(zfill_vec3):
+	addq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2), %rdx
+	.p2align 4,, 2
+L(zfill_vec1):
+	bsfl	%ecx, %ecx
+	addq	%rcx, %rdi
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+	/* zfill from vec1/vec3 must set at least 2x VECS.  */
+
+	VMOVU	%VZERO, CHAR_SIZE(%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jb	L(zfill_done0)
+L(zfill_more_2x_vec):
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
+	subq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+	jbe	L(zfill_done)
+
+	addq	%rdi, %rdx
+	VMOVU	%VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
+
+
+	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+	subq	$-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
+	cmpq	%rdi, %rdx
+	jbe	L(zfill_done)
+
+	andq	$-(VEC_SIZE), %rdi
+	.p2align 4,, 12
+L(zfill_loop_4x_vec):
+	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	ja	L(zfill_loop_4x_vec)
+L(zfill_done):
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(copy_1x):
+	VMOVU	%VMM(0), (%rdi)
+	testl	%ecx, %ecx
+	jz	L(ret_32_32)
+L(zfill_less_vec):
+	bsfl	%ecx, %ecx
+L(zfill_less_vec_no_bsf):
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_less_vec_vzeroupper):
+	COND_VZEROUPPER
+	/* We are taking advantage of the fact that to be here we must
+	   be writing the null terminator at (%rdi, %rcx), so we have a
+	   byte of leeway for overwriting.  */
+	cmpl	$16, %edx
+	jb	L(zfill_less_16)
+	VMOVU	%VZERO_128, (%rdi)
+	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
+	ret
+# ifdef USE_AS_STPCPY
+L(ret_32_32):
+	leaq	CHAR_SIZE(%rdi, %rdx), %rax
+	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 4
+L(copy_16_31):
+	/* Overfill to avoid branches.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	ja	L(zfill_less_vec_no_bsf)
+# ifndef USE_AS_STPCPY
+L(ret_32_32):
+# else
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 4
+L(copy_8_15):
+	/* Overfill to avoid branches.  */
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
+	vmovq	%xmm0, (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_8_15)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+	.p2align 4,, 8
+L(zfill_less_16):
+	xorl	%ecx, %ecx
+	cmpl	$8, %edx
+	jb	L(zfill_less_8)
+	movq	%rcx, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
+# ifndef USE_AS_STPCPY
+L(ret_8_15):
+# endif
+	ret
+
+
+	.p2align 4,, 8
+L(less_1x_vec):
+	/* Reuse the flags from `cmp $VEC_SIZE, %rdx`.  The idea is that
+	   many buffer sizes are conventionally aligned.  */
+	je	L(copy_1x)
+
+	tzcntl	%ecx, %ecx
+	cmpl	$16, %edx
+	jae	L(copy_16_31)
+
+	COND_VZEROUPPER
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+	testl	%ecx, %ecx
+	jz	L(zfill_less_8_set_ret)
+
+	movl	(%rsi, %rdx), %esi
+	vmovd	%xmm0, (%rdi)
+	movl	%esi, (%rdi, %rdx)
+
+#  ifdef USE_AS_STPCPY
+	cmpl	%ecx, %edx
+L(ret_8_15):
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  endif
+	ret
+L(zfill_less_8_set_ret):
+	xorl	%ecx, %ecx
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+L(zfill_less_8):
+	movl	%ecx, (%rdi)
+	movl	%ecx, (%rdi, %rdx)
+	ret
+
+# else
+	cmpl	$3, %edx
+	jb	L(copy_0_3)
+	/* Overfill to avoid branches.  */
+	movl	-3(%rsi, %rdx), %esi
+	vmovd	%xmm0, (%rdi)
+	movl	%esi, -3(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_4_7)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+	xorl	%ecx, %ecx
+	.p2align 4,, 8
+L(zfill_less_8):
+	cmpl	$3, %edx
+	jb	L(zfill_less_3)
+	movl	%ecx, (%rdi)
+	movl	%ecx, -3(%rdi, %rdx)
+#  ifdef USE_AS_STPCPY
+	ret
+#  endif
+
+L(ret_4_7):
+#  ifdef USE_AS_STPCPY
+L(ret_8_15):
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+	ret
+
+	.p2align 4,, 4
+L(zfill_less_3):
+	testl	%edx, %edx
+	jz	L(zfill_1)
+	movw	%cx, (%rdi)
+L(zfill_1):
+	movb	%cl, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_0_3):
+	vmovd	%xmm0, %r8d
+	testl	%edx, %edx
+	jz	L(copy_1)
+	movw	%r8w, (%rdi)
+	cmpl	%ecx, %edx
+	ja	L(zfill_from_1)
+	movzbl	(%rsi, %rdx), %r8d
+#  ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+	movb	%r8b, (%rdi, %rdx)
+	ret
+#  endif
+
+L(copy_1):
+#  ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	cmpl	%ecx, %edx
+	adcq	%rdi, %rax
+#  endif
+#  ifdef USE_AS_WCSCPY
+	vmovd	%xmm0, (%rdi)
+#  else
+	movb	%r8b, (%rdi, %rdx)
+#  endif
+	ret
+# endif
+
+	.p2align 4,, 2
+L(zero_len):
+	movq	%rdi, %rax
+	ret
+# ifndef USE_AS_WCSCPY
+	.p2align 4,, 8
+L(zfill_from_1):
+#  ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+#  endif
+	movw	$0, -1(%rdi, %rdx)
+	ret
+# endif
+
+	.p2align 4,, 4
+	.p2align 6,, 8
+L(page_cross):
+	movq	%rsi, %rax
+	andq	$(VEC_SIZE * -1), %rax
+
+	VPCMPEQ	(%rax), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+
+	subl	%esi, %eax
+	andl	$(VEC_SIZE - 1), %eax
+	cmpq	%rax, %rdx
+	jb	L(page_cross_small)
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* If rcx is non-zero then continue.  */
+	shl	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsf	%ecx, %ecx
+
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	leaq	-CHAR_SIZE(%rdi, %rcx), %rax
+# else
+	movq	%rdi, %rax
+# endif
+
+	rep	movsb
+# ifdef USE_AS_WCSCPY
+	movl	$0, (%rdi)
+# else
+	movb	$0, (%rdi)
+# endif
+	jmp	L(zfill_from_page_cross)
+
+L(page_cross_small):
+	tzcntl	%ecx, %ecx
+	xorl	%eax, %eax
+	cmpl	%ecx, %edx
+	jbe	L(page_cross_copy_only)
+
+	/* Do a zfill of the tail before copying.  */
+	movq	%rdi, %r9
+	movl	%ecx, %r8d
+
+	subl	%ecx, %edx
+	leaq	CHAR_SIZE(%rdi, %rcx), %rdi
+	movl	%edx, %ecx
+	rep	stosb
+	movq	%r9, %rdi
+	movl	%r8d, %edx
+L(page_cross_copy_only):
+	leal	CHAR_SIZE(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdi, %rdx
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# else
+	movq	%rdi, %rax
+# endif
+	rep	movsb
+	ret
+
+
+L(best_effort_strncpy):
+	movq	%rdx, %rcx
+	xorl	%eax, %eax
+	movq	%rdi, %r8
+	/* The length is >= 2^63.  We very much expect to segfault at
+	   rep stos.  If that doesn't happen then just strcpy to
+	   finish.  */
+# ifdef USE_AS_WCSCPY
+	rep	stosl
+# else
+	rep	stosb
+# endif
+	movq	%r8, %rdi
+	jmp	OVERFLOW_STRCPY
+END(STRNCPY)
+#endif
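
For reference, the copy-then-zero-fill behavior that the zfill paths
above implement is the usual strncpy contract; a minimal plain-C
sketch of that contract (an illustration only, not the generic glibc
implementation; the helper name is made up for this sketch):

  #include <stddef.h>

  /* Reference sketch of strncpy semantics: copy at most N bytes and
     zero-fill the rest of the buffer (the "zfill" cases above).  */
  char *
  strncpy_ref (char *dst, const char *src, size_t n)
  {
    size_t i = 0;
    for (; i < n && src[i] != '\0'; i++)
      dst[i] = src[i];
    for (; i < n; i++)
      dst[i] = '\0';
    return dst;
  }
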
diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
index dca1089060..01bead1435 100644
--- a/sysdeps/x86_64/multiarch/x86-avx-vecs.h
+++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
@@ -27,7 +27,10 @@
 #define VEC_SIZE			32
 #include "x86-vec-macros.h"
 
-#define USE_WITH_AVX		1
+#ifndef USE_WITH_AVX2
+# define USE_WITH_AVX		1
+#endif
+
 #define SECTION(p)			p##.avx
 
 /* 4-byte mov instructions with AVX2.  */
diff --git a/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
new file mode 100644
index 0000000000..a5966701ec
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
@@ -0,0 +1,26 @@
+/* Common config for AVX2-RTM VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX2_RTM_VECS_H
+#define _X86_AVX2_RTM_VECS_H			1
+
+#define USE_WITH_AVX2		1
+#include "x86-avx-rtm-vecs.h"
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-avx2-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
new file mode 100644
index 0000000000..16d7ae5147
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
@@ -0,0 +1,27 @@
+/* Common config for AVX2 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX2_VECS_H
+#define _X86_AVX2_VECS_H			1
+
+#define USE_WITH_AVX2		1
+
+#include "x86-avx-vecs.h"
+
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v3 4/5] x86: Add evex optimized functions for the wchar_t strcpy family
  2022-11-04 20:13 ` [PATCH v3 1/5] " Noah Goldstein
  2022-11-04 20:13   ` [PATCH v3 2/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
  2022-11-04 20:13   ` [PATCH v3 3/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
@ 2022-11-04 20:13   ` Noah Goldstein
  2022-11-04 20:13   ` [PATCH v3 5/5] x86: Add avx2 " Noah Goldstein
  2022-11-04 21:01   ` [PATCH v3 1/5] benchtests: Make str{n}{cat|cpy} benchmarks output json H.J. Lu
  4 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 20:13 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Implemented:
    wcscat-evex  (+ 905 bytes)
    wcscpy-evex  (+ 674 bytes)
    wcpcpy-evex  (+ 709 bytes)
    wcsncpy-evex (+1358 bytes)
    wcpncpy-evex (+1467 bytes)
    wcsncat-evex (+1213 bytes)

Performance Changes:
    Times are from N = 10 runs of the benchmark suite and are reported
    as the geometric mean of all ratios of New Implementation / Best
    Old Implementation. The Best Old Implementation was taken as the
    previously best implementation at the highest ISA level (a sketch
    of the ratio computation follows the results below).

    wcscat-evex     -> 0.991
    wcscpy-evex     -> 0.587
    wcpcpy-evex     -> 0.695
    wcsncpy-evex    -> 0.719
    wcpncpy-evex    -> 0.694
    wcsncat-evex    -> 0.979
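
    A sketch of how those per-benchmark ratios reduce to the single
    numbers above (an illustration only; the measurement scripts
    themselves are not part of this patch):

      #include <math.h>
      #include <stddef.h>

      /* Geometric mean of the per-benchmark new/old timing ratios.  */
      static double
      geomean (const double *ratios, size_t n)
      {
        double log_sum = 0.0;
        for (size_t i = 0; i < n; i++)
          log_sum += log (ratios[i]);
        return exp (log_sum / n);
      }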

Code Size Changes:
    This change increases the size of libc.so by ~6.3kb. For
    reference, the patch optimizing the normal strcpy family functions
    decreases libc.so by ~5.7kb.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
 sysdeps/x86_64/Makefile                    |  5 ++
 sysdeps/x86_64/multiarch/Makefile          | 14 ++++-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 69 +++++++++++++++++++++-
 sysdeps/x86_64/multiarch/ifunc-wcs.h       | 49 +++++++++++++++
 sysdeps/x86_64/multiarch/wcpcpy-evex.S     |  8 +++
 sysdeps/x86_64/multiarch/wcpcpy-generic.c  | 27 +++++++++
 sysdeps/x86_64/multiarch/wcpcpy.c          | 37 ++++++++++++
 sysdeps/x86_64/multiarch/wcpncpy-evex.S    |  8 +++
 sysdeps/x86_64/multiarch/wcpncpy-generic.c | 27 +++++++++
 sysdeps/x86_64/multiarch/wcpncpy.c         | 37 ++++++++++++
 sysdeps/x86_64/multiarch/wcscat-evex.S     |  9 +++
 sysdeps/x86_64/multiarch/wcscat-generic.c  | 27 +++++++++
 sysdeps/x86_64/multiarch/wcscat.c          | 37 ++++++++++++
 sysdeps/x86_64/multiarch/wcscpy-evex.S     |  7 +++
 sysdeps/x86_64/multiarch/wcscpy-generic.c  |  3 +-
 sysdeps/x86_64/multiarch/wcscpy.c          | 12 ++++
 sysdeps/x86_64/multiarch/wcsncat-evex.S    |  9 +++
 sysdeps/x86_64/multiarch/wcsncat-generic.c | 27 +++++++++
 sysdeps/x86_64/multiarch/wcsncat.c         | 34 +++++++++++
 sysdeps/x86_64/multiarch/wcsncpy-evex.S    |  7 +++
 sysdeps/x86_64/multiarch/wcsncpy-generic.c | 27 +++++++++
 sysdeps/x86_64/multiarch/wcsncpy.c         | 37 ++++++++++++
 sysdeps/x86_64/wcpcpy-generic.c            | 31 ++++++++++
 sysdeps/x86_64/wcpcpy.S                    | 40 +++++++++++++
 sysdeps/x86_64/wcpncpy-generic.c           | 31 ++++++++++
 sysdeps/x86_64/wcpncpy.S                   | 40 +++++++++++++
 sysdeps/x86_64/wcscat-generic.c            | 31 ++++++++++
 sysdeps/x86_64/wcscat.S                    | 40 +++++++++++++
 sysdeps/x86_64/wcscpy.S                    |  1 +
 sysdeps/x86_64/wcsncat-generic.c           | 31 ++++++++++
 sysdeps/x86_64/wcsncat.S                   | 38 ++++++++++++
 sysdeps/x86_64/wcsncpy-generic.c           | 31 ++++++++++
 sysdeps/x86_64/wcsncpy.S                   | 40 +++++++++++++
 33 files changed, 865 insertions(+), 6 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcs.h
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscat.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy.c
 create mode 100644 sysdeps/x86_64/wcpcpy-generic.c
 create mode 100644 sysdeps/x86_64/wcpcpy.S
 create mode 100644 sysdeps/x86_64/wcpncpy-generic.c
 create mode 100644 sysdeps/x86_64/wcpncpy.S
 create mode 100644 sysdeps/x86_64/wcscat-generic.c
 create mode 100644 sysdeps/x86_64/wcscat.S
 create mode 100644 sysdeps/x86_64/wcsncat-generic.c
 create mode 100644 sysdeps/x86_64/wcsncat.S
 create mode 100644 sysdeps/x86_64/wcsncpy-generic.c
 create mode 100644 sysdeps/x86_64/wcsncpy.S

diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 3627c5659f..688eb2d7c4 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -188,8 +188,13 @@ endif
 ifeq ($(subdir),wcsmbs)
 
 sysdep_routines += \
+  wcpcpy-generic \
+  wcpncpy-generic \
+  wcscat-generic \
   wcscpy-generic \
+  wcsncat-generic \
   wcsncmp-generic \
+  wcsncpy-generic \
   wcsnlen-generic \
 # sysdep_routines
 
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 066bfa48d9..d6e01940c3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -131,6 +131,12 @@ endif
 
 ifeq ($(subdir),wcsmbs)
 sysdep_routines += \
+  wcpcpy-evex \
+  wcpcpy-generic \
+  wcpncpy-evex \
+  wcpncpy-generic \
+  wcscat-evex \
+  wcscat-generic \
   wcschr-avx2 \
   wcschr-avx2-rtm \
   wcschr-evex \
@@ -140,6 +146,8 @@ sysdep_routines += \
   wcscmp-avx2-rtm \
   wcscmp-evex \
   wcscmp-sse2 \
+  wcscpy-evex \
+  wcscpy-generic \
   wcscpy-ssse3 \
   wcslen-avx2 \
   wcslen-avx2-rtm \
@@ -147,9 +155,13 @@ sysdep_routines += \
   wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
+  wcsncat-evex \
+  wcsncat-generic \
   wcsncmp-avx2 \
   wcsncmp-avx2-rtm \
   wcsncmp-evex \
+  wcsncpy-evex \
+  wcsncpy-generic \
   wcsnlen-avx2 \
   wcsnlen-avx2-rtm \
   wcsnlen-evex \
@@ -163,8 +175,8 @@ sysdep_routines += \
   wmemchr-avx2 \
   wmemchr-avx2-rtm \
   wmemchr-evex \
-  wmemchr-evex512 \
   wmemchr-evex-rtm \
+  wmemchr-evex512 \
   wmemchr-sse2 \
   wmemcmp-avx2-movbe \
   wmemcmp-avx2-movbe-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7cebee7ec7..959cb0b420 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -901,16 +901,79 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/wcscpy.c.  */
   IFUNC_IMPL (i, name, wcscpy,
-	      /* ISA V4 wrapper for SSSE3 implementation because
-	         the SSSE3 implementation is also used at ISA
-	         level 3/4.  */
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __wcscpy_ssse3)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
 				     1,
 				     __wcscpy_generic))
 
+  /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
+  IFUNC_IMPL (i, name, wcsncpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncpy,
+				     1,
+				     __wcsncpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcpcpy.c.  */
+  IFUNC_IMPL (i, name, wcpcpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpcpy_evex)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcpcpy,
+				     1,
+				     __wcpcpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcpncpy.c.  */
+  IFUNC_IMPL (i, name, wcpncpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcpncpy,
+				     1,
+				     __wcpncpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcscat.c.  */
+  IFUNC_IMPL (i, name, wcscat,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscat_evex)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscat,
+				     1,
+				     __wcscat_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcsncat.c.  */
+  IFUNC_IMPL (i, name, wcsncat,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncat_evex)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncat,
+				     1,
+				     __wcsncat_generic))
+
   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
   IFUNC_IMPL (i, name, wcslen,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcslen,
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
new file mode 100644
index 0000000000..da6e1b03d0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
@@ -0,0 +1,49 @@
+/* Common definition for ifunc selection of optimized wide-character
+   string copy functions.
+
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+#ifndef GENERIC
+# define GENERIC generic
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				      AVX_Fast_Unaligned_Load, ))
+    {
+      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+    }
+
+  return OPTIMIZE (GENERIC);
+}
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-evex.S b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
new file mode 100644
index 0000000000..ac6429cc07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
@@ -0,0 +1,8 @@
+#ifndef WCPCPY
+# define WCPCPY	__wcpcpy_evex
+#endif
+
+#define USE_AS_STPCPY
+#define USE_AS_WCSCPY
+#define STRCPY	WCPCPY
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
new file mode 100644
index 0000000000..6039196a3e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
@@ -0,0 +1,27 @@
+/* wcpcpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as strspn-sse4 needs to
+   be able to fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCPCPY __wcpcpy_generic
+# include <wcsmbs/wcpcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpcpy.c b/sysdeps/x86_64/multiarch/wcpcpy.c
new file mode 100644
index 0000000000..8f96ddbc99
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcpcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcpcpy __redirect_wcpcpy
+# include <wchar.h>
+# undef __wcpcpy
+
+# define SYMBOL_NAME wcpcpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcpcpy, __wcpcpy, IFUNC_SELECTOR ());
+weak_alias (__wcpcpy, wcpcpy)
+# ifdef SHARED
+__hidden_ver1 (__wcpcpy, __GI___wcpcpy, __redirect_wcpcpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpcpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-evex.S b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
new file mode 100644
index 0000000000..62ddb694fe
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
@@ -0,0 +1,8 @@
+#ifndef WCPNCPY
+# define WCPNCPY	__wcpncpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STPCPY
+#define STRNCPY	WCPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
new file mode 100644
index 0000000000..de8d34320e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
@@ -0,0 +1,27 @@
+/* wcpncpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as strspn-sse4 needs to
+   be able to fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCPNCPY __wcpncpy_generic
+# include <wcsmbs/wcpncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpncpy.c b/sysdeps/x86_64/multiarch/wcpncpy.c
new file mode 100644
index 0000000000..ed8f307e07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcpncpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcpncpy __redirect_wcpncpy
+# include <wchar.h>
+# undef __wcpncpy
+
+# define SYMBOL_NAME wcpncpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcpncpy, __wcpncpy, IFUNC_SELECTOR ());
+weak_alias (__wcpncpy, wcpncpy)
+# ifdef SHARED
+__hidden_ver1 (__wcpncpy, __GI___wcpncpy, __redirect_wcpncpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpncpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscat-evex.S b/sysdeps/x86_64/multiarch/wcscat-evex.S
new file mode 100644
index 0000000000..1d017e4899
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-evex.S
@@ -0,0 +1,9 @@
+#ifndef WCSCAT
+# define WCSCAT	__wcscat_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRCPY	WCSCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
new file mode 100644
index 0000000000..d86b4d5c00
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
@@ -0,0 +1,27 @@
+/* wcscat.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as strspn-sse4 needs to
+   be able to fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCSCAT __wcscat_generic
+# include <wcsmbs/wcscat.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscat.c b/sysdeps/x86_64/multiarch/wcscat.c
new file mode 100644
index 0000000000..3277c44561
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcscat.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcscat __redirect_wcscat
+# include <wchar.h>
+# undef __wcscat
+
+# define SYMBOL_NAME wcscat
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcscat, __wcscat, IFUNC_SELECTOR ());
+weak_alias (__wcscat, wcscat)
+# ifdef SHARED
+__hidden_ver1 (__wcscat, __GI___wcscat, __redirect_wcscat)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcscat);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscpy-evex.S b/sysdeps/x86_64/multiarch/wcscpy-evex.S
new file mode 100644
index 0000000000..1069a8e224
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-evex.S
@@ -0,0 +1,7 @@
+#ifndef WCSCPY
+# define WCSCPY	__wcscpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define STRCPY	WCSCPY
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
index 93d314aaad..4a1fffae4b 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
@@ -18,8 +18,7 @@
 
 
 #include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (1)
+#if ISA_SHOULD_BUILD (3)
 
 # define WCSCPY  __wcscpy_generic
 # include <wcsmbs/wcscpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
index 92c917b6b4..efe32e505f 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.c
+++ b/sysdeps/x86_64/multiarch/wcscpy.c
@@ -26,6 +26,8 @@
 # define SYMBOL_NAME wcscpy
 # include <init-arch.h>
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
@@ -35,6 +37,16 @@ IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load, ))
+    {
+      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+    }
+
   if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
     return OPTIMIZE (ssse3);
 
diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S
new file mode 100644
index 0000000000..392215950a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S
@@ -0,0 +1,9 @@
+#ifndef WCSCAT
+# define WCSCAT	__wcsncat_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRNCAT	WCSCAT
+#include "strncat-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
new file mode 100644
index 0000000000..4b55cb40bc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
@@ -0,0 +1,27 @@
+/* wcsncat.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as strspn-sse4 needs to
+   be able to fallback to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCSNCAT __wcsncat_generic
+# include <wcsmbs/wcsncat.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncat.c b/sysdeps/x86_64/multiarch/wcsncat.c
new file mode 100644
index 0000000000..49c46aef08
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat.c
@@ -0,0 +1,34 @@
+/* Multiple versions of wcsncat.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define wcsncat __redirect_wcsncat
+# include <wchar.h>
+# undef wcsncat
+
+# define SYMBOL_NAME wcsncat
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcsncat, wcsncat, IFUNC_SELECTOR ());
+# ifdef SHARED
+__hidden_ver1 (wcsncat, __GI_wcsncat, __redirect_wcsncat)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncat);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-evex.S b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
new file mode 100644
index 0000000000..2debb8fd6b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
@@ -0,0 +1,7 @@
+#ifndef WCSNCPY
+# define WCSNCPY	__wcsncpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define STRNCPY	WCSNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
new file mode 100644
index 0000000000..d0e8a86605
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
@@ -0,0 +1,27 @@
+/* wcsncpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as strspn-sse4 needs to
+   be able to fallback to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCSNCPY __wcsncpy_generic
+# include <wcsmbs/wcsncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncpy.c b/sysdeps/x86_64/multiarch/wcsncpy.c
new file mode 100644
index 0000000000..5b89dd4d27
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcsncpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcsncpy __redirect_wcsncpy
+# include <wchar.h>
+# undef __wcsncpy
+
+# define SYMBOL_NAME wcsncpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcsncpy, __wcsncpy, IFUNC_SELECTOR ());
+weak_alias (__wcsncpy, wcsncpy)
+# ifdef SHARED
+__hidden_ver1 (__wcsncpy, __GI___wcsncpy, __redirect_wcsncpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
new file mode 100644
index 0000000000..d52525f288
--- /dev/null
+++ b/sysdeps/x86_64/wcpcpy-generic.c
@@ -0,0 +1,31 @@
+/* wcpcpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpcpy non-multiarch build is split into two files,
+   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcpcpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcpcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
new file mode 100644
index 0000000000..97e9207c16
--- /dev/null
+++ b/sysdeps/x86_64/wcpcpy.S
@@ -0,0 +1,40 @@
+/* wcpcpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpcpy non-multiarch build is split into two files,
+   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcpcpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCPCPY	__wcpcpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcpcpy-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcpcpy, wcpcpy)
+libc_hidden_def (__wcpcpy)
+#endif
diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
new file mode 100644
index 0000000000..871219a445
--- /dev/null
+++ b/sysdeps/x86_64/wcpncpy-generic.c
@@ -0,0 +1,31 @@
+/* wcpncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpncpy non-multiarch build is split into two files,
+   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcpncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcpncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
new file mode 100644
index 0000000000..2169ed5545
--- /dev/null
+++ b/sysdeps/x86_64/wcpncpy.S
@@ -0,0 +1,40 @@
+/* wcpncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpncpy non-multiarch build is split into two files,
+   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcpncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCPNCPY	__wcpncpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcpncpy-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcpncpy, wcpncpy)
+libc_hidden_def (__wcpncpy)
+#endif
diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
new file mode 100644
index 0000000000..85f981a81f
--- /dev/null
+++ b/sysdeps/x86_64/wcscat-generic.c
@@ -0,0 +1,31 @@
+/* wcscat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcscat non-multiarch build is split into two files,
+   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcscat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcscat.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
new file mode 100644
index 0000000000..8432087c7c
--- /dev/null
+++ b/sysdeps/x86_64/wcscat.S
@@ -0,0 +1,40 @@
+/* wcscat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcscat non-multiarch build is split into two files,
+   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcscat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSCAT	__wcscat
+
+# define DEFAULT_IMPL_V4	"multiarch/wcscat-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcscat, wcscat)
+libc_hidden_def (__wcscat)
+#endif
diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
index 11d0bb4bab..ff8bdd3aea 100644
--- a/sysdeps/x86_64/wcscpy.S
+++ b/sysdeps/x86_64/wcscpy.S
@@ -28,6 +28,7 @@
 
 # define WCSCPY	__wcscpy
 
+# define DEFAULT_IMPL_V4	"multiarch/wcscpy-evex.S"
 # define DEFAULT_IMPL_V2	"multiarch/wcscpy-ssse3.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
new file mode 100644
index 0000000000..2cc0f7b11a
--- /dev/null
+++ b/sysdeps/x86_64/wcsncat-generic.c
@@ -0,0 +1,31 @@
+/* wcsncat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncat non-multiarch build is split into two files,
+   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcsncat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcsncat.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
new file mode 100644
index 0000000000..64e144a9c7
--- /dev/null
+++ b/sysdeps/x86_64/wcsncat.S
@@ -0,0 +1,38 @@
+/* wcsncat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncat non-multiarch build is split into two files,
+   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcsncat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSNCAT	wcsncat
+
+# define DEFAULT_IMPL_V4	"multiarch/wcsncat-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+#endif
diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
new file mode 100644
index 0000000000..49d06b8ae8
--- /dev/null
+++ b/sysdeps/x86_64/wcsncpy-generic.c
@@ -0,0 +1,31 @@
+/* wcsncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncpy non-multiarch build is split into two files,
+   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcsncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcsncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
new file mode 100644
index 0000000000..1450c1aa28
--- /dev/null
+++ b/sysdeps/x86_64/wcsncpy.S
@@ -0,0 +1,40 @@
+/* wcsncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncpy non-multiarch build is split into two files,
+   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcsncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSNCPY	__wcsncpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcsncpy-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcsncpy, wcsncpy)
+libc_hidden_def (__wcsncpy)
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v3 5/5] x86: Add avx2 optimized functions for the wchar_t strcpy family
  2022-11-04 20:13 ` [PATCH v3 1/5] " Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-11-04 20:13   ` [PATCH v3 4/5] x86: Add evex optimized functions for the wchar_t strcpy family Noah Goldstein
@ 2022-11-04 20:13   ` Noah Goldstein
  2022-11-04 21:01   ` [PATCH v3 1/5] benchtests: Make str{n}{cat|cpy} benchmarks output json H.J. Lu
  4 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 20:13 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Implemented:
    wcscat-avx2{+rtm}  (+ 744 * 2 bytes)
    wcscpy-avx2{+rtm}  (+ 539 * 2 bytes)
    wcpcpy-avx2{+rtm}  (+ 577 * 2 bytes)
    wcsncpy-avx2{+rtm} (+1108 * 2 bytes)
    wcpncpy-avx2{+rtm} (+1214 * 2 bytes)
    wcsncat-avx2{+rtm} (+1085 * 2 bytes)

Performance Changes:
    Times are from N = 10 runs of the benchmark suite and are reported
    as the geometric mean of all ratios of New Implementation / Best Old
    Implementation, where the Best Old Implementation is the existing
    implementation at the highest supported ISA level.

    wcscat-avx2     -> 0.975
    wcscpy-avx2     -> 0.591
    wcpcpy-avx2     -> 0.698
    wcsncpy-avx2    -> 0.730
    wcpncpy-avx2    -> 0.711
    wcsncat-avx2    -> 0.954
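
    The ratios above are aggregated as a plain geometric mean.  As an
    illustrative sketch only (the timing arrays below are hypothetical
    and not data from this patch), the computation amounts to:

      #include <math.h>
      #include <stddef.h>
      #include <stdio.h>

      /* Geometric mean of new_times[i] / old_times[i] over n benchmark
         configurations, accumulated in log space so the running product
         cannot overflow or underflow.  */
      static double
      geomean_of_ratios (const double *new_times, const double *old_times,
                         size_t n)
      {
        double log_sum = 0.0;
        for (size_t i = 0; i < n; i++)
          log_sum += log (new_times[i] / old_times[i]);
        return exp (log_sum / (double) n);
      }

      int
      main (void)
      {
        /* Hypothetical timings for three benchmark configurations.  */
        double new_times[] = { 10.0, 12.5, 9.0 };
        double old_times[] = { 17.0, 17.0, 13.0 };
        printf ("%.3f\n", geomean_of_ratios (new_times, old_times, 3));
        return 0;
      }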

Code Size Changes:
    This change increases the size of libc.so by ~11kb. For reference,
    the patch optimizing the normal strcpy family functions decreases
    libc.so by ~5.2kb.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
 sysdeps/x86_64/multiarch/Makefile           | 12 ++++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c  | 66 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/ifunc-wcs.h        | 11 ++++
 sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S  |  3 +
 sysdeps/x86_64/multiarch/wcpcpy-avx2.S      |  8 +++
 sysdeps/x86_64/multiarch/wcpcpy-generic.c   |  2 +-
 sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S |  3 +
 sysdeps/x86_64/multiarch/wcpncpy-avx2.S     |  8 +++
 sysdeps/x86_64/multiarch/wcpncpy-generic.c  |  2 +-
 sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S  |  3 +
 sysdeps/x86_64/multiarch/wcscat-avx2.S      | 10 ++++
 sysdeps/x86_64/multiarch/wcscat-generic.c   |  2 +-
 sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S  |  3 +
 sysdeps/x86_64/multiarch/wcscpy-avx2.S      |  7 +++
 sysdeps/x86_64/multiarch/wcscpy-generic.c   |  2 +-
 sysdeps/x86_64/multiarch/wcscpy.c           |  9 +++
 sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S |  3 +
 sysdeps/x86_64/multiarch/wcsncat-avx2.S     |  9 +++
 sysdeps/x86_64/multiarch/wcsncat-generic.c  |  2 +-
 sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S |  3 +
 sysdeps/x86_64/multiarch/wcsncpy-avx2.S     |  7 +++
 sysdeps/x86_64/multiarch/wcsncpy-generic.c  |  2 +-
 sysdeps/x86_64/wcpcpy-generic.c             |  2 +-
 sysdeps/x86_64/wcpcpy.S                     |  3 +-
 sysdeps/x86_64/wcpncpy-generic.c            |  2 +-
 sysdeps/x86_64/wcpncpy.S                    |  3 +-
 sysdeps/x86_64/wcscat-generic.c             |  2 +-
 sysdeps/x86_64/wcscat.S                     |  3 +-
 sysdeps/x86_64/wcscpy.S                     |  1 +
 sysdeps/x86_64/wcsncat-generic.c            |  2 +-
 sysdeps/x86_64/wcsncat.S                    |  3 +-
 sysdeps/x86_64/wcsncpy-generic.c            |  2 +-
 sysdeps/x86_64/wcsncpy.S                    |  3 +-
 33 files changed, 187 insertions(+), 16 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d6e01940c3..f848fc0e28 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -131,10 +131,16 @@ endif
 
 ifeq ($(subdir),wcsmbs)
 sysdep_routines += \
+  wcpcpy-avx2 \
+  wcpcpy-avx2-rtm \
   wcpcpy-evex \
   wcpcpy-generic \
+  wcpncpy-avx2 \
+  wcpncpy-avx2-rtm \
   wcpncpy-evex \
   wcpncpy-generic \
+  wcscat-avx2 \
+  wcscat-avx2-rtm \
   wcscat-evex \
   wcscat-generic \
   wcschr-avx2 \
@@ -146,6 +152,8 @@ sysdep_routines += \
   wcscmp-avx2-rtm \
   wcscmp-evex \
   wcscmp-sse2 \
+  wcscpy-avx2 \
+  wcscpy-avx2-rtm \
   wcscpy-evex \
   wcscpy-generic \
   wcscpy-ssse3 \
@@ -155,11 +163,15 @@ sysdep_routines += \
   wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
+  wcsncat-avx2 \
+  wcsncat-avx2-rtm \
   wcsncat-evex \
   wcsncat-generic \
   wcsncmp-avx2 \
   wcsncmp-avx2-rtm \
   wcsncmp-evex \
+  wcsncpy-avx2 \
+  wcsncpy-avx2-rtm \
   wcsncpy-evex \
   wcsncpy-generic \
   wcsnlen-avx2 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 959cb0b420..71e8953e91 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -908,6 +908,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcscpy_evex)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcscpy_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcscpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __wcscpy_ssse3)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
@@ -922,6 +933,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI1)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcsncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcsncpy_avx2_rtm)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcpncpy,
 				     1,
 				     __wcsncpy_generic))
@@ -934,6 +956,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI1)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcpcpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpcpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcpcpy_avx2_rtm)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcpcpy,
 				     1,
 				     __wcpcpy_generic))
@@ -946,6 +979,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI1)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcpncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpncpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcpncpy_avx2_rtm)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncpy,
 				     1,
 				     __wcpncpy_generic))
@@ -958,6 +1002,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI1)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcscat_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscat_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcscat_avx2_rtm)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscat,
 				     1,
 				     __wcscat_generic))
@@ -970,6 +1025,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI1)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcsncat_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncat_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcsncat_avx2_rtm)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncat,
 				     1,
 				     __wcsncat_generic))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
index da6e1b03d0..cda633d8fb 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wcs.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
@@ -27,6 +27,9 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+
 extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
 
 static inline void *
@@ -43,6 +46,14 @@ IFUNC_SELECTOR (void)
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				       Prefer_No_VZEROUPPER, !))
+	return OPTIMIZE (avx2);
+
     }
 
   return OPTIMIZE (GENERIC);
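
(For illustration only; not part of the patch.  The selection order the
updated IFUNC_SELECTOR above ends up implementing amounts to the sketch
below.  The boolean parameters are stand-ins for the
X86_ISA_CPU_FEATURE_USABLE_P / X86_ISA_CPU_FEATURES_ARCH_P checks used by
the real selector.)

  enum wcs_impl { WCS_GENERIC, WCS_AVX2, WCS_AVX2_RTM, WCS_EVEX };

  /* Dispatch order: EVEX when AVX512VL/AVX512BW are usable, otherwise
     AVX2+RTM when RTM is usable, otherwise plain AVX2 unless VZEROUPPER
     should be avoided, otherwise the generic C implementation.  */
  static enum wcs_impl
  select_wcs_impl (int avx2_usable, int bmi1_usable, int bmi2_usable,
                   int avx_fast_unaligned, int avx512vl_usable,
                   int avx512bw_usable, int rtm_usable,
                   int prefer_no_vzeroupper)
  {
    if (avx2_usable && bmi1_usable && bmi2_usable && avx_fast_unaligned)
      {
        if (avx512vl_usable && avx512bw_usable)
          return WCS_EVEX;
        if (rtm_usable)
          return WCS_AVX2_RTM;
        if (!prefer_no_vzeroupper)
          return WCS_AVX2;
      }
    return WCS_GENERIC;
  }
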
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
new file mode 100644
index 0000000000..756280a3ab
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCPCPY	__wcpcpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcpcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
new file mode 100644
index 0000000000..0fffd912d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
@@ -0,0 +1,8 @@
+#ifndef WCPCPY
+# define WCPCPY	__wcpcpy_avx2
+#endif
+
+#define USE_AS_STPCPY
+#define USE_AS_WCSCPY
+#define STRCPY	WCPCPY
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
index 6039196a3e..0ba29b081f 100644
--- a/sysdeps/x86_64/multiarch/wcpcpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCPCPY __wcpcpy_generic
 # include <wcsmbs/wcpcpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
new file mode 100644
index 0000000000..80600d6b01
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCPNCPY	__wcpncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcpncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
new file mode 100644
index 0000000000..b7e594f7b7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
@@ -0,0 +1,8 @@
+#ifndef WCPNCPY
+# define WCPNCPY	__wcpncpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STPCPY
+#define STRNCPY	WCPNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
index de8d34320e..4aab4ecdd2 100644
--- a/sysdeps/x86_64/multiarch/wcpncpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCPNCPY __wcpncpy_generic
 # include <wcsmbs/wcpncpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
new file mode 100644
index 0000000000..e99449a2dc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSCAT	__wcscat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcscat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2.S b/sysdeps/x86_64/multiarch/wcscat-avx2.S
new file mode 100644
index 0000000000..a20f23c09d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-avx2.S
@@ -0,0 +1,10 @@
+#ifndef WCSCAT
+# define WCSCAT	__wcscat_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRCPY	WCSCAT
+
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
index d86b4d5c00..6476f85bbb 100644
--- a/sysdeps/x86_64/multiarch/wcscat-generic.c
+++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSCAT __wcscat_generic
 # include <wcsmbs/wcscat.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
new file mode 100644
index 0000000000..2f800c8d3e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSCPY	__wcscpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcscpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2.S b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
new file mode 100644
index 0000000000..6bc509da07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
@@ -0,0 +1,7 @@
+#ifndef WCSCPY
+# define WCSCPY	__wcscpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define STRCPY	WCSCPY
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
index 4a1fffae4b..600d606c45 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
@@ -18,7 +18,7 @@
 
 
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSCPY  __wcscpy_generic
 # include <wcsmbs/wcscpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
index efe32e505f..7f6387817b 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.c
+++ b/sysdeps/x86_64/multiarch/wcscpy.c
@@ -28,6 +28,9 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
@@ -45,6 +48,12 @@ IFUNC_SELECTOR (void)
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !))
+	return OPTIMIZE (avx2);
     }
 
   if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
new file mode 100644
index 0000000000..609d6e69c0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSNCAT	__wcsncat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcsncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2.S b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
new file mode 100644
index 0000000000..a72105b7e9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
@@ -0,0 +1,9 @@
+#ifndef WCSNCAT
+# define WCSNCAT	__wcsncat_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRNCAT	WCSNCAT
+#include "strncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
index 4b55cb40bc..9ced02b35e 100644
--- a/sysdeps/x86_64/multiarch/wcsncat-generic.c
+++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSNCAT __wcsncat_generic
 # include <wcsmbs/wcsncat.c>
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
new file mode 100644
index 0000000000..cab5a6b820
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSNCPY	__wcsncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcsncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
new file mode 100644
index 0000000000..3a1a8a372c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
@@ -0,0 +1,7 @@
+#ifndef WCSNCPY
+# define WCSNCPY	__wcsncpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define STRNCPY	WCSNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
index d0e8a86605..693521713b 100644
--- a/sysdeps/x86_64/multiarch/wcsncpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSNCPY __wcsncpy_generic
 # include <wcsmbs/wcsncpy.c>
diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
index d52525f288..2e4d69a500 100644
--- a/sysdeps/x86_64/wcpcpy-generic.c
+++ b/sysdeps/x86_64/wcpcpy-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcpcpy.c>
 
diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
index 97e9207c16..cfde4309fe 100644
--- a/sysdeps/x86_64/wcpcpy.S
+++ b/sysdeps/x86_64/wcpcpy.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCPCPY	__wcpcpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcpcpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcpcpy-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
index 871219a445..1f12a0e4c6 100644
--- a/sysdeps/x86_64/wcpncpy-generic.c
+++ b/sysdeps/x86_64/wcpncpy-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcpncpy.c>
 
diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
index 2169ed5545..2f89482d30 100644
--- a/sysdeps/x86_64/wcpncpy.S
+++ b/sysdeps/x86_64/wcpncpy.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCPNCPY	__wcpncpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcpncpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcpncpy-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
index 85f981a81f..3552167ebe 100644
--- a/sysdeps/x86_64/wcscat-generic.c
+++ b/sysdeps/x86_64/wcscat-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcscat.c>
 
diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
index 8432087c7c..2e59987e76 100644
--- a/sysdeps/x86_64/wcscat.S
+++ b/sysdeps/x86_64/wcscat.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCSCAT	__wcscat
 
 # define DEFAULT_IMPL_V4	"multiarch/wcscat-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcscat-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
index ff8bdd3aea..ab9288ed74 100644
--- a/sysdeps/x86_64/wcscpy.S
+++ b/sysdeps/x86_64/wcscpy.S
@@ -29,6 +29,7 @@
 # define WCSCPY	__wcscpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcscpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcscpy-avx2.S"
 # define DEFAULT_IMPL_V2	"multiarch/wcscpy-ssse3.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
index 2cc0f7b11a..47f6a8ad56 100644
--- a/sysdeps/x86_64/wcsncat-generic.c
+++ b/sysdeps/x86_64/wcsncat-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcsncat.c>
 
diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
index 64e144a9c7..9a55499131 100644
--- a/sysdeps/x86_64/wcsncat.S
+++ b/sysdeps/x86_64/wcsncat.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCSNCAT	wcsncat
 
 # define DEFAULT_IMPL_V4	"multiarch/wcsncat-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcsncat-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
index 49d06b8ae8..7f19fcaddc 100644
--- a/sysdeps/x86_64/wcsncpy-generic.c
+++ b/sysdeps/x86_64/wcsncpy-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcsncpy.c>
 
diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
index 1450c1aa28..dc44b32395 100644
--- a/sysdeps/x86_64/wcsncpy.S
+++ b/sysdeps/x86_64/wcsncpy.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCSNCPY	__wcsncpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcsncpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcsncpy-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-04 16:33     ` H.J. Lu
@ 2022-11-04 20:20       ` Noah Goldstein
  0 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 20:20 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Nov 4, 2022 at 9:34 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Nov 4, 2022 at 1:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Optimizations are:
> >     1. Use more overlapping stores to avoid branches.
> >     2. Reduce how unrolled the aligning copies are (this is more of a
> >        code-size save; it's a negative for some sizes in terms of
> >        perf).
> >     3. Improve the loop a bit (similar to what we do in strlen with
> >        2x vpminu + kortest instead of 3x vpminu + kmov + test).
> >     4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
> >        number that are taken.
> >
> > Performance Changes:
> >
> >     Times are from N = 10 runs of the benchmark suite and are
> >     reported as geometric mean of all ratios of
> >     New Implementation / Old Implementation.
> >
> >     stpcpy-evex      -> 0.922
> >     strcat-evex      -> 0.985
> >     strcpy-evex      -> 0.880
> >
> >     strncpy-evex     -> 0.831
> >     stpncpy-evex     -> 0.780
> >
> >     strncat-evex     -> 0.958
> >
> > Code Size Changes:
> >     function         -> Bytes New / Bytes Old -> Ratio
> >
> >     strcat-evex      -> 819  / 1874 -> 0.437
> >     strcpy-evex      -> 700  / 1074 -> 0.652
> >     stpcpy-evex      -> 735  / 1094 -> 0.672
> >
> >     strncpy-evex     -> 1397 / 2611 -> 0.535
> >     stpncpy-evex     -> 1489 / 2691 -> 0.553
> >
> >     strncat-evex     -> 1184 / 2832 -> 0.418
> >
> > Notes:
> >     1. Because of the significant difference between the
> >        implementations they are split into three files.
> >
> >            strcpy-evex.S    -> strcpy, stpcpy, strcat
> >            strncpy-evex.S   -> strncpy
> >            strncat-evex.S   -> strncat
> >
> >        I couldn't find a way to merge them without making the
> >        ifdefs incredibly difficult to follow.
> >
> >     2. All implementations can be made evex512 by including
> >        "x86-evex512-vecs.h" at the top.
> >
> >     3. All implementations have an optional define:
> >         `USE_EVEX_MASKED_STORE`
> >        Setting to one uses evex-masked stores for handling short
> >        strings.  This saves code size and branches.  It's disabled
> >        for all implementations are the moment as there are some
> >        for all implementations at the moment as there are some
> >        that may be fixed on future architectures.
> >
> > Full check passes on x86-64 and build succeeds for all ISA levels w/
> > and w/o multiarch.
> > ---
> >  sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
> >  sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
> >  sysdeps/x86_64/multiarch/strcat-strlen-evex.S |   85 ++
> >  sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
> >  sysdeps/x86_64/multiarch/strncat-evex.S       |  525 ++++++-
> >  sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
> >  .../multiarch/strncpy-or-cat-overflow-def.h   |   65 +
> >  7 files changed, 2075 insertions(+), 1173 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> >
> > diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> > index 99ea76a372..3693491baa 100644
> > --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
> > +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> > @@ -3,6 +3,5 @@
> >  #endif
> >
> >  #define USE_AS_STPCPY
> > -#define USE_AS_STRNCPY
> > -#define STRCPY STPNCPY
> > -#include "strcpy-evex.S"
> > +#define STRNCPY        STPNCPY
> > +#include "strncpy-evex.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
> > index 0e2df947e9..b4207b7889 100644
> > --- a/sysdeps/x86_64/multiarch/strcat-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strcat-evex.S
> > @@ -1,286 +1,7 @@
> > -/* strcat with 256-bit EVEX instructions.
> > -   Copyright (C) 2021-2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -#include <isa-level.h>
> > -
> > -#if ISA_SHOULD_BUILD (4)
> > -
> > -
> > -# include <sysdep.h>
> > -
> > -# ifndef STRCAT
> > -#  define STRCAT  __strcat_evex
> > -# endif
> > -
> > -# define VMOVU         vmovdqu64
> > -# define VMOVA         vmovdqa64
> > -
> > -/* zero register */
> > -# define XMMZERO       xmm16
> > -# define YMMZERO       ymm16
> > -# define YMM0          ymm17
> > -# define YMM1          ymm18
> > -
> > -# define USE_AS_STRCAT
> > -
> > -/* Number of bytes in a vector register */
> > -# define VEC_SIZE      32
> > -
> > -       .section .text.evex,"ax",@progbits
> > -ENTRY (STRCAT)
> > -       mov     %rdi, %r9
> > -# ifdef USE_AS_STRNCAT
> > -       mov     %rdx, %r8
> > -# endif
> > -
> > -       xor     %eax, %eax
> > -       mov     %edi, %ecx
> > -       and     $((VEC_SIZE * 4) - 1), %ecx
> > -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> > -       cmp     $(VEC_SIZE * 3), %ecx
> > -       ja      L(fourth_vector_boundary)
> > -       vpcmpb  $0, (%rdi), %YMMZERO, %k0
> > -       kmovd   %k0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_first_vector)
> > -       mov     %rdi, %rax
> > -       and     $-VEC_SIZE, %rax
> > -       jmp     L(align_vec_size_start)
> > -L(fourth_vector_boundary):
> > -       mov     %rdi, %rax
> > -       and     $-VEC_SIZE, %rax
> > -       vpcmpb  $0, (%rax), %YMMZERO, %k0
> > -       mov     $-1, %r10d
> > -       sub     %rax, %rcx
> > -       shl     %cl, %r10d
> > -       kmovd   %k0, %edx
> > -       and     %r10d, %edx
> > -       jnz     L(exit)
> > -
> > -L(align_vec_size_start):
> > -       vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> > -       kmovd   %k0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_second_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > -       kmovd   %k1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_third_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > -       kmovd   %k2, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fourth_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > -       kmovd   %k3, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fifth_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > -       add     $(VEC_SIZE * 4), %rax
> > -       kmovd   %k4, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_second_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > -       kmovd   %k1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_third_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > -       kmovd   %k2, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fourth_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > -       kmovd   %k3, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fifth_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > -       kmovd   %k4, %edx
> > -       add     $(VEC_SIZE * 4), %rax
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_second_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > -       kmovd   %k1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_third_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > -       kmovd   %k2, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fourth_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > -       kmovd   %k3, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fifth_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > -       add     $(VEC_SIZE * 4), %rax
> > -       kmovd   %k4, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_second_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > -       kmovd   %k1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_third_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > -       kmovd   %k2, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fourth_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > -       kmovd   %k3, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fifth_vector)
> > -
> > -       test    $((VEC_SIZE * 4) - 1), %rax
> > -       jz      L(align_four_vec_loop)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > -       add     $(VEC_SIZE * 5), %rax
> > -       kmovd   %k4, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit)
> > -
> > -       test    $((VEC_SIZE * 4) - 1), %rax
> > -       jz      L(align_four_vec_loop)
> > -
> > -       vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> > -       add     $VEC_SIZE, %rax
> > -       kmovd   %k0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit)
> > -
> > -       test    $((VEC_SIZE * 4) - 1), %rax
> > -       jz      L(align_four_vec_loop)
> > -
> > -       vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> > -       add     $VEC_SIZE, %rax
> > -       kmovd   %k0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit)
> > -
> > -       test    $((VEC_SIZE * 4) - 1), %rax
> > -       jz      L(align_four_vec_loop)
> > -
> > -       vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k1
> > -       add     $VEC_SIZE, %rax
> > -       kmovd   %k1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit)
> > -
> > -       add     $VEC_SIZE, %rax
> > -
> > -       .p2align 4
> > -L(align_four_vec_loop):
> > -       VMOVA   (%rax), %YMM0
> > -       VMOVA   (VEC_SIZE * 2)(%rax), %YMM1
> > -       vpminub VEC_SIZE(%rax), %YMM0, %YMM0
> > -       vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
> > -       vpminub %YMM0, %YMM1, %YMM0
> > -       /* If K0 != 0, there is a null byte.  */
> > -       vpcmpb  $0, %YMM0, %YMMZERO, %k0
> > -       add     $(VEC_SIZE * 4), %rax
> > -       ktestd  %k0, %k0
> > -       jz      L(align_four_vec_loop)
> > -
> > -       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
> > -       sub     $(VEC_SIZE * 5), %rax
> > -       kmovd   %k0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_second_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > -       kmovd   %k1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_third_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > -       kmovd   %k2, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fourth_vector)
> > -
> > -       vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > -       kmovd   %k3, %edx
> > -       sub     %rdi, %rax
> > -       bsf     %rdx, %rdx
> > -       add     %rdx, %rax
> > -       add     $(VEC_SIZE * 4), %rax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit):
> > -       sub     %rdi, %rax
> > -L(exit_null_on_first_vector):
> > -       bsf     %rdx, %rdx
> > -       add     %rdx, %rax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit_null_on_second_vector):
> > -       sub     %rdi, %rax
> > -       bsf     %rdx, %rdx
> > -       add     %rdx, %rax
> > -       add     $VEC_SIZE, %rax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit_null_on_third_vector):
> > -       sub     %rdi, %rax
> > -       bsf     %rdx, %rdx
> > -       add     %rdx, %rax
> > -       add     $(VEC_SIZE * 2), %rax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit_null_on_fourth_vector):
> > -       sub     %rdi, %rax
> > -       bsf     %rdx, %rdx
> > -       add     %rdx, %rax
> > -       add     $(VEC_SIZE * 3), %rax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit_null_on_fifth_vector):
> > -       sub     %rdi, %rax
> > -       bsf     %rdx, %rdx
> > -       add     %rdx, %rax
> > -       add     $(VEC_SIZE * 4), %rax
> > -
> > -       .p2align 4
> > -L(StartStrcpyPart):
> > -       lea     (%r9, %rax), %rdi
> > -       mov     %rsi, %rcx
> > -       mov     %r9, %rax      /* save result */
> > -
> > -# ifdef USE_AS_STRNCAT
> > -       test    %r8, %r8
> > -       jz      L(ExitZero)
> > -#  define USE_AS_STRNCPY
> > -# endif
> > -
> > -# include "strcpy-evex.S"
> > +#ifndef STRCAT
> > +# define STRCAT        __strcat_evex
> >  #endif
> > +
> > +#define USE_AS_STRCAT
> > +#define STRCPY STRCAT
> > +#include "strcpy-evex.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> > new file mode 100644
> > index 0000000000..9bc777c339
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
>
> Copyright notice is missing.

Fixed in V3.
>
> > @@ -0,0 +1,85 @@
> > +    /* Simple strlen implementation that ends at L(strcat_strlen_done).  */
> > +       vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> > +       movq    %rdi, %r8
> > +       andq    $(VEC_SIZE * -1), %r8
> > +       VPCMPEQ (%r8), %VZERO, %k0
> > +       KMOV    %k0, %VRCX
> > +#ifdef USE_AS_WCSCPY
> > +       subl    %r8d, %edi
> > +       shrl    $2, %edi
> > +#endif
> > +       shrx    %VRDI, %VRCX, %VRCX
> > +#ifdef USE_AS_WCSCPY
> > +       movq    %rax, %rdi
> > +#endif
> > +       test    %VRCX, %VRCX
> > +       jnz     L(bsf_and_done_v0)
> > +
> > +
> > +       VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0
> > +       KMOV    %k0, %VRCX
> > +       leaq    (VEC_SIZE)(%r8), %rdi
> > +       test    %VRCX, %VRCX
> > +       jnz     L(bsf_and_done_v0)
> > +
> > +       VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(bsf_and_done_v1)
> > +
> > +       VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(bsf_and_done_v2)
> > +
> > +       VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(bsf_and_done_v3)
> > +
> > +       andq    $-(VEC_SIZE * 4), %rdi
> > +       .p2align 4,, 8
> > +L(loop_2x_vec):
> > +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(0)
> > +       VPMIN   (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(2)
> > +       VPMIN   (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
> > +       VPTESTN %VMM(1), %VMM(1), %k1
> > +       VPTESTN %VMM(3), %VMM(3), %k3
> > +       subq    $(VEC_SIZE * -4), %rdi
> > +       KORTEST %k1, %k3
> > +       jz      L(loop_2x_vec)
> > +
> > +       VPTESTN %VMM(0), %VMM(0), %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(bsf_and_done_v0)
> > +
> > +       KMOV    %k1, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(bsf_and_done_v1)
> > +
> > +       VPTESTN %VMM(2), %VMM(2), %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(bsf_and_done_v2)
> > +
> > +       KMOV    %k3, %VRCX
> > +L(bsf_and_done_v3):
> > +       addq    $VEC_SIZE, %rdi
> > +L(bsf_and_done_v2):
> > +       bsf     %VRCX, %VRCX
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
> > +       jmp     L(strcat_strlen_done)
> > +
> > +       .p2align 4,, 4
> > +L(bsf_and_done_v1):
> > +       addq    $VEC_SIZE, %rdi
> > +L(bsf_and_done_v0):
> > +       bsf     %VRCX, %VRCX
> > +#ifdef USE_AS_WCSCPY
> > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > +#else
> > +       addq    %rcx, %rdi
> > +#endif
> > +L(strcat_strlen_done):
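
A note on the new include, since it has no prologue of its own: the
fragment is just an inlined strlen over the destination that leaves %rdi
at dst's null terminator and then falls through into the strcpy body.
In rough C terms the combined strcat path looks like the sketch below
(shape only, not the real control flow; the helper name is made up):

#include <string.h>

/* Sketch: what ENTRY(STRCAT) does with strcat-strlen-evex.S included.  */
char *
strcat_model (char *dst, const char *src)
{
  char *end = dst + strlen (dst);   /* strcat-strlen-evex.S */
  strcpy (end, src);                /* the included strcpy-evex.S body */
  return dst;                       /* %rax was saved up front */
}
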
> > diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
> > index 82e45ac675..1ba0195ed2 100644
> > --- a/sysdeps/x86_64/multiarch/strcpy-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
> > @@ -1,4 +1,4 @@
> > -/* strcpy with 256-bit EVEX instructions.
> > +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
> >     Copyright (C) 2021-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> > @@ -17,990 +17,526 @@
> >     <https://www.gnu.org/licenses/>.  */
> >
> >  #include <isa-level.h>
> > -
> >  #if ISA_SHOULD_BUILD (4)
> >
> >
> > -# ifndef USE_AS_STRCAT
> > -#  include <sysdep.h>
> > +       /* Use evex-masked stores for small sizes. Turned off at the
> > +          moment.  */
> > +# define USE_EVEX_MASKED_STORE 0
> > +       /* Use movsb in page cross case to save code size.  */
> > +# define USE_MOVSB_IN_PAGE_CROSS       1
> >
> > -#  ifndef STRCPY
> > -#   define STRCPY  __strcpy_evex
> > -#  endif
> > +# include <sysdep.h>
> >
> > +# ifndef VEC_SIZE
> > +#  include "x86-evex256-vecs.h"
> >  # endif
> >
> > -# define VMOVU         vmovdqu64
> > -# define VMOVA         vmovdqa64
> > -
> > -/* Number of bytes in a vector register */
> > -# ifndef VEC_SIZE
> > -#  define VEC_SIZE     32
> > +# ifndef STRCPY
> > +#  define STRCPY       __strcpy_evex
> >  # endif
> >
> > -# define XMM2          xmm18
> > -# define XMM3          xmm19
> >
> > -# define YMM2          ymm18
> > -# define YMM3          ymm19
> > -# define YMM4          ymm20
> > -# define YMM5          ymm21
> > -# define YMM6          ymm22
> > -# define YMM7          ymm23
> > +# ifdef USE_AS_WCSCPY
> > +#  define VMOVU_MASK   vmovdqu32
> > +#  define VPMIN        vpminud
> > +#  define VPTESTN      vptestnmd
> > +#  define VPTEST       vptestmd
> > +#  define VPCMPEQ      vpcmpeqd
> > +#  define CHAR_SIZE    4
> >
> > -# ifndef USE_AS_STRCAT
> > +#  define REP_MOVS     rep movsd
> >
> > -/* zero register */
> > -#  define XMMZERO      xmm16
> > -#  define YMMZERO      ymm16
> > -#  define YMM1         ymm17
> > -
> > -       .section .text.evex,"ax",@progbits
> > -ENTRY (STRCPY)
> > -#  ifdef USE_AS_STRNCPY
> > -       mov     %RDX_LP, %R8_LP
> > -       test    %R8_LP, %R8_LP
> > -       jz      L(ExitZero)
> > -#  endif
> > -       mov     %rsi, %rcx
> > -#  ifndef USE_AS_STPCPY
> > -       mov     %rdi, %rax      /* save result */
> > -#  endif
> > +#  define USE_WIDE_CHAR
> > +# else
> > +#  define VMOVU_MASK   vmovdqu8
> > +#  define VPMIN        vpminub
> > +#  define VPTESTN      vptestnmb
> > +#  define VPTEST       vptestmb
> > +#  define VPCMPEQ      vpcmpeqb
> > +#  define CHAR_SIZE    1
> >
> > -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> > +#  define REP_MOVS     rep movsb
> >  # endif
> >
> > -       and     $((VEC_SIZE * 4) - 1), %ecx
> > -       cmp     $(VEC_SIZE * 2), %ecx
> > -       jbe     L(SourceStringAlignmentLessTwoVecSize)
> > -
> > -       and     $-VEC_SIZE, %rsi
> > -       and     $(VEC_SIZE - 1), %ecx
> > -
> > -       vpcmpb  $0, (%rsi), %YMMZERO, %k0
> > -       kmovd   %k0, %edx
> > -       shr     %cl, %rdx
> > +# include "reg-macros.h"
> >
> > -# ifdef USE_AS_STRNCPY
> > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > -       mov     $VEC_SIZE, %r10
> > -       sub     %rcx, %r10
> > -       cmp     %r10, %r8
> > -#  else
> > -       mov     $(VEC_SIZE + 1), %r10
> > -       sub     %rcx, %r10
> > -       cmp     %r10, %r8
> > -#  endif
> > -       jbe     L(CopyVecSizeTailCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyVecSizeTail)
> > -
> > -       vpcmpb  $0, VEC_SIZE(%rsi), %YMMZERO, %k1
> > -       kmovd   %k1, %edx
> >
> > -# ifdef USE_AS_STRNCPY
> > -       add     $VEC_SIZE, %r10
> > -       cmp     %r10, %r8
> > -       jbe     L(CopyTwoVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyTwoVecSize)
> > -
> > -       VMOVU   (%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
> > -       VMOVU   %YMM2, (%rdi)
> > -
> > -/* If source address alignment != destination address alignment */
> > -       .p2align 4
> > -L(UnalignVecSizeBoth):
> > -       sub     %rcx, %rdi
> > -# ifdef USE_AS_STRNCPY
> > -       add     %rcx, %r8
> > -       sbb     %rcx, %rcx
> > -       or      %rcx, %r8
> > -# endif
> > -       mov     $VEC_SIZE, %rcx
> > -       VMOVA   (%rsi, %rcx), %YMM2
> > -       VMOVU   %YMM2, (%rdi, %rcx)
> > -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> > -       vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > -       kmovd   %k0, %edx
> > -       add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $(VEC_SIZE * 3), %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec2)
> > +# ifdef USE_AS_STPCPY
> > +#  define END_REG      rax
> >  # else
> > -       jnz     L(CopyVecSize)
> > +#  define END_REG      rdi, %rdx, CHAR_SIZE
> >  # endif
> >
> > -       VMOVU   %YMM2, (%rdi, %rcx)
> > -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM3
> > -       vpcmpb  $0, %YMM3, %YMMZERO, %k0
> > -       kmovd   %k0, %edx
> > -       add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec3)
> > +# ifdef USE_AS_STRCAT
> > +#  define PAGE_ALIGN_REG       edx
> > +#  define PAGE_ALIGN_REG_64    rdx
> >  # else
> > -       jnz     L(CopyVecSize)
> > +#  define PAGE_ALIGN_REG       eax
> > +#  define PAGE_ALIGN_REG_64    rax
> >  # endif
> >
> > -       VMOVU   %YMM3, (%rdi, %rcx)
> > -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM4
> > -       vpcmpb  $0, %YMM4, %YMMZERO, %k0
> > -       kmovd   %k0, %edx
> > -       add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec4)
> > -# else
> > -       jnz     L(CopyVecSize)
> > -# endif
> > +# define VZERO VMM(7)
> > +# define VZERO_128     VMM_128(7)
> >
> > -       VMOVU   %YMM4, (%rdi, %rcx)
> > -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> > -       vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > -       kmovd   %k0, %edx
> > -       add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec2)
> > -# else
> > -       jnz     L(CopyVecSize)
> > -# endif
> >
> > -       VMOVU   %YMM2, (%rdi, %rcx)
> > -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> > -       vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > -       kmovd   %k0, %edx
> > -       add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec2)
> > -# else
> > -       jnz     L(CopyVecSize)
> > -# endif
> > +# define PAGE_SIZE     4096
> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> >
> > -       VMOVA   VEC_SIZE(%rsi, %rcx), %YMM3
> > -       VMOVU   %YMM2, (%rdi, %rcx)
> > -       vpcmpb  $0, %YMM3, %YMMZERO, %k0
> > -       kmovd   %k0, %edx
> > -       add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec3)
> > -# else
> > -       jnz     L(CopyVecSize)
> > -# endif
> >
> > -       VMOVU   %YMM3, (%rdi, %rcx)
> > -       mov     %rsi, %rdx
> > -       lea     VEC_SIZE(%rsi, %rcx), %rsi
> > -       and     $-(VEC_SIZE * 4), %rsi
> > -       sub     %rsi, %rdx
> > -       sub     %rdx, %rdi
> > -# ifdef USE_AS_STRNCPY
> > -       lea     (VEC_SIZE * 8)(%r8, %rdx), %r8
> > -# endif
> > -L(UnalignedFourVecSizeLoop):
> > -       VMOVA   (%rsi), %YMM4
> > -       VMOVA   VEC_SIZE(%rsi), %YMM5
> > -       VMOVA   (VEC_SIZE * 2)(%rsi), %YMM6
> > -       VMOVA   (VEC_SIZE * 3)(%rsi), %YMM7
> > -       vpminub %YMM5, %YMM4, %YMM2
> > -       vpminub %YMM7, %YMM6, %YMM3
> > -       vpminub %YMM2, %YMM3, %YMM2
> > -       /* If K7 != 0, there is a null byte.  */
> > -       vpcmpb  $0, %YMM2, %YMMZERO, %k7
> > -       kmovd   %k7, %edx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $(VEC_SIZE * 4), %r8
> > -       jbe     L(UnalignedLeaveCase2OrCase3)
> > +       .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRCPY)
> > +# ifdef USE_AS_STRCAT
> > +       movq    %rdi, %rax
> > +#  include "strcat-strlen-evex.S"
> >  # endif
> > -       test    %edx, %edx
> > -       jnz     L(UnalignedFourVecSizeLeave)
> > -
> > -L(UnalignedFourVecSizeLoop_start):
> > -       add     $(VEC_SIZE * 4), %rdi
> > -       add     $(VEC_SIZE * 4), %rsi
> > -       VMOVU   %YMM4, -(VEC_SIZE * 4)(%rdi)
> > -       VMOVA   (%rsi), %YMM4
> > -       VMOVU   %YMM5, -(VEC_SIZE * 3)(%rdi)
> > -       VMOVA   VEC_SIZE(%rsi), %YMM5
> > -       vpminub %YMM5, %YMM4, %YMM2
> > -       VMOVU   %YMM6, -(VEC_SIZE * 2)(%rdi)
> > -       VMOVA   (VEC_SIZE * 2)(%rsi), %YMM6
> > -       VMOVU   %YMM7, -VEC_SIZE(%rdi)
> > -       VMOVA   (VEC_SIZE * 3)(%rsi), %YMM7
> > -       vpminub %YMM7, %YMM6, %YMM3
> > -       vpminub %YMM2, %YMM3, %YMM2
> > -       /* If K7 != 0, there is a null byte.  */
> > -       vpcmpb  $0, %YMM2, %YMMZERO, %k7
> > -       kmovd   %k7, %edx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $(VEC_SIZE * 4), %r8
> > -       jbe     L(UnalignedLeaveCase2OrCase3)
> > +
> > +       movl    %esi, %PAGE_ALIGN_REG
> > +       andl    $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> > +       ja      L(page_cross)
> > +L(page_cross_continue):
> > +       VMOVU   (%rsi), %VMM(0)
> > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > +       movq    %rdi, %rax
> >  # endif
> > -       test    %edx, %edx
> > -       jz      L(UnalignedFourVecSizeLoop_start)
> >
> > -L(UnalignedFourVecSizeLeave):
> > -       vpcmpb  $0, %YMM4, %YMMZERO, %k1
> > -       kmovd   %k1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(CopyVecSizeUnaligned_0)
> >
> > -       vpcmpb  $0, %YMM5, %YMMZERO, %k2
> > -       kmovd   %k2, %ecx
> > -       test    %ecx, %ecx
> > -       jnz     L(CopyVecSizeUnaligned_16)
> > +       /* Two short string implementations: one with a traditional
> > +          branching approach and one with masked instructions (which
> > +          have potential for dramatically bad perf if dst splits a
> > +          page and is not in the TLB).  */
> > +# if USE_EVEX_MASKED_STORE
> > +       VPTEST  %VMM(0), %VMM(0), %k0
> > +       KMOV    %k0, %VRCX
> > +#  ifdef USE_AS_WCSCPY
> > +       subl    $((1 << CHAR_PER_VEC) - 1), %VRCX
> > +#  else
> > +       inc     %VRCX
> > +#  endif
> > +       jz      L(more_1x_vec)
> > +       KMOV    %VRCX, %k1
> > +       KXOR    %k0, %k1, %k1
> >
> > -       vpcmpb  $0, %YMM6, %YMMZERO, %k3
> > -       kmovd   %k3, %edx
> > -       test    %edx, %edx
> > -       jnz     L(CopyVecSizeUnaligned_32)
> > -
> > -       vpcmpb  $0, %YMM7, %YMMZERO, %k4
> > -       kmovd   %k4, %ecx
> > -       bsf     %ecx, %edx
> > -       VMOVU   %YMM4, (%rdi)
> > -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> > -       VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > -       lea     (VEC_SIZE * 3)(%rdi, %rdx), %rax
> > -# endif
> > -       VMOVU   %YMM7, (VEC_SIZE * 3)(%rdi)
> > -       add     $(VEC_SIZE - 1), %r8
> > -       sub     %rdx, %r8
> > -       lea     ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> > -       jmp     L(StrncpyFillTailWithZero)
> > -# else
> > -       add     $(VEC_SIZE * 3), %rsi
> > -       add     $(VEC_SIZE * 3), %rdi
> > -       jmp     L(CopyVecSizeExit)
> > -# endif
> > +       VMOVU_MASK %VMM(0), (%rdi){%k1}
> >
> > -/* If source address alignment == destination address alignment */
> > +#  ifdef USE_AS_STPCPY
> > +       bsf     %VRCX, %VRCX
> > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rax
> > +#  endif
> > +       ret
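
For reference, the masked-store variant above reduces to: build a mask of
the characters up to and including the first null and issue a single
masked store. A rough intrinsics equivalent for the byte / VEC_SIZE == 32
case (same logic, not a transcription of the VPTEST/KXOR sequence; as in
the asm, the unaligned full-vector load is only safe because the
page-cross check has already been done):

#include <immintrin.h>

/* Sketch of the USE_EVEX_MASKED_STORE short-string path.  Returns
   nonzero if the whole string (terminator included) fit in one vector
   and was copied.  Needs AVX512BW/VL and BMI1.  */
static int
copy_short_masked (char *dst, const char *src)
{
  __m256i v = _mm256_loadu_si256 ((const __m256i *) src);
  __mmask32 nul = _mm256_cmpeq_epi8_mask (v, _mm256_setzero_si256 ());
  if (nul == 0)
    return 0;                       /* no null in the first vector */
  /* blsmsk: all bits up to and including the lowest set bit, i.e. the
     string body plus its null terminator.  */
  __mmask32 keep = _blsmsk_u32 (nul);
  _mm256_mask_storeu_epi8 (dst, keep, v);
  return 1;
}
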
> >
> > -L(SourceStringAlignmentLessTwoVecSize):
> > -       VMOVU   (%rsi), %YMM3
> > -       VMOVU   VEC_SIZE(%rsi), %YMM2
> > -       vpcmpb  $0, %YMM3, %YMMZERO, %k0
> > -       kmovd   %k0, %edx
> > +# else
> > +       VPTESTN %VMM(0), %VMM(0), %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jz      L(more_1x_vec)
> >
> > -# ifdef USE_AS_STRNCPY
> > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > -       cmp     $VEC_SIZE, %r8
> > +       xorl    %edx, %edx
> > +       bsf     %VRCX, %VRDX
> > +#  ifdef USE_AS_STPCPY
> > +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#  endif
> > +
> > +       /* Use mask bits in rcx to detect which copy we need. If the low
> > +          mask is zero then there must be a bit set in the upper half.
> > +          I.e. if rcx != 0 and ecx == 0, then the match must be in the upper 32
> > +          bits so we use L(copy_32_63).  */
> > +#  if VEC_SIZE == 64
> > +#   ifdef USE_AS_WCSCPY
> > +       testb   %cl, %cl
> > +#   else
> > +       testl   %ecx, %ecx
> > +#   endif
> > +       jz      L(copy_32_63)
> > +#  endif
> > +
> > +#  ifdef USE_AS_WCSCPY
> > +       testb   $0xf, %cl
> >  #  else
> > -       cmp     $(VEC_SIZE + 1), %r8
> > +       testw   %cx, %cx
> >  #  endif
> > -       jbe     L(CopyVecSizeTail1Case2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyVecSizeTail1)
> > +       jz      L(copy_16_31)
> >
> > -       VMOVU   %YMM3, (%rdi)
> > -       vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > -       kmovd   %k0, %edx
> >
> > -# ifdef USE_AS_STRNCPY
> > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > -       cmp     $(VEC_SIZE * 2), %r8
> > +#  ifdef USE_AS_WCSCPY
> > +       testb   $0x3, %cl
> >  #  else
> > -       cmp     $((VEC_SIZE * 2) + 1), %r8
> > +       testb   %cl, %cl
> >  #  endif
> > -       jbe     L(CopyTwoVecSize1Case2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyTwoVecSize1)
> > -
> > -       and     $-VEC_SIZE, %rsi
> > -       and     $(VEC_SIZE - 1), %ecx
> > -       jmp     L(UnalignVecSizeBoth)
> > +       jz      L(copy_8_15)
> >
> > -/*------End of main part with loops---------------------*/
> >
> > -/* Case1 */
> > +#  ifdef USE_AS_WCSCPY
> > +       vmovd   %VMM_128(0), (%rdi)
> > +       /* No need to copy, we know it's zero.  */
> > +       movl    $0, (%END_REG)
> >
> > -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> > -       .p2align 4
> > -L(CopyVecSize):
> > -       add     %rcx, %rdi
> > -# endif
> > -L(CopyVecSizeTail):
> > -       add     %rcx, %rsi
> > -L(CopyVecSizeTail1):
> > -       bsf     %edx, %edx
> > -L(CopyVecSizeExit):
> > -       cmp     $32, %edx
> > -       jae     L(Exit32_63)
> > -       cmp     $16, %edx
> > -       jae     L(Exit16_31)
> > -       cmp     $8, %edx
> > -       jae     L(Exit8_15)
> > -       cmp     $4, %edx
> > -       jae     L(Exit4_7)
> > -       cmp     $3, %edx
> > -       je      L(Exit3)
> > -       cmp     $1, %edx
> > -       ja      L(Exit2)
> > -       je      L(Exit1)
> > -       movb    $0, (%rdi)
> > -# ifdef USE_AS_STPCPY
> > -       lea     (%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     $1, %r8
> > -       lea     1(%rdi), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > -# endif
> >         ret
> > +#  else
> >
> > -       .p2align 4
> > -L(CopyTwoVecSize1):
> > -       add     $VEC_SIZE, %rsi
> > -       add     $VEC_SIZE, %rdi
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     $VEC_SIZE, %r8
> > -# endif
> > -       jmp     L(CopyVecSizeTail1)
> > -
> > -       .p2align 4
> > -L(CopyTwoVecSize):
> > -       bsf     %edx, %edx
> > -       add     %rcx, %rsi
> > -       add     $VEC_SIZE, %edx
> > -       sub     %ecx, %edx
> > -       jmp     L(CopyVecSizeExit)
> > -
> > -       .p2align 4
> > -L(CopyVecSizeUnaligned_0):
> > -       bsf     %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %rdx), %rax
> > -# endif
> > -       VMOVU   %YMM4, (%rdi)
> > -       add     $((VEC_SIZE * 4) - 1), %r8
> > -       sub     %rdx, %r8
> > -       lea     1(%rdi, %rdx), %rdi
> > -       jmp     L(StrncpyFillTailWithZero)
> > -# else
> > -       jmp     L(CopyVecSizeExit)
> > -# endif
> > +       testb   $0x7, %cl
> > +       jz      L(copy_4_7)
> >
> > -       .p2align 4
> > -L(CopyVecSizeUnaligned_16):
> > -       bsf     %ecx, %edx
> > -       VMOVU   %YMM4, (%rdi)
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > -       lea     VEC_SIZE(%rdi, %rdx), %rax
> > -# endif
> > -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> > -       add     $((VEC_SIZE * 3) - 1), %r8
> > -       sub     %rdx, %r8
> > -       lea     (VEC_SIZE + 1)(%rdi, %rdx), %rdi
> > -       jmp     L(StrncpyFillTailWithZero)
> > -# else
> > -       add     $VEC_SIZE, %rsi
> > -       add     $VEC_SIZE, %rdi
> > -       jmp     L(CopyVecSizeExit)
> > -# endif
> >
> > -       .p2align 4
> > -L(CopyVecSizeUnaligned_32):
> > -       bsf     %edx, %edx
> > -       VMOVU   %YMM4, (%rdi)
> > -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > -       lea     (VEC_SIZE * 2)(%rdi, %rdx), %rax
> > -# endif
> > -       VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > -       add     $((VEC_SIZE * 2) - 1), %r8
> > -       sub     %rdx, %r8
> > -       lea     ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> > -       jmp     L(StrncpyFillTailWithZero)
> > -# else
> > -       add     $(VEC_SIZE * 2), %rsi
> > -       add     $(VEC_SIZE * 2), %rdi
> > -       jmp     L(CopyVecSizeExit)
> > -# endif
> > +       test    %edx, %edx
> > +       jz      L(set_null_term)
> >
> > -# ifdef USE_AS_STRNCPY
> > -#  ifndef USE_AS_STRCAT
> > -       .p2align 4
> > -L(CopyVecSizeUnalignedVec6):
> > -       VMOVU   %YMM6, (%rdi, %rcx)
> > -       jmp     L(CopyVecSizeVecExit)
> > -
> > -       .p2align 4
> > -L(CopyVecSizeUnalignedVec5):
> > -       VMOVU   %YMM5, (%rdi, %rcx)
> > -       jmp     L(CopyVecSizeVecExit)
> > -
> > -       .p2align 4
> > -L(CopyVecSizeUnalignedVec4):
> > -       VMOVU   %YMM4, (%rdi, %rcx)
> > -       jmp     L(CopyVecSizeVecExit)
> > -
> > -       .p2align 4
> > -L(CopyVecSizeUnalignedVec3):
> > -       VMOVU   %YMM3, (%rdi, %rcx)
> > -       jmp     L(CopyVecSizeVecExit)
> > +       /* NB: make this `vmovw` if support for AVX512-FP16 is added.
> > +        */
> > +       vmovd   %VMM_128(0), %esi
> > +       movw    %si, (%rdi)
> > +
> > +       .p2align 4,, 1
> > +L(set_null_term):
> > +       /* No need to copy, we know it's zero.  */
> > +       movb    $0, (%END_REG)
> > +       ret
> >  #  endif
> >
> > -/* Case2 */
> > -
> > -       .p2align 4
> > -L(CopyVecSizeCase2):
> > -       add     $VEC_SIZE, %r8
> > -       add     %rcx, %rdi
> > -       add     %rcx, %rsi
> > -       bsf     %edx, %edx
> > -       cmp     %r8d, %edx
> > -       jb      L(CopyVecSizeExit)
> > -       jmp     L(StrncpyExit)
> > -
> > -       .p2align 4
> > -L(CopyTwoVecSizeCase2):
> > -       add     %rcx, %rsi
> > -       bsf     %edx, %edx
> > -       add     $VEC_SIZE, %edx
> > -       sub     %ecx, %edx
> > -       cmp     %r8d, %edx
> > -       jb      L(CopyVecSizeExit)
> > -       jmp     L(StrncpyExit)
> > -
> > -L(CopyVecSizeTailCase2):
> > -       add     %rcx, %rsi
> > -       bsf     %edx, %edx
> > -       cmp     %r8d, %edx
> > -       jb      L(CopyVecSizeExit)
> > -       jmp     L(StrncpyExit)
> > -
> > -L(CopyVecSizeTail1Case2):
> > -       bsf     %edx, %edx
> > -       cmp     %r8d, %edx
> > -       jb      L(CopyVecSizeExit)
> > -       jmp     L(StrncpyExit)
> > -
> > -/* Case2 or Case3,  Case3 */
> > -
> > -       .p2align 4
> > -L(CopyVecSizeCase2OrCase3):
> > -       test    %rdx, %rdx
> > -       jnz     L(CopyVecSizeCase2)
> > -L(CopyVecSizeCase3):
> > -       add     $VEC_SIZE, %r8
> > -       add     %rcx, %rdi
> > -       add     %rcx, %rsi
> > -       jmp     L(StrncpyExit)
> > -
> > -       .p2align 4
> > -L(CopyTwoVecSizeCase2OrCase3):
> > -       test    %rdx, %rdx
> > -       jnz     L(CopyTwoVecSizeCase2)
> > -       add     %rcx, %rsi
> > -       jmp     L(StrncpyExit)
> > -
> > -       .p2align 4
> > -L(CopyVecSizeTailCase2OrCase3):
> > -       test    %rdx, %rdx
> > -       jnz     L(CopyVecSizeTailCase2)
> > -       add     %rcx, %rsi
> > -       jmp     L(StrncpyExit)
> > -
> > -       .p2align 4
> > -L(CopyTwoVecSize1Case2OrCase3):
> > -       add     $VEC_SIZE, %rdi
> > -       add     $VEC_SIZE, %rsi
> > -       sub     $VEC_SIZE, %r8
> > -L(CopyVecSizeTail1Case2OrCase3):
> > -       test    %rdx, %rdx
> > -       jnz     L(CopyVecSizeTail1Case2)
> > -       jmp     L(StrncpyExit)
> > +#  if VEC_SIZE == 64
> > +       .p2align 4,, 6
> > +L(copy_32_63):
> > +       VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > +       VMOVU   %VMM_256(0), (%rdi)
> > +       VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
> > +       ret
> > +#  endif
> > +
> > +
> > +       .p2align 4,, 6
> > +L(copy_16_31):
> > +       /* Use xmm1 explicitly here as it won't require a `vzeroupper`
> > +          and will save code size.  */
> > +       vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > +       VMOVU   %VMM_128(0), (%rdi)
> > +       vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> > +       ret
> > +
> > +       .p2align 4,, 8
> > +L(copy_8_15):
> > +#  ifdef USE_AS_WCSCPY
> > +       movl    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> > +#  else
> > +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
> > +#  endif
> > +       vmovq   %VMM_128(0), (%rdi)
> > +       movq    %rcx, -(8 - CHAR_SIZE)(%END_REG)
> > +       ret
> >  # endif
> >
> > -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
> >
> > -       .p2align 4
> > -L(Exit1):
> > -       movzwl  (%rsi), %edx
> > -       mov     %dx, (%rdi)
> > -# ifdef USE_AS_STPCPY
> > -       lea     1(%rdi), %rax
> > +# ifndef USE_AS_WCSCPY
> > +       .p2align 4,, 12
> > +L(copy_4_7):
> > +       movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> > +       vmovd   %VMM_128(0), (%rdi)
> > +       movl    %ecx, -(4 - CHAR_SIZE)(%END_REG)
> > +       ret
> >  # endif
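
The branchy variant then dispatches on the position of the null (the
sub-register tests on %cl/%cx above) and copies with two overlapping
moves so no byte loop is needed. The 8-15 byte case in C, assuming len
is the byte index of the terminator as computed by the bsf above
(hypothetical helper, sketch only):

#include <stddef.h>
#include <string.h>

/* Sketch of L(copy_8_15) for byte strings: with the terminator at index
   len, 8 <= len <= 15, two 8-byte moves (head, and tail ending at the
   null) cover the whole string including the terminator; the middle
   bytes simply overlap.  */
static void
copy_8_15 (char *dst, const char *src, size_t len)
{
  memcpy (dst, src, 8);                        /* head, VMM(0) in the asm */
  memcpy (dst + len - 7, src + len - 7, 8);    /* tail, ends at the null */
}
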
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     $2, %r8
> > -       lea     2(%rdi), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > +
> > +
> > +       .p2align 4,, 8
> > +L(more_1x_vec):
> > +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > +       VMOVU   %VMM(0), (%rdi)
> >  # endif
> > -       ret
> > +       subq    %rsi, %rdi
> > +       andq    $-(VEC_SIZE), %rsi
> > +       addq    %rsi, %rdi
> > +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> >
> > -       .p2align 4
> > -L(Exit2):
> > -       movzwl  (%rsi), %ecx
> > -       mov     %cx, (%rdi)
> > -       movb    $0, 2(%rdi)
> > +       /* Ideally we store after moves to minimize impact of potential
> > +          false-dependencies.  */
> > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > +       VMOVU   %VMM(0), (%rax)
> > +# endif
> > +
> > +       VPTESTN %VMM(1), %VMM(1), %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(ret_vec_x1)
> > +
> > +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > +       VMOVU   %VMM(1), VEC_SIZE(%rdi)
> > +
> > +       VPTESTN %VMM(2), %VMM(2), %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(ret_vec_x2)
> > +
> > +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > +
> > +       VPTESTN %VMM(3), %VMM(3), %k0
> > +       KMOV    %k0, %VRDX
> > +       test    %VRDX, %VRDX
> > +       jnz     L(ret_vec_x3)
> > +
> > +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > +       VPTESTN %VMM(4), %VMM(4), %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(ret_vec_x4)
> > +
> > +       VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> > +
> > +
> > +       /* Align for 4x loop.  */
> > +       subq    %rsi, %rdi
> > +
> > +       /* + VEC_SIZE * 5 because rsi was never advanced past the original
> > +          VEC_SIZE we covered before aligning.  */
> > +       subq    $-(VEC_SIZE * 5), %rsi
> > +       andq    $-(VEC_SIZE * 4), %rsi
> > +
> > +
> > +       /* Load first half of the loop before entry.  */
> > +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +       VPTESTN %VMM(4), %VMM(4), %k2
> > +       VPTESTN %VMM(6), %VMM(6), %k4
> > +       KORTEST %k2, %k4
> > +       jnz     L(loop_4x_done)
> > +
> > +       .p2align 4,, 11
> > +L(loop_4x_vec):
> > +
> > +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> > +
> > +       subq    $(VEC_SIZE * -4), %rsi
> > +
> > +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > +
> > +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +       VPTESTN %VMM(4), %VMM(4), %k2
> > +       VPTESTN %VMM(6), %VMM(6), %k4
> > +       KORTEST %k2, %k4
> > +       jz      L(loop_4x_vec)
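
The 4x loop keeps the null check cheap by folding the four source
vectors with unsigned min before testing: min(a, b) has a zero element
iff a or b does, so two VPTESTNs plus a KORTEST cover all four vectors.
Roughly, in intrinsics form for the byte / VEC_SIZE == 32 case (sketch,
hypothetical helper name):

#include <immintrin.h>

/* Sketch of the loop_4x_vec termination test: reduce four vectors to
   two tests with unsigned byte min.  vptestnmb of a vector against
   itself sets mask bit i exactly when byte i is zero.  */
static int
any_null_in_4 (__m256i v0, __m256i v1, __m256i v2, __m256i v3)
{
  __m256i m01 = _mm256_min_epu8 (v0, v1);
  __m256i m23 = _mm256_min_epu8 (v2, v3);
  return (_mm256_testn_epi8_mask (m01, m01)
          | _mm256_testn_epi8_mask (m23, m23)) != 0;
}
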
> > +
> > +L(loop_4x_done):
> > +       VPTESTN %VMM(0), %VMM(0), %k0
> > +       KMOV    %k0, %VRCX
> > +       /* Restore rdi (%rdi).  */
> > +       addq    %rsi, %rdi
> > +       test    %VRCX, %VRCX
> > +       jnz     L(ret_vec_x0_end)
> > +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> > +
> > +       KMOV    %k2, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(ret_vec_x1)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> > +
> > +       VPTESTN %VMM(2), %VMM(2), %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(ret_vec_x2)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> > +       /* Place L(ret_vec_x4) here to save code size.  We get a
> > +          meaningful benefit doing this for stpcpy.  */
> > +       KMOV    %k4, %VRDX
> > +L(ret_vec_x3):
> > +       bsf     %VRDX, %VRDX
> > +       VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > +       VMOVU   %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> >  # ifdef USE_AS_STPCPY
> > -       lea     2(%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     $3, %r8
> > -       lea     3(%rdi), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > +       leaq    (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
> >  # endif
> > +L(return_end):
> >         ret
> >
> > -       .p2align 4
> > -L(Exit3):
> > -       mov     (%rsi), %edx
> > -       mov     %edx, (%rdi)
> > +       .p2align 4,, 6
> > +L(ret_vec_x0_end):
> > +       bsf     %VRCX, %VRCX
> >  # ifdef USE_AS_STPCPY
> > -       lea     3(%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     $4, %r8
> > -       lea     4(%rdi), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rax
> >  # endif
> > +       inc     %VRCX
> > +       VMOVU   (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +       VMOVU   %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> >         ret
> >
> > -       .p2align 4
> > -L(Exit4_7):
> > -       mov     (%rsi), %ecx
> > -       mov     %ecx, (%rdi)
> > -       mov     -3(%rsi, %rdx), %ecx
> > -       mov     %ecx, -3(%rdi, %rdx)
> > +       .p2align 4,, 8
> > +L(ret_vec_x1):
> > +       bsf     %VRCX, %VRCX
> > +       VMOVU   (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +       VMOVU   %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> >  # ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     %rdx, %r8
> > -       sub     $1, %r8
> > -       lea     1(%rdi, %rdx), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > +       leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
> >  # endif
> >         ret
> >
> > -       .p2align 4
> > -L(Exit8_15):
> > -       mov     (%rsi), %rcx
> > -       mov     -7(%rsi, %rdx), %r9
> > -       mov     %rcx, (%rdi)
> > -       mov     %r9, -7(%rdi, %rdx)
> > +       .p2align 4,, 4
> > +L(ret_vec_x2):
> > +       bsf     %VRCX, %VRCX
> > +       VMOVU   ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +       VMOVU   %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> >  # ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     %rdx, %r8
> > -       sub     $1, %r8
> > -       lea     1(%rdi, %rdx), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
> >  # endif
> >         ret
> >
> > -       .p2align 4
> > -L(Exit16_31):
> > -       VMOVU   (%rsi), %XMM2
> > -       VMOVU   -15(%rsi, %rdx), %XMM3
> > -       VMOVU   %XMM2, (%rdi)
> > -       VMOVU   %XMM3, -15(%rdi, %rdx)
> > +       /* ret_vec_x3 reuses return code after the loop.  */
> > +       .p2align 4,, 6
> > +L(ret_vec_x4):
> > +       bsf     %VRCX, %VRCX
> > +       VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +       VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> >  # ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub %rdx, %r8
> > -       sub $1, %r8
> > -       lea 1(%rdi, %rdx), %rdi
> > -       jnz L(StrncpyFillTailWithZero)
> > +       leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
> >  # endif
> >         ret
> >
> > -       .p2align 4
> > -L(Exit32_63):
> > -       VMOVU   (%rsi), %YMM2
> > -       VMOVU   -31(%rsi, %rdx), %YMM3
> > -       VMOVU   %YMM2, (%rdi)
> > -       VMOVU   %YMM3, -31(%rdi, %rdx)
> > -# ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %rdx), %rax
> > +
> > +       .p2align 4,, 4
> > +L(page_cross):
> > +# ifndef USE_AS_STRCAT
> > +       vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> >  # endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     %rdx, %r8
> > -       sub     $1, %r8
> > -       lea     1(%rdi, %rdx), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > +       movq    %rsi, %rcx
> > +       andq    $(VEC_SIZE * -1), %rcx
> > +
> > +       VPCMPEQ (%rcx), %VZERO, %k0
> > +       KMOV    %k0, %VRCX
> > +# ifdef USE_AS_WCSCPY
> > +       andl    $(VEC_SIZE - 1), %PAGE_ALIGN_REG
> > +       shrl    $2, %PAGE_ALIGN_REG
> >  # endif
> > -       ret
> > +       shrx    %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
> >
> > -# ifdef USE_AS_STRNCPY
> > +# if USE_MOVSB_IN_PAGE_CROSS
> > +       /* Optimizing more aggressively for space as this is very cold
> > +          code. This saves 2x cache lines.  */
> >
> > -       .p2align 4
> > -L(StrncpyExit1):
> > -       movzbl  (%rsi), %edx
> > -       mov     %dl, (%rdi)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     1(%rdi), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, 1(%rdi)
> > +       /* Shifting left by one adds one to the later bsf result, which
> > +          gives the correct copy bound (the null terminator is included).
> > +          NB: this can never zero out a non-zero RCX because, to be in the
> > +          page-cross case, rsi cannot be aligned and rcx has already been
> > +          right-shifted by the misalignment.  */
> > +       shl     %VRCX
> > +       jz      L(page_cross_continue)
> > +#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > +       movq    %rdi, %rax
> >  #  endif
> > -       ret
> > +       bsf     %VRCX, %VRCX
> > +       REP_MOVS
> >
> > -       .p2align 4
> > -L(StrncpyExit2):
> > -       movzwl  (%rsi), %edx
> > -       mov     %dx, (%rdi)
> >  #  ifdef USE_AS_STPCPY
> > -       lea     2(%rdi), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, 2(%rdi)
> > +       leaq    -CHAR_SIZE(%rdi), %rax
> >  #  endif
> >         ret
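
The rep-movs page-cross path only has to handle the case where the
terminator shows up before the end of src's page; everything else goes
back to L(page_cross_continue). In C terms it is roughly the following
(sketch with a hypothetical helper; the asm derives the count from the
shifted mask with bsf instead of memchr):

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Sketch of the USE_MOVSB_IN_PAGE_CROSS path for plain strcpy.  Returns
   dst if the whole string was copied, NULL to mean "take the normal
   path".  */
static char *
page_cross_copy (char *dst, const char *src)
{
  size_t to_page_end = PAGE_SIZE - ((uintptr_t) src & (PAGE_SIZE - 1));
  const char *nul = memchr (src, '\0', to_page_end);
  if (nul == NULL)
    return NULL;                               /* no terminator in this page */
  memcpy (dst, src, (size_t) (nul - src) + 1); /* rep movsb in the asm */
  return dst;
}
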
> >
> > -       .p2align 4
> > -L(StrncpyExit3_4):
> > -       movzwl  (%rsi), %ecx
> > -       movzwl  -2(%rsi, %r8), %edx
> > -       mov     %cx, (%rdi)
> > -       mov     %dx, -2(%rdi, %r8)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %r8), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (%rdi, %r8)
> > -#  endif
> > -       ret
> >
> > -       .p2align 4
> > -L(StrncpyExit5_8):
> > -       mov     (%rsi), %ecx
> > -       mov     -4(%rsi, %r8), %edx
> > -       mov     %ecx, (%rdi)
> > -       mov     %edx, -4(%rdi, %r8)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %r8), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (%rdi, %r8)
> > -#  endif
> > -       ret
> > +# else
> > +       /* Check if we found a zero CHAR before the end of the page.  */
> > +       test    %VRCX, %VRCX
> > +       jz      L(page_cross_continue)
> >
> > -       .p2align 4
> > -L(StrncpyExit9_16):
> > -       mov     (%rsi), %rcx
> > -       mov     -8(%rsi, %r8), %rdx
> > -       mov     %rcx, (%rdi)
> > -       mov     %rdx, -8(%rdi, %r8)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %r8), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (%rdi, %r8)
> > -#  endif
> > -       ret
> > +       /* Traditional copy case, essentially the same as in the non-page-
> > +          cross case, but since we can't reuse VMM(0) we need twice as
> > +          many loads from rsi.  */
> >
> > -       .p2align 4
> > -L(StrncpyExit17_32):
> > -       VMOVU   (%rsi), %XMM2
> > -       VMOVU   -16(%rsi, %r8), %XMM3
> > -       VMOVU   %XMM2, (%rdi)
> > -       VMOVU   %XMM3, -16(%rdi, %r8)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %r8), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (%rdi, %r8)
> > +#  ifndef USE_AS_STRCAT
> > +       xorl    %edx, %edx
> >  #  endif
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncpyExit33_64):
> > -       /*  0/32, 31/16 */
> > -       VMOVU   (%rsi), %YMM2
> > -       VMOVU   -VEC_SIZE(%rsi, %r8), %YMM3
> > -       VMOVU   %YMM2, (%rdi)
> > -       VMOVU   %YMM3, -VEC_SIZE(%rdi, %r8)
> > +       /* Dependency on rdi must already have been satisfied.  */
> > +       bsf     %VRCX, %VRDX
> >  #  ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %r8), %rax
> > +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#  elif !defined USE_AS_STRCAT
> > +       movq    %rdi, %rax
> >  #  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (%rdi, %r8)
> > -#  endif
> > -       ret
> >
> > -       .p2align 4
> > -L(StrncpyExit65):
> > -       /* 0/32, 32/32, 64/1 */
> > -       VMOVU   (%rsi), %YMM2
> > -       VMOVU   32(%rsi), %YMM3
> > -       mov     64(%rsi), %cl
> > -       VMOVU   %YMM2, (%rdi)
> > -       VMOVU   %YMM3, 32(%rdi)
> > -       mov     %cl, 64(%rdi)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     65(%rdi), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, 65(%rdi)
> > +#  if VEC_SIZE == 64
> > +#   ifdef USE_AS_WCSCPY
> > +       testb   %cl, %cl
> > +#   else
> > +       test    %ecx, %ecx
> > +#   endif
> > +       jz      L(page_cross_copy_32_63)
> >  #  endif
> > -       ret
> > -
> > -#  ifndef USE_AS_STRCAT
> >
> > -       .p2align 4
> > -L(Fill1):
> > -       mov     %dl, (%rdi)
> > -       ret
> > +#  ifdef USE_AS_WCSCPY
> > +       testb   $0xf, %cl
> > +#  else
> > +       testw   %cx, %cx
> > +#  endif
> > +       jz      L(page_cross_copy_16_31)
> >
> > -       .p2align 4
> > -L(Fill2):
> > -       mov     %dx, (%rdi)
> > -       ret
> > +#  ifdef USE_AS_WCSCPY
> > +       testb   $0x3, %cl
> > +#  else
> > +       testb   %cl, %cl
> > +#  endif
> > +       jz      L(page_cross_copy_8_15)
> >
> > -       .p2align 4
> > -L(Fill3_4):
> > -       mov     %dx, (%rdi)
> > -       mov     %dx, -2(%rdi, %r8)
> > +#  ifdef USE_AS_WCSCPY
> > +       movl    (%rsi), %esi
> > +       movl    %esi, (%rdi)
> > +       movl    $0, (%END_REG)
> >         ret
> > +#  else
> >
> > -       .p2align 4
> > -L(Fill5_8):
> > -       mov     %edx, (%rdi)
> > -       mov     %edx, -4(%rdi, %r8)
> > -       ret
> > +       testb   $0x7, %cl
> > +       jz      L(page_cross_copy_4_7)
> >
> > -       .p2align 4
> > -L(Fill9_16):
> > -       mov     %rdx, (%rdi)
> > -       mov     %rdx, -8(%rdi, %r8)
> > +       test    %edx, %edx
> > +       jz      L(page_cross_set_null_term)
> > +       movzwl  (%rsi), %ecx
> > +       movw    %cx, (%rdi)
> > +L(page_cross_set_null_term):
> > +       movb    $0, (%END_REG)
> >         ret
> >
> > -       .p2align 4
> > -L(Fill17_32):
> > -       VMOVU   %XMMZERO, (%rdi)
> > -       VMOVU   %XMMZERO, -16(%rdi, %r8)
> > -       ret
> >
> > -       .p2align 4
> > -L(CopyVecSizeUnalignedVec2):
> > -       VMOVU   %YMM2, (%rdi, %rcx)
> > -
> > -       .p2align 4
> > -L(CopyVecSizeVecExit):
> > -       bsf     %edx, %edx
> > -       add     $(VEC_SIZE - 1), %r8
> > -       add     %rcx, %rdi
> > -#   ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %rdx), %rax
> > -#   endif
> > -       sub     %rdx, %r8
> > -       lea     1(%rdi, %rdx), %rdi
> > -
> > -       .p2align 4
> > -L(StrncpyFillTailWithZero):
> > -       xor     %edx, %edx
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(StrncpyFillExit)
> > -
> > -       VMOVU   %YMMZERO, (%rdi)
> > -       add     $VEC_SIZE, %rdi
> > -
> > -       mov     %rdi, %rsi
> > -       and     $(VEC_SIZE - 1), %esi
> > -       sub     %rsi, %rdi
> > -       add     %rsi, %r8
> > -       sub     $(VEC_SIZE * 4), %r8
> > -       jb      L(StrncpyFillLessFourVecSize)
> > -
> > -L(StrncpyFillLoopVmovdqa):
> > -       VMOVA   %YMMZERO, (%rdi)
> > -       VMOVA   %YMMZERO, VEC_SIZE(%rdi)
> > -       VMOVA   %YMMZERO, (VEC_SIZE * 2)(%rdi)
> > -       VMOVA   %YMMZERO, (VEC_SIZE * 3)(%rdi)
> > -       add     $(VEC_SIZE * 4), %rdi
> > -       sub     $(VEC_SIZE * 4), %r8
> > -       jae     L(StrncpyFillLoopVmovdqa)
> > -
> > -L(StrncpyFillLessFourVecSize):
> > -       add     $(VEC_SIZE * 2), %r8
> > -       jl      L(StrncpyFillLessTwoVecSize)
> > -       VMOVA   %YMMZERO, (%rdi)
> > -       VMOVA   %YMMZERO, VEC_SIZE(%rdi)
> > -       add     $(VEC_SIZE * 2), %rdi
> > -       sub     $VEC_SIZE, %r8
> > -       jl      L(StrncpyFillExit)
> > -       VMOVA   %YMMZERO, (%rdi)
> > -       add     $VEC_SIZE, %rdi
> > -       jmp     L(Fill)
> > -
> > -       .p2align 4
> > -L(StrncpyFillLessTwoVecSize):
> > -       add     $VEC_SIZE, %r8
> > -       jl      L(StrncpyFillExit)
> > -       VMOVA   %YMMZERO, (%rdi)
> > -       add     $VEC_SIZE, %rdi
> > -       jmp     L(Fill)
> > -
> > -       .p2align 4
> > -L(StrncpyFillExit):
> > -       add     $VEC_SIZE, %r8
> > -L(Fill):
> > -       cmp     $17, %r8d
> > -       jae     L(Fill17_32)
> > -       cmp     $9, %r8d
> > -       jae     L(Fill9_16)
> > -       cmp     $5, %r8d
> > -       jae     L(Fill5_8)
> > -       cmp     $3, %r8d
> > -       jae     L(Fill3_4)
> > -       cmp     $1, %r8d
> > -       ja      L(Fill2)
> > -       je      L(Fill1)
> > +       .p2align 4,, 4
> > +L(page_cross_copy_4_7):
> > +       movl    (%rsi), %ecx
> > +       movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
> > +       movl    %ecx, (%rdi)
> > +       movl    %esi, -(4 - CHAR_SIZE)(%END_REG)
> >         ret
> > -
> > -/* end of ifndef USE_AS_STRCAT */
> >  #  endif
> >
> > -       .p2align 4
> > -L(UnalignedLeaveCase2OrCase3):
> > -       test    %rdx, %rdx
> > -       jnz     L(UnalignedFourVecSizeLeaveCase2)
> > -L(UnalignedFourVecSizeLeaveCase3):
> > -       lea     (VEC_SIZE * 4)(%r8), %rcx
> > -       and     $-VEC_SIZE, %rcx
> > -       add     $(VEC_SIZE * 3), %r8
> > -       jl      L(CopyVecSizeCase3)
> > -       VMOVU   %YMM4, (%rdi)
> > -       sub     $VEC_SIZE, %r8
> > -       jb      L(CopyVecSizeCase3)
> > -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> > -       sub     $VEC_SIZE, %r8
> > -       jb      L(CopyVecSizeCase3)
> > -       VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > -       sub     $VEC_SIZE, %r8
> > -       jb      L(CopyVecSizeCase3)
> > -       VMOVU   %YMM7, (VEC_SIZE * 3)(%rdi)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     (VEC_SIZE * 4)(%rdi), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (VEC_SIZE * 4)(%rdi)
> > -#  endif
> > +#  if VEC_SIZE == 64
> > +       .p2align 4,, 4
> > +L(page_cross_copy_32_63):
> > +       VMOVU   (%rsi), %VMM_256(0)
> > +       VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > +       VMOVU   %VMM_256(0), (%rdi)
> > +       VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
> >         ret
> > -
> > -       .p2align 4
> > -L(UnalignedFourVecSizeLeaveCase2):
> > -       xor     %ecx, %ecx
> > -       vpcmpb  $0, %YMM4, %YMMZERO, %k1
> > -       kmovd   %k1, %edx
> > -       add     $(VEC_SIZE * 3), %r8
> > -       jle     L(CopyVecSizeCase2OrCase3)
> > -       test    %edx, %edx
> > -#  ifndef USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec4)
> > -#  else
> > -       jnz     L(CopyVecSize)
> > -#  endif
> > -       vpcmpb  $0, %YMM5, %YMMZERO, %k2
> > -       kmovd   %k2, %edx
> > -       VMOVU   %YMM4, (%rdi)
> > -       add     $VEC_SIZE, %rcx
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -       test    %edx, %edx
> > -#  ifndef USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec5)
> > -#  else
> > -       jnz     L(CopyVecSize)
> >  #  endif
> >
> > -       vpcmpb  $0, %YMM6, %YMMZERO, %k3
> > -       kmovd   %k3, %edx
> > -       VMOVU   %YMM5, VEC_SIZE(%rdi)
> > -       add     $VEC_SIZE, %rcx
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -       test    %edx, %edx
> > -#  ifndef USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec6)
> > -#  else
> > -       jnz     L(CopyVecSize)
> > -#  endif
> > -
> > -       vpcmpb  $0, %YMM7, %YMMZERO, %k4
> > -       kmovd   %k4, %edx
> > -       VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > -       lea     VEC_SIZE(%rdi, %rcx), %rdi
> > -       lea     VEC_SIZE(%rsi, %rcx), %rsi
> > -       bsf     %edx, %edx
> > -       cmp     %r8d, %edx
> > -       jb      L(CopyVecSizeExit)
> > -L(StrncpyExit):
> > -       cmp     $65, %r8d
> > -       je      L(StrncpyExit65)
> > -       cmp     $33, %r8d
> > -       jae     L(StrncpyExit33_64)
> > -       cmp     $17, %r8d
> > -       jae     L(StrncpyExit17_32)
> > -       cmp     $9, %r8d
> > -       jae     L(StrncpyExit9_16)
> > -       cmp     $5, %r8d
> > -       jae     L(StrncpyExit5_8)
> > -       cmp     $3, %r8d
> > -       jae     L(StrncpyExit3_4)
> > -       cmp     $1, %r8d
> > -       ja      L(StrncpyExit2)
> > -       je      L(StrncpyExit1)
> > -#  ifdef USE_AS_STPCPY
> > -       mov     %rdi, %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (%rdi)
> > -#  endif
> > +       .p2align 4,, 4
> > +L(page_cross_copy_16_31):
> > +       vmovdqu (%rsi), %xmm0
> > +       vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > +       vmovdqu %xmm0, (%rdi)
> > +       vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> >         ret
> >
> > -       .p2align 4
> > -L(ExitZero):
> > -#  ifndef USE_AS_STRCAT
> > -       mov     %rdi, %rax
> > -#  endif
> > +       .p2align 4,, 4
> > +L(page_cross_copy_8_15):
> > +       movq    (%rsi), %rcx
> > +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> > +       movq    %rcx, (%rdi)
> > +       movq    %rsi, -(8 - CHAR_SIZE)(%END_REG)
> >         ret
> > -
> > -# endif
> > -
> > -# ifndef USE_AS_STRCAT
> > -END (STRCPY)
> > -# else
> > -END (STRCAT)
> >  # endif
> > +END(STRCPY)
> >  #endif
> > diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
> > index 203a19bf21..d648ba5cfe 100644
> > --- a/sysdeps/x86_64/multiarch/strncat-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strncat-evex.S
> > @@ -1,7 +1,520 @@
> > -#ifndef STRNCAT
> > -# define STRNCAT       __strncat_evex
> > -#endif
> > +/* {wcs|str}ncat  with 256/512-bit EVEX.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if ISA_SHOULD_BUILD (4)
> > +
> > +       /* Use evex-masked stores for small sizes. Turned off at the
> > +          moment.  */
> > +# define USE_EVEX_MASKED_STORE 0
> > +
> > +# include <sysdep.h>
> > +
> > +# ifndef VEC_SIZE
> > +#  include "x86-evex256-vecs.h"
> > +# endif
> > +
> > +# ifndef STRNCAT
> > +#  define STRNCAT      __strncat_evex
> > +# endif
> > +
> > +
> > +# ifdef USE_AS_WCSCPY
> > +#  define movNULL      movl
> > +#  define VMOVU_MASK   vmovdqu32
> > +#  define VPMIN        vpminud
> > +#  define VPTESTN      vptestnmd
> > +#  define VPTEST       vptestmd
> > +#  define VPCMPEQ      vpcmpeqd
> > +#  define CHAR_SIZE    4
> > +
> > +#  define REP_MOVS     rep movsd
> > +
> > +#  define VMASK_REG    VR10
> > +#  define FIND_FIRST_ONE(src, dst)     movl $CHAR_PER_VEC, %dst; bsf %src, %dst
> > +
> > +#  define USE_WIDE_CHAR
> > +# else
> > +#  define movNULL      movb
> > +#  define VMOVU_MASK   vmovdqu8
> > +#  define VPMIN        vpminub
> > +#  define VPTESTN      vptestnmb
> > +#  define VPTEST       vptestmb
> > +#  define VPCMPEQ      vpcmpeqb
> > +#  define CHAR_SIZE    1
> > +
> > +#  define REP_MOVS     rep movsb
> > +
> > +#  define VMASK_REG    VRCX
> > +#  define FIND_FIRST_ONE(src, dst)     tzcnt %src, %dst
> > +
> > +# endif
> > +
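FIND_FIRST_ONE hides a subtle difference between the two variants: for bytes, tzcnt of an all-zero mask already returns the operand width (32 for a 32-bit mask, which equals CHAR_PER_VEC when VEC_SIZE is 32), while for wide chars CHAR_PER_VEC is smaller than the register width, so the macro preloads CHAR_PER_VEC and relies on bsf leaving its destination untouched for a zero source. Either way, callers can compare the result against CHAR_PER_VEC to mean "no zero-CHAR in this vector". A C sketch of that contract (the CHAR_PER_VEC value and the use of GCC's __builtin_ctz are illustrative only):

    #include <stdint.h>

    enum { CHAR_PER_VEC = 8 };   /* e.g. EVEX256 with 4-byte wide chars */

    /* Index of the first set bit in the zero-CHAR mask, or CHAR_PER_VEC
       if the mask is empty; this is what FIND_FIRST_ONE provides.  */
    static unsigned
    find_first_one (uint32_t mask)
    {
      if (mask == 0)
        return CHAR_PER_VEC;
      return (unsigned) __builtin_ctz (mask);
    }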
> > +# include "strncpy-or-cat-overflow-def.h"
> > +
> > +# include "reg-macros.h"
> > +
> > +
> > +# define VZERO VMM(7)
> > +# define VZERO_128     VMM_128(7)
> > +
> > +# define PAGE_SIZE     4096
> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > +
> > +       .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRNCAT)
> > +       movq    %rdi, %rax
> > +
> > +       /* NB: It's safe to filter out zero-length strings WITHOUT
> > +          setting null-term. Destination MUST be a null-terminated
> > +          string so essentially the work is already done.  */
> > +# ifdef USE_AS_WCSCPY
> > +       leaq    -1(%rdx), %rcx
> > +       shrq    $56, %rcx
> > +       jnz     L(zero_len)
> > +# else
> > +       test    %rdx, %rdx
> > +       jle     L(zero_len)
> > +# endif
> > +
> > +# include "strcat-strlen-evex.S"
> > +
> > +       movl    %esi, %ecx
> > +       andl    $(PAGE_SIZE - 1), %ecx
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > +       ja      L(page_cross)
> > +L(page_cross_continue):
> > +       VMOVU   (%rsi), %VMM(0)
> > +       VPTESTN %VMM(0), %VMM(0), %k0
> > +
> > +       /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> > +          <= CHAR_PER_VEC with masked instructions (which have
> > +          potential for dramatically bad perf if dst splits a page and
> > +          is not in the TLB).  */
> > +# if USE_EVEX_MASKED_STORE
> > +       KMOV    %k0, %VRCX
> > +       FIND_FIRST_ONE (VRCX, VR8)
> > +       cmpq    %r8, %rdx
> > +       jbe     L(less_1x_vec)
> > +
> > +       test    %VRCX, %VRCX
> > +       jz      L(more_1x_vec)
> > +
> > +       blsmsk  %VRCX, %VRCX
> > +       KMOV    %VRCX, %k1
> > +       VMOVU_MASK %VMM(0), (%rdi){%k1}
> > +       ret
> > +
> > +L(less_1x_vec):
> > +       mov     $-1, %VRCX
> > +       bzhi    %VRDX, %VRCX, %VRCX
> > +       KMOV    %VRCX, %k1
> > +       movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > +       VMOVU_MASK %VMM(0), (%rdi){%k1}
> > +
> > +       ret
> > +# else
> > +       KMOV    %k0, %VMASK_REG
> > +       /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
> > +          %VMASK_REG, %VRCX` for wcsncat.  */
> > +       FIND_FIRST_ONE (VMASK_REG, VRCX)
> > +       cmpq    %rcx, %rdx
> > +       jbe     L(less_1x_vec)
> > +
> > +       /* If there were no zero-CHARs (rcx was zero before
> > +          FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> > +       cmpl    $CHAR_PER_VEC, %ecx
> > +       je      L(more_1x_vec)
> > +
> > +       movl    %ecx, %edx
> > +
> > +L(less_1x_vec):
> > +#  if VEC_SIZE == 64
> > +       cmpl    $(32 / CHAR_SIZE), %edx
> > +       jae     L(copy_32_63)
> > +#  endif
> > +
> > +       cmpl    $(16 / CHAR_SIZE), %edx
> > +       jae     L(copy_16_31)
> > +
> > +
> > +       cmpl    $(8 / CHAR_SIZE), %edx
> > +       jae     L(copy_8_15)
> > +
> > +#  ifdef USE_AS_WCSCPY
> > +       vmovd   %VMM_128(0), (%rdi)
> > +       movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > +       ret
> > +#  else
> > +
> > +       cmpl    $4, %edx
> > +       jae     L(copy_4_7)
> > +
> > +       movzbl  (%rsi), %ecx
> > +       cmpl    $1, %edx
> > +       jbe     L(set_null_term)
> > +
> > +       movzwl  1(%rsi), %esi
> > +       movw    %si, 1(%rdi)
> > +
> > +       .p2align 4,, 1
> > +L(set_null_term):
> > +       movb    %cl, (%rdi)
> > +       movNULL $0, (%rdi, %rdx)
> > +       ret
> > +#  endif
> > +
> > +#  if VEC_SIZE == 64
> > +       .p2align 4,, 6
> > +L(copy_32_63):
> > +       VMOVU   -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > +       VMOVU   %VMM_256(0), (%rdi)
> > +       VMOVU   %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
> > +       movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > +       ret
> > +#  endif
> > +       .p2align 4,, 6
> > +L(copy_16_31):
> > +       /* Use xmm1 explicitly here as it won't require a `vzeroupper`
> > +          and will save code size.  */
> > +       vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > +       VMOVU   %VMM_128(0), (%rdi)
> > +       vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
> > +       movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > +       ret
> > +
> > +       .p2align 4,, 2
> > +L(copy_8_15):
> > +       movq    -(8)(%rsi, %rdx, CHAR_SIZE), %rcx
> > +       vmovq   %VMM_128(0), (%rdi)
> > +       movq    %rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
> > +       movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > +       ret
> > +
> > +#  ifndef USE_AS_WCSCPY
> > +       .p2align 4,, 12
> > +L(copy_4_7):
> > +       movl    -(4)(%rsi, %rdx, CHAR_SIZE), %ecx
> > +       vmovd   %VMM_128(0), (%rdi)
> > +       movl    %ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
> > +       movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > +       ret
> > +#  endif
> > +
> > +# endif
> > +       .p2align 4,, 4
> > +L(zero_len):
> > +# ifdef USE_AS_WCSCPY
> > +       test    %rdx, %rdx
> > +# endif
> > +       jne     OVERFLOW_STRCAT
> > +       ret
> >
> > -#define USE_AS_STRNCAT
> > -#define STRCAT STRNCAT
> > -#include "strcat-evex.S"
> > +       .p2align 4,, 8
> > +L(more_1x_vec):
> > +       VMOVU   %VMM(0), (%rdi)
> > +
> > +       /* We are going to align rsi here so will need to be able to re-
> > +          adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> > +          so rsi + rdx * CHAR_SIZE cannot overflow.  */
> > +
> > +       leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> > +       subq    %rsi, %rdi
> > +       andq    $-(VEC_SIZE), %rsi
> > +L(loop_last_4x_vec):
> > +       addq    %rsi, %rdi
> > +       subq    %rsi, %rdx
> > +# ifdef USE_AS_WCSCPY
> > +       shrq    $2, %rdx
> > +# endif
> > +
> > +       /* Will need this regardless.  */
> > +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> > +       VPTESTN %VMM(1), %VMM(1), %k0
> > +       KMOV    %k0, %VMASK_REG
> > +
> > +       cmpq    $(CHAR_PER_VEC * 2), %rdx
> > +       ja      L(more_2x_vec)
> > +
> > +L(last_2x_vec):
> > +       FIND_FIRST_ONE (VMASK_REG, VRCX)
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_vec_x1_len)
> > +
> > +       /* If there were no zero-CHARs (rcx was zero before
> > +          FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> > +       cmpl    $CHAR_PER_VEC, %ecx
> > +       jne     L(ret_vec_x1)
> > +
> > +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > +       VPTESTN %VMM(2), %VMM(2), %k0
> > +       KMOV    %k0, %VRCX
> > +       addl    $-CHAR_PER_VEC, %edx
> > +       bzhi    %VRDX, %VRCX, %VR8
> > +       jz      L(ret_vec_x2_len)
> > +L(ret_vec_x2):
> > +       bsf     %VRCX, %VRDX
> > +L(ret_vec_x2_len):
> > +       VMOVU   (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > +       movNULL $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> > +       VMOVU   %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +       ret
> > +
> > +       .p2align 4,, 4
> > +L(ret_vec_x1_len):
> > +       movl    %edx, %ecx
> > +L(ret_vec_x1):
> > +       VMOVU   (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +       movNULL $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
> > +       VMOVU   %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > +       VZEROUPPER_RETURN
> > +
> > +
> > +       .p2align 4,, 8
> > +L(last_4x_vec):
> > +       addl    $-(CHAR_PER_VEC * 4), %edx
> > +       VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
> > +       VPTESTN %VMM(1), %VMM(1), %k0
> > +       KMOV    %k0, %VMASK_REG
> > +       subq    $-(VEC_SIZE * 4), %rsi
> > +       subq    $-(VEC_SIZE * 4), %rdi
> > +       cmpl    $(CHAR_PER_VEC * 2), %edx
> > +       jbe     L(last_2x_vec)
> > +       .p2align 4,, 8
> > +L(more_2x_vec):
> > +# ifdef USE_AS_WCSCPY
> > +       xorl    %ecx, %ecx
> > +# endif
> > +       bsf     %VMASK_REG, %VRCX
> > +       jnz     L(ret_vec_x1)
> > +
> > +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > +       VPTESTN %VMM(2), %VMM(2), %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(ret_vec_x2)
> > +
> > +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > +       VPTESTN %VMM(3), %VMM(3), %k0
> > +       KMOV    %k0, %VMASK_REG
> > +
> > +       cmpq    $(CHAR_PER_VEC * 4), %rdx
> > +       ja      L(more_4x_vec)
> > +
> > +       /* Adjust length before going to L(ret_vec_x3_len) or
> > +          L(ret_vec_x3).  */
> > +       addl    $(CHAR_PER_VEC * -2), %edx
> > +
> > +       FIND_FIRST_ONE (VMASK_REG, VRCX)
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_vec_x3_len)
> > +
> > +       /* If there were no zero-CHARs (rcx was zero before
> > +          FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> > +       cmpl    $CHAR_PER_VEC, %ecx
> > +       jne     L(ret_vec_x3)
> > +
> > +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > +       VPTESTN %VMM(4), %VMM(4), %k0
> > +       KMOV    %k0, %VRCX
> > +       addl    $-CHAR_PER_VEC, %edx
> > +       bzhi    %VRDX, %VRCX, %VR8
> > +       jz      L(ret_vec_x4_len)
> > +L(ret_vec_x4):
> > +       bsf     %VRCX, %VRDX
> > +L(ret_vec_x4_len):
> > +       VMOVU   (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > +       movNULL $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
> > +       VMOVU   %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +       ret
> > +
> > +       .p2align 4,, 4
> > +L(ret_vec_x3_len):
> > +       movl    %edx, %ecx
> > +L(ret_vec_x3):
> > +       VMOVU   (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +       movNULL $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> > +       VMOVU   %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > +       ret
> > +
> > +       .p2align 4,, 8
> > +L(more_4x_vec):
> > +# ifdef USE_AS_WCSCPY
> > +       xorl    %ecx, %ecx
> > +# endif
> > +       bsf     %VMASK_REG, %VRCX
> > +       jnz     L(ret_vec_x3)
> > +
> > +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > +       VPTESTN %VMM(4), %VMM(4), %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(ret_vec_x4)
> > +
> > +       VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> > +
> > +       /* Check if we are near the end before aligning.  */
> > +       cmpq    $(CHAR_PER_VEC * 8), %rdx
> > +       jbe     L(last_4x_vec)
> > +
> > +
> > +       /* Add rsi to rdx (length) before aligning rsi. NB: Since we
> > +          filtered out huge lengths this cannot overflow.  */
> > +# ifdef USE_AS_WCSCPY
> > +       leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
> > +# else
> > +       addq    %rsi, %rdx
> > +# endif
> > +
> > +       /* Subtract rsi from rdi before aligning (add back will have
> > +          correct rdi for aligned rsi).  */
> > +       subq    %rsi, %rdi
> > +       subq    $-(VEC_SIZE * 5), %rsi
> > +       andq    $(VEC_SIZE * -4), %rsi
> > +
> > +       /* Load first half of the loop before entry.  */
> > +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +       VPTESTN %VMM(4), %VMM(4), %k2
> > +       VPTESTN %VMM(6), %VMM(6), %k4
> > +
> > +       /* Offset rsi by VEC_SIZE so that we can jump to
> > +          L(loop_last_4x_vec).  */
> > +       addq    $-(VEC_SIZE), %rsi
> > +       KORTEST %k2, %k4
> > +       jnz     L(loop_4x_done)
> > +
> > +       /* Store loop end in r9.  */
> > +       leaq    -(VEC_SIZE * 5)(%rdx), %r9
> > +
> > +       .p2align 4,, 11
> > +L(loop_4x_vec):
> > +       VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> > +
> > +       subq    $(VEC_SIZE * -4), %rsi
> > +       cmpq    %rsi, %r9
> > +       jbe     L(loop_last_4x_vec)
> > +
> > +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> > +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> > +       VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> > +
> > +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +       VPTESTN %VMM(4), %VMM(4), %k2
> > +       VPTESTN %VMM(6), %VMM(6), %k4
> > +       KORTEST %k2, %k4
> > +       jz      L(loop_4x_vec)
> > +
> > +L(loop_4x_done):
> > +       VPTESTN %VMM(0), %VMM(0), %k0
> > +       KMOV    %k0, %VRCX
> > +       /* Restore rdi (dst).  */
> > +       addq    %rsi, %rdi
> > +
> > +       /* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
> > +          test with bsf.  */
> > +       bsf     %VRCX, %VRCX
> > +       jnz     L(ret_vec_x1)
> > +       VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
> > +
> > +       KMOV    %k2, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(ret_vec_x2)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> > +
> > +       VPTESTN %VMM(2), %VMM(2), %k0
> > +       KMOV    %k0, %VRCX
> > +       bsf     %VRCX, %VRCX
> > +       jnz     L(ret_vec_x3)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> > +
> > +       KMOV    %k4, %VRCX
> > +       bsf     %VRCX, %VRCX
> > +       VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +       VMOVU   %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > +       ret
> > +
> > +
> > +       .p2align 4,, 4
> > +L(page_cross):
> > +       movq    %rsi, %r8
> > +       andq    $(VEC_SIZE * -1), %r8
> > +       VPCMPEQ (%r8), %VZERO, %k0
> > +
> > +# ifdef USE_AS_WCSCPY
> > +       KMOV    %k0, %VR9
> > +       shrl    $2, %ecx
> > +       andl    $(CHAR_PER_VEC - 1), %ecx
> > +       shrx    %VRCX, %VR9, %VRCX
> > +# else
> > +       KMOV    %k0, %VRCX
> > +       shrx    %VRSI, %VRCX, %VRCX
> > +# endif
> > +
> > +       subl    %esi, %r8d
> > +       andl    $(VEC_SIZE - 1), %r8d
> > +# ifdef USE_AS_WCSCPY
> > +       shrl    $2, %r8d
> > +# endif
> > +       cmpq    %r8, %rdx
> > +       jbe     L(page_cross_small)
> > +       /* Optimizing more for space as this is very cold code. This
> > +          saves 2x cache lines.  */
> > +
> > +       /* This adds one to the later result, which gives the correct
> > +          copy bounds. NB: this can never zero-out a non-zero RCX because
> > +          to be in the page cross case rsi cannot be aligned and we
> > +          already right-shift rcx by the misalignment.  */
> > +       shl     %VRCX
> > +       jz      L(page_cross_continue)
> > +       bsf     %VRCX, %VRCX
> > +       REP_MOVS
> > +       ret
> > +
> > +L(page_cross_small):
> > +       tzcnt   %VRCX, %VRCX
> > +       jz      L(page_cross_setz)
> > +       cmpl    %edx, %ecx
> > +       cmova   %edx, %ecx
> > +
> > +# ifdef USE_AS_WCSCPY
> > +       rep     movsd
> > +# else
> > +       rep     movsb
> > +# endif
> > +L(page_cross_setz):
> > +       movNULL $0, (%rdi)
> > +       ret
> > +END(STRNCAT)
> > +#endif
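The entry sequence above folds the zero-length check and the absurd-length check into one test: the wide-character variant computes (n - 1) >> 56, which is non-zero when n == 0 (the subtraction wraps) or when n cannot possibly fit in the supported address space, and the byte variant gets the same effect from a signed jle after test. Anything caught this way either returns immediately or is handed to OVERFLOW_STRCAT. Roughly, in C (the helper is illustrative, not part of the patch):

    #include <stdbool.h>
    #include <stdint.h>

    /* Sketch of the length filter at the top of STRNCAT.  'wide'
       selects the wcsncat-style check.  */
    static bool
    len_is_zero_or_huge (uint64_t n, bool wide)
    {
      if (wide)
        /* Non-zero iff n == 0 (wraps to UINT64_MAX) or n - 1 has a bit
           set at or above bit 56.  */
        return ((n - 1) >> 56) != 0;
      /* Byte variant: signed test catches n == 0 and n >= 2^63.  */
      return (int64_t) n <= 0;
    }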
> > diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
> > index 1b3426d511..49eaf4cbd9 100644
> > --- a/sysdeps/x86_64/multiarch/strncpy-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
> > @@ -1,7 +1,990 @@
> > -#ifndef STRNCPY
> > -# define STRNCPY       __strncpy_evex
> > -#endif
> > +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if ISA_SHOULD_BUILD (4)
> > +
> > +       /* Use evex-masked stores for small sizes. Turned off at the
> > +          moment.  */
> > +# define USE_EVEX_MASKED_STORE 0
> > +
> > +
> > +# include <sysdep.h>
> > +# ifndef VEC_SIZE
> > +#  include "x86-evex256-vecs.h"
> > +# endif
> > +
> > +
> > +# ifndef STRNCPY
> > +#  define STRNCPY      __strncpy_evex
> > +# endif
> > +
> > +# ifdef USE_AS_WCSCPY
> > +#  define VMOVU_MASK   vmovdqu32
> > +#  define VPCMPEQ      vpcmpeqd
> > +#  define VPMIN        vpminud
> > +#  define VPTESTN      vptestnmd
> > +#  define VPTEST       vptestmd
> > +#  define CHAR_SIZE    4
> > +
> > +#  define REP_MOVS     rep movsd
> > +#  define REP_STOS     rep stosl
> > +
> > +#  define USE_WIDE_CHAR
> > +
> > +# else
> > +#  define VMOVU_MASK   vmovdqu8
> > +#  define VPCMPEQ      vpcmpeqb
> > +#  define VPMIN        vpminub
> > +#  define VPTESTN      vptestnmb
> > +#  define VPTEST       vptestmb
> > +#  define CHAR_SIZE    1
> > +
> > +#  define REP_MOVS     rep movsb
> > +#  define REP_STOS     rep stosb
> > +# endif
> > +
> > +# include "strncpy-or-cat-overflow-def.h"
> > +
> > +# define PAGE_SIZE     4096
> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > +
> > +# include "reg-macros.h"
> > +
> > +
> > +# define VZERO VMM(7)
> > +# define VZERO_256     VMM_256(7)
> > +# define VZERO_128     VMM_128(7)
> > +
> > +# if VEC_SIZE == 64
> > +#  define VZERO_HALF   VZERO_256
> > +# else
> > +#  define VZERO_HALF   VZERO_128
> > +# endif
> > +
> > +       .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRNCPY)
> > +       /* Filter zero length strings and very long strings.  Zero
> > +          length strings just return; very long strings are handled by
> > +          running rep stos{b|l} to zero-set the buffer (which will almost
> > +          certainly segfault) and, if that somehow succeeds, calling
> > +          OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy) to finish.  */
> > +# ifdef USE_AS_WCSCPY
> > +       decq    %rdx
> > +       movq    %rdx, %rax
> > +       /* 56 is end of max supported address space.  */
> > +       shr     $56, %rax
> > +       jnz     L(zero_len)
> > +# else
> > +       decq    %rdx
> > +       /* If the flag needs to become `jb` replace `dec` with `sub`.
> > +        */
> > +       jl      L(zero_len)
> > +# endif
> > +
> > +       vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> > +       movl    %esi, %eax
> > +       andl    $(PAGE_SIZE - 1), %eax
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +       ja      L(page_cross)
> > +
> > +L(page_cross_continue):
> > +       VMOVU   (%rsi), %VMM(0)
> > +       VPTESTN %VMM(0), %VMM(0), %k0
> > +       KMOV    %k0, %VRCX
> > +
> > +       /* If no STPCPY just save end ahead of time.  */
> > +# ifndef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +# endif
> > +
> > +
> > +       cmpq    $(CHAR_PER_VEC), %rdx
> > +
> > +       /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> > +          <= CHAR_PER_VEC with masked instructions (which have
> > +          potential for dramatically bad perf if dst splits a page and
> > +          is not in the TLB).  */
> > +# if USE_EVEX_MASKED_STORE
> > +       /* `jae` because length rdx is now length - 1.  */
> > +       jae     L(more_1x_vec)
> > +
> > +       /* If there were multiple zero-CHAR matches in the first VEC,
> > +          VRCX will be overset but that's fine since any oversets were
> > +          at zero-positions anyway.  */
> > +
> > +#  ifdef USE_AS_STPCPY
> > +       tzcnt   %VRCX, %VRAX
> > +       cmpl    %eax, %edx
> > +       cmovb   %edx, %eax
> > +#   ifdef USE_AS_WCSCPY
> > +       adcl    $0, %eax
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +#   else
> > +       adcq    %rdi, %rax
> > +#   endif
> > +#  endif
> > +       dec     %VRCX
> > +
> > +       /* Zero out all non-zero CHAR's after the first zero match.  */
> > +       KMOV    %VRCX, %k1
> > +
> > +       /* Use VZERO as destination so this can be reused for
> > +          L(zfill_less_vec) (which, if jumped to by subsequent logic,
> > +          will have zeroed out VZERO).  */
> > +       VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
> > +L(zfill_less_vec):
> > +       /* Get mask for what we need to set.  */
> > +       incl    %edx
> > +       mov     $-1, %VRCX
> > +       bzhi    %VRDX, %VRCX, %VRCX
> > +       KMOV    %VRCX, %k1
> > +       VMOVU_MASK %VZERO, (%rdi){%k1}
> > +       ret
> > +
> > +       .p2align 4,, 4
> > +L(zero_len):
> > +       cmpq    $-1, %rdx
> > +       jne     L(best_effort_strncpy)
> > +       movq    %rdi, %rax
> > +       ret
> > +
> > +       .p2align 4,, 8
> > +L(more_1x_vec):
> > +# else
> > +       /* `jb` because length rdx is now length - 1.  */
> > +       jb      L(less_1x_vec)
> > +# endif
> > +
> > +
> > +       /* This may overset but that's fine because we still need to zero
> > +          fill.  */
> > +       VMOVU   %VMM(0), (%rdi)
> > +
> > +
> > +       /* Length must be >= CHAR_PER_VEC so match here means we must
> > +          zero-fill.  */
> > +       test    %VRCX, %VRCX
> > +       jnz     L(zfill)
> > +
> > +
> > +       /* We are going to align rsi here so will need to be able to re-
> > +          adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> > +          so rsi + rdx * CHAR_SIZE cannot overflow.  */
> > +       leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> > +       subq    %rsi, %rdi
> > +       andq    $-(VEC_SIZE), %rsi
> > +
> > +L(loop_last_4x_vec):
> > +       addq    %rsi, %rdi
> > +       subq    %rsi, %rdx
> > +# ifdef USE_AS_WCSCPY
> > +       shrq    $2, %rdx
> > +# endif
> > +
> > +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> > +       VPTESTN %VMM(1), %VMM(1), %k0
> > +       KMOV    %k0, %VRCX
> > +
> > +       /* -1 because of the `dec %rdx` earlier.  */
> > +       cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
> > +       ja      L(more_2x_vec)
> > +
> > +L(last_2x_vec):
> > +       /* This will need to be computed no matter what. We do it
> > +          ahead of time for CHAR_PER_VEC == 64 because we can't adjust
> > +          the value of `tzcnt` with a shift.  */
> > +# if CHAR_PER_VEC == 64
> > +       tzcntq  %rcx, %rcx
> > +# endif
> > +
> > +       cmpl    $(CHAR_PER_VEC), %edx
> > +       jb      L(ret_vec_x1_len)
> > +
> > +       /* Separate logic for CHAR_PER_VEC == 64 because we already did
> > +          `tzcnt` on VRCX.  */
> > +# if CHAR_PER_VEC == 64
> > +       /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
> > +       cmpb    $CHAR_PER_VEC, %cl
> > +       jnz     L(ret_vec_x1_no_bsf)
> > +# else
> > +       test    %VRCX, %VRCX
> > +       jnz     L(ret_vec_x1)
> > +# endif
> > +
> > +
> > +
> > +       VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > +       KMOV    %k0, %VRCX
> > +
> > +# if CHAR_PER_VEC < 64
> > +       /* This essentially adds CHAR_PER_VEC to the computed result.  */
> > +       shlq    $CHAR_PER_VEC, %rcx
> > +# else
> > +       tzcntq  %rcx, %rcx
> > +       addl    $CHAR_PER_VEC, %ecx
> > +# endif
> > +
> > +       .p2align 4,, 4
> > +L(ret_vec_x1_len):
> > +       /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
> > +          already been done.  */
> > +# if CHAR_PER_VEC < 64
> > +       tzcntq  %rcx, %rcx
> > +# endif
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_vec_x1_len_no_zfill)
> > +       /* Fall through (expectation) is copy len < buffer len.  */
> > +       VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +L(ret_vec_x1_len_no_zfill_mov):
> > +       movl    %ecx, %edx
> > +# ifdef USE_AS_STPCPY
> > +       /* clear flags.  */
> > +       xorl    %ecx, %ecx
> > +# endif
> > +L(ret_vec_x1_len_no_zfill):
> > +       VMOVU   ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > +       VMOVU   %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +# ifdef USE_AS_STPCPY
> > +#  ifdef USE_AS_WCSCPY
> > +       adcq    $0, %rdx
> > +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
> > +#  else
> > +       leal    (VEC_SIZE)(%rdx), %eax
> > +       adcq    %rdi, %rax
> > +#  endif
> > +# endif
> > +       ret
> > +
> > +
> > +       .p2align 4,, 10
> > +L(ret_vec_x1):
> > +       bsf     %VRCX, %VRCX
> > +L(ret_vec_x1_no_bsf):
> > +       VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +       subl    %ecx, %edx
> > +       cmpl    $CHAR_PER_VEC, %edx
> > +       jb      L(ret_vec_x1_len_no_zfill_mov)
> > +       /* Fall through (expectation) is copy len < buffer len.  */
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > +       VMOVU   %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
> > +# ifdef USE_AS_STPCPY
> > +       leaq    (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
> > +# endif
> > +       ret
> > +
> > +       .p2align 4,, 8
> > +L(last_4x_vec):
> > +       /* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
> > +          $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
> > +          using `movzbl`.  */
> > +# if CHAR_PER_VEC == 64
> > +       movzbl  %dl, %edx
> > +# else
> > +       andl    $(CHAR_PER_VEC * 4 - 1), %edx
> > +# endif
> > +       VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
> > +       VPTESTN %VMM(1), %VMM(1), %k0
> > +       KMOV    %k0, %VRCX
> > +       subq    $-(VEC_SIZE * 4), %rsi
> > +       subq    $-(VEC_SIZE * 4), %rdi
> > +       cmpl    $(CHAR_PER_VEC * 2 - 1), %edx
> > +       jbe     L(last_2x_vec)
> > +       .p2align 4,, 8
> > +L(more_2x_vec):
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > +       test    %VRCX, %VRCX
> > +       /* Must fill at least 2x VEC.  */
> > +       jnz     L(zfill_vec1)
> > +
> > +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > +       VPTESTN %VMM(2), %VMM(2), %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       /* Must fill at least 1x VEC.  */
> > +       jnz     L(zfill_vec2)
> > +
> > +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> > +       VPTESTN %VMM(3), %VMM(3), %k0
> > +       KMOV    %k0, %VRCX
> > +
> > +       /* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
> > +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rdx
> > +       ja      L(more_4x_vec)
> > +
> > +       subl    $(CHAR_PER_VEC * 3), %edx
> > +       jb      L(ret_vec_x3_len)
> > +
> > +       test    %VRCX, %VRCX
> > +       jnz     L(ret_vec_x3)
> > +
> > +       VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
> > +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > +       KMOV    %k0, %VRCX
> > +       tzcnt   %VRCX, %VRCX
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_vec_x4_len_no_zfill)
> > +       /* Fall through (expectation) is copy len < buffer len.  */
> > +       VMOVU   %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +       movl    %ecx, %edx
> > +L(ret_vec_x4_len_no_zfill):
> > +       VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > +       VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +# ifdef USE_AS_STPCPY
> > +#  ifdef USE_AS_WCSCPY
> > +       adcq    $0, %rdx
> > +       leaq    (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
> > +#  else
> > +       leal    (VEC_SIZE * 4 + 0)(%rdx), %eax
> > +       adcq    %rdi, %rax
> > +#  endif
> > +# endif
> > +       ret
> > +
> > +
> > +L(ret_vec_x3_len):
> > +       addl    $(CHAR_PER_VEC * 1), %edx
> > +       tzcnt   %VRCX, %VRCX
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_vec_x3_len_no_zfill)
> > +       /* Fall through (expectation) is copy len < buffer len.  */
> > +       VMOVU   %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +L(ret_vec_x3_len_no_zfill_mov):
> > +       movl    %ecx, %edx
> > +# ifdef USE_AS_STPCPY
> > +       /* clear flags.  */
> > +       xorl    %ecx, %ecx
> > +# endif
> > +       .p2align 4,, 4
> > +L(ret_vec_x3_len_no_zfill):
> > +       VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > +       VMOVU   %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +# ifdef USE_AS_STPCPY
> > +#  ifdef USE_AS_WCSCPY
> > +       adcq    $0, %rdx
> > +       leaq    (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
> > +#  else
> > +       leal    (VEC_SIZE * 3 + 0)(%rdx), %eax
> > +       adcq    %rdi, %rax
> > +#  endif
> > +# endif
> > +       ret
> > +
> > +
> > +       .p2align 4,, 8
> > +L(ret_vec_x3):
> > +       bsf     %VRCX, %VRCX
> > +       VMOVU   %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
> > +       subl    %ecx, %edx
> > +       jl      L(ret_vec_x3_len_no_zfill_mov)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > +       VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> > +# ifdef USE_AS_STPCPY
> > +       leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> > +# endif
> > +       ret
> > +
> > +       .p2align 4,, 8
> > +L(more_4x_vec):
> > +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > +       test    %VRCX, %VRCX
> > +       jnz     L(zfill_vec3)
> > +
> > +       VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > +       VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> > +       VPTESTN %VMM(4), %VMM(4), %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(zfill_vec4)
> >
> > -#define USE_AS_STRNCPY
> > -#define STRCPY STRNCPY
> > -#include "strcpy-evex.S"
> > +       /* Recheck length before aligning.  */
> > +       cmpq    $(CHAR_PER_VEC * 8 - 1), %rdx
> > +       jbe     L(last_4x_vec)
> > +
> > +       /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
> > +# ifdef USE_AS_WCSCPY
> > +       leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
> > +# else
> > +       addq    %rsi, %rdx
> > +# endif
> > +       subq    %rsi, %rdi
> > +       subq    $-(VEC_SIZE * 5), %rsi
> > +       andq    $(VEC_SIZE * -4), %rsi
> > +
> > +
> > +       /* Load first half of the loop before entry.  */
> > +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +       VPTESTN %VMM(4), %VMM(4), %k2
> > +       VPTESTN %VMM(6), %VMM(6), %k4
> > +
> > +
> > +       /* Offset rsi by VEC_SIZE so that we can jump to
> > +          L(loop_last_4x_vec).  */
> > +       addq    $-(VEC_SIZE), %rsi
> > +       KORTEST %k2, %k4
> > +       jnz     L(loop_4x_done)
> > +
> > +       /* Store loop end in r9.  */
> > +       leaq    -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
> > +
> > +       .p2align 4,, 11
> > +L(loop_4x_vec):
> > +       VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> > +
> > +       subq    $(VEC_SIZE * -4), %rsi
> > +       cmpq    %rsi, %r9
> > +       jbe     L(loop_last_4x_vec)
> > +
> > +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> > +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> > +       VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> > +
> > +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +       VPTESTN %VMM(4), %VMM(4), %k2
> > +       VPTESTN %VMM(6), %VMM(6), %k4
> > +       KORTEST %k2, %k4
> > +       jz      L(loop_4x_vec)
> > +
> > +L(loop_4x_done):
> > +       /* Restore rdx (length).  */
> > +       subq    %rsi, %rdx
> > +# ifdef USE_AS_WCSCPY
> > +       shrq    $2, %rdx
> > +# endif
> > +       VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > +       /* Restore rdi (dst).  */
> > +       addq    %rsi, %rdi
> > +       VPTESTN %VMM(0), %VMM(0), %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(zfill_vec1)
> > +
> > +       VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> > +       KMOV    %k2, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(zfill_vec2)
> > +
> > +       VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> > +       VPTESTN %VMM(2), %VMM(2), %k0
> > +       KMOV    %k0, %VRCX
> > +       test    %VRCX, %VRCX
> > +       jnz     L(zfill_vec3)
> > +
> > +       VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
> > +       KMOV    %k4, %VRCX
> > +       // Zfill more....
> > +
> > +       .p2align 4,, 4
> > +L(zfill_vec4):
> > +       subq    $(VEC_SIZE * -2), %rdi
> > +       addq    $(CHAR_PER_VEC * -2), %rdx
> > +L(zfill_vec2):
> > +       subq    $(VEC_SIZE * -2), %rdi
> > +       addq    $(CHAR_PER_VEC * -1), %rdx
> > +L(zfill):
> > +       /* VRCX must be non-zero.  */
> > +       bsf     %VRCX, %VRCX
> > +
> > +       /* Adjust length / dst for zfill.  */
> > +       subq    %rcx, %rdx
> > +# ifdef USE_AS_WCSCPY
> > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > +# else
> > +       addq    %rcx, %rdi
> > +# endif
> > +# ifdef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +# endif
> > +L(zfill_from_page_cross):
> > +
> > +       /* From here on out it's just memset(rdi, 0, rdx).  */
> > +       cmpq    $CHAR_PER_VEC, %rdx
> > +       jb      L(zfill_less_vec)
> > +
> > +L(zfill_more_1x_vec):
> > +       VMOVU   %VZERO, (%rdi)
> > +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +       cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
> > +       ja      L(zfill_more_2x_vec)
> > +L(zfill_done0):
> > +       ret
> > +
> > +       /* Coming from vec1/vec2 we must be able to zfill at least 2x
> > +          VEC.  */
> > +       .p2align 4,, 8
> > +L(zfill_vec3):
> > +       subq    $(VEC_SIZE * -2), %rdi
> > +       addq    $(CHAR_PER_VEC * -2), %rdx
> > +       .p2align 4,, 2
> > +L(zfill_vec1):
> > +       bsfq    %rcx, %rcx
> > +       /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
> > +        */
> > +       leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> > +       subq    %rcx, %rdx
> > +# ifdef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +# endif
> > +
> > +
> > +       VMOVU   %VZERO, (%rdi)
> > +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +       cmpq    $(CHAR_PER_VEC * 2), %rdx
> > +       jb      L(zfill_done0)
> > +L(zfill_more_2x_vec):
> > +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> > +       VMOVU   %VZERO, (VEC_SIZE)(%rdi)
> > +       subq    $(CHAR_PER_VEC * 4 - 1), %rdx
> > +       jbe     L(zfill_done)
> > +
> > +# ifdef USE_AS_WCSCPY
> > +       leaq    (%rdi, %rdx, CHAR_SIZE), %rdx
> > +# else
> > +       addq    %rdi, %rdx
> > +# endif
> > +
> > +       VMOVU   %VZERO, (VEC_SIZE * 2)(%rdi)
> > +       VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi)
> > +
> > +
> > +       VMOVU   %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> > +       VMOVU   %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> > +
> > +       subq    $-(VEC_SIZE * 4), %rdi
> > +       cmpq    %rdi, %rdx
> > +       jbe     L(zfill_done)
> > +
> > +       /* Align rdi and zfill loop.  */
> > +       andq    $-(VEC_SIZE), %rdi
> > +       .p2align 4,, 12
> > +L(zfill_loop_4x_vec):
> > +       VMOVA   %VZERO, (VEC_SIZE * 0)(%rdi)
> > +       VMOVA   %VZERO, (VEC_SIZE * 1)(%rdi)
> > +       VMOVA   %VZERO, (VEC_SIZE * 2)(%rdi)
> > +       VMOVA   %VZERO, (VEC_SIZE * 3)(%rdi)
> > +       subq    $-(VEC_SIZE * 4), %rdi
> > +       cmpq    %rdi, %rdx
> > +       ja      L(zfill_loop_4x_vec)
> > +L(zfill_done):
> > +       ret
> > +
> > +
> > +       /* Less 1x VEC case if we are not using evex masked store.  */
> > +# if !USE_EVEX_MASKED_STORE
> > +       .p2align 4,, 8
> > +L(copy_1x):
> > +       /* Special case for copy 1x. It can be handled quickly and many
> > +          buffer sizes have convenient alignment.  */
> > +       VMOVU   %VMM(0), (%rdi)
> > +       /* If no zeros then we are done.  */
> > +       testl   %ecx, %ecx
> > +       jz      L(ret_1x_1x)
> > +
> > +       /* Need to zfill. Note we know that length <= CHAR_PER_VEC so we
> > +          only handle the small case here.  */
> > +       bsf     %VRCX, %VRCX
> > +L(zfill_less_vec_no_bsf):
> > +       /* Adjust length / dst then just zfill less_vec.  */
> > +       subq    %rcx, %rdx
> > +#  ifdef USE_AS_WCSCPY
> > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > +#  else
> > +       addq    %rcx, %rdi
> > +#  endif
> > +#  ifdef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +#  endif
> > +
> > +L(zfill_less_vec):
> > +       cmpl    $((VEC_SIZE / 2) / CHAR_SIZE), %edx
> > +       jb      L(zfill_less_half)
> > +
> > +       VMOVU   %VZERO_HALF, (%rdi)
> > +       VMOVU   %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +       ret
> > +#  ifdef USE_AS_STPCPY
> > +L(ret_1x_1x):
> > +       leaq    CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
> > +       ret
> > +#  endif
> > +
> > +
> > +#  if VEC_SIZE == 64
> > +       .p2align 4,, 4
> > +L(copy_32_63):
> > +       /* Overfill to avoid branches.  */
> > +       VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > +       VMOVU   %VMM_256(0), (%rdi)
> > +       VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +
> > +       /* We are taking advantage of the fact that to be here we must
> > +          be writing the null-term at (%rdi, %rcx), so we have a byte of
> > +          leeway for overwriting.  */
> > +       cmpl    %ecx, %edx
> > +       ja      L(zfill_less_vec_no_bsf)
> > +#   ifndef USE_AS_STPCPY
> > +L(ret_1x_1x):
> > +#   else
> > +#    ifdef USE_AS_WCSCPY
> > +       adcq    $0, %rdx
> > +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#    else
> > +       movl    %edx, %eax
> > +       adcq    %rdi, %rax
> > +#    endif
> > +#   endif
> > +       ret
> > +#  endif
> > +
> > +       .p2align 4,, 4
> > +L(copy_16_31):
> > +       /* Overfill to avoid branches.  */
> > +       vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > +       VMOVU   %VMM_128(0), (%rdi)
> > +       vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +       cmpl    %ecx, %edx
> > +
> > +       /* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
> > +          we have a larger copy block for 32-63 so this just falls
> > +          through to zfill 16-31. If VEC_SIZE == 32 then we check for
> > +          a full zfill of less than 1x VEC.  */
> > +#  if VEC_SIZE == 64
> > +       jbe     L(ret_16_31)
> > +       subl    %ecx, %edx
> > +#   ifdef USE_AS_WCSCPY
> > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > +#   else
> > +       addq    %rcx, %rdi
> > +#   endif
> > +#   ifdef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +#   endif
> > +L(zfill_less_half):
> > +L(zfill_less_32):
> > +       cmpl    $(16 / CHAR_SIZE), %edx
> > +       jb      L(zfill_less_16)
> > +       VMOVU   %VZERO_128, (%rdi)
> > +       VMOVU   %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +#   ifdef USE_AS_STPCPY
> > +       ret
> > +#   endif
> > +L(ret_16_31):
> > +#   ifdef USE_AS_STPCPY
> > +#    ifdef USE_AS_WCSCPY
> > +       adcq    $0, %rdx
> > +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#    else
> > +       movl    %edx, %eax
> > +       adcq    %rdi, %rax
> > +#    endif
> > +#   endif
> > +       ret
> > +#  else
> > +       /* VEC_SIZE == 32 begins.  */
> > +       ja      L(zfill_less_vec_no_bsf)
> > +#   ifndef USE_AS_STPCPY
> > +L(ret_1x_1x):
> > +#   else
> > +#    ifdef USE_AS_WCSCPY
> > +       adcq    $0, %rdx
> > +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#    else
> > +       movl    %edx, %eax
> > +       adcq    %rdi, %rax
> > +#    endif
> > +#   endif
> > +       ret
> > +#  endif
> > +
> > +
> > +       .p2align 4,, 4
> > +L(copy_8_15):
> > +       /* Overfill to avoid branches.  */
> > +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> > +       vmovq   %VMM_128(0), (%rdi)
> > +       movq    %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_8_15)
> > +       subl    %ecx, %edx
> > +#  ifdef USE_AS_WCSCPY
> > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > +#  else
> > +       addq    %rcx, %rdi
> > +#  endif
> > +#  ifdef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +#  endif
> > +       .p2align 4,, 8
> > +#  if VEC_SIZE == 32
> > +L(zfill_less_half):
> > +#  endif
> > +L(zfill_less_16):
> > +       xorl    %ecx, %ecx
> > +       cmpl    $(8 / CHAR_SIZE), %edx
> > +       jb      L(zfill_less_8)
> > +       movq    %rcx, (%rdi)
> > +       movq    %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +#  ifndef USE_AS_STPCPY
> > +L(ret_8_15):
> > +#  endif
> > +       ret
> > +
> > +       .p2align 4,, 8
> > +L(less_1x_vec):
> > +       je      L(copy_1x)
> > +
> > +       /* We will need `tzcnt` result for all other copy sizes.  */
> > +       tzcnt   %VRCX, %VRCX
> > +#  if VEC_SIZE == 64
> > +       cmpl    $(32 / CHAR_SIZE), %edx
> > +       jae     L(copy_32_63)
> > +#  endif
> > +
> > +       cmpl    $(16 / CHAR_SIZE), %edx
> > +       jae     L(copy_16_31)
> > +
> > +       cmpl    $(8 / CHAR_SIZE), %edx
> > +       jae     L(copy_8_15)
> > +#  ifdef USE_AS_WCSCPY
> > +       testl   %ecx, %ecx
> > +       jz      L(zfill_less_8_set_ret)
> > +
> > +       movl    (%rsi, %rdx, CHAR_SIZE), %esi
> > +       vmovd   %VMM_128(0), (%rdi)
> > +       movl    %esi, (%rdi, %rdx, CHAR_SIZE)
> > +#   ifdef USE_AS_STPCPY
> > +       cmpl    %ecx, %edx
> > +L(ret_8_15):
> > +       adcq    $0, %rdx
> > +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#   endif
> > +       ret
> > +L(zfill_less_8_set_ret):
> > +       xorl    %ecx, %ecx
> > +#   ifdef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +#   endif
> > +L(zfill_less_8):
> > +       movl    %ecx, (%rdi)
> > +       movl    %ecx, (%rdi, %rdx, CHAR_SIZE)
> > +       ret
> > +#  else
> > +       cmpl    $3, %edx
> > +       jb      L(copy_0_3)
> > +       /* Overfill to avoid branches.  */
> > +       movl    -3(%rsi, %rdx), %esi
> > +       vmovd   %VMM_128(0), (%rdi)
> > +       movl    %esi, -3(%rdi, %rdx)
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_4_7)
> > +       subq    %rcx, %rdx
> > +       addq    %rcx, %rdi
> > +#   ifdef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +#   endif
> > +       xorl    %ecx, %ecx
> > +       .p2align 4,, 8
> > +L(zfill_less_8):
> > +       cmpl    $3, %edx
> > +       jb      L(zfill_less_3)
> > +       movl    %ecx, (%rdi)
> > +       movl    %ecx, -3(%rdi, %rdx)
> > +#   ifdef USE_AS_STPCPY
> > +       ret
> > +#   endif
> > +
> > +L(ret_4_7):
> > +#   ifdef USE_AS_STPCPY
> > +L(ret_8_15):
> > +       movl    %edx, %eax
> > +       adcq    %rdi, %rax
> > +#   endif
> > +       ret
> > +
> > +       .p2align 4,, 4
> > +L(zfill_less_3):
> > +       testl   %edx, %edx
> > +       jz      L(zfill_1)
> > +       movw    %cx, (%rdi)
> > +L(zfill_1):
> > +       movb    %cl, (%rdi, %rdx)
> > +       ret
> > +
> > +       .p2align 4,, 8
> > +L(copy_0_3):
> > +       vmovd   %VMM_128(0), %r8d
> > +       testl   %edx, %edx
> > +       jz      L(copy_1)
> > +       movw    %r8w, (%rdi)
> > +       cmpl    %ecx, %edx
> > +       ja      L(zfill_from_1)
> > +       movzbl  (%rsi, %rdx), %r8d
> > +#   ifdef USE_AS_STPCPY
> > +       movl    %edx, %eax
> > +       adcq    %rdi, %rax
> > +       movb    %r8b, (%rdi, %rdx)
> > +       ret
> > +#   endif
> > +
> > +L(copy_1):
> > +#   ifdef USE_AS_STPCPY
> > +       movl    %edx, %eax
> > +       cmpl    %ecx, %edx
> > +       adcq    %rdi, %rax
> > +#   endif
> > +#   ifdef USE_AS_WCSCPY
> > +       vmovd   %VMM_128(0), (%rdi)
> > +#   else
> > +       movb    %r8b, (%rdi, %rdx)
> > +#   endif
> > +       ret
> > +#  endif
> > +
> > +
> > +#  ifndef USE_AS_WCSCPY
> > +       .p2align 4,, 8
> > +L(zfill_from_1):
> > +#   ifdef USE_AS_STPCPY
> > +       leaq    (%rdi, %rcx), %rax
> > +#   endif
> > +       movw    $0, -1(%rdi, %rdx)
> > +       ret
> > +#  endif
> > +
> > +       .p2align 4,, 4
> > +L(zero_len):
> > +       incq    %rdx
> > +       jne     L(best_effort_strncpy)
> > +       movq    %rdi, %rax
> > +       ret
> > +# endif
> > +
> > +
> > +       .p2align 4,, 4
> > +       .p2align 6,, 8
> > +L(page_cross):
> > +       movq    %rsi, %rax
> > +       andq    $(VEC_SIZE * -1), %rax
> > +       VPCMPEQ (%rax), %VZERO, %k0
> > +       KMOV    %k0, %VRCX
> > +# ifdef USE_AS_WCSCPY
> > +       movl    %esi, %r8d
> > +       shrl    $2, %r8d
> > +       andl    $(CHAR_PER_VEC - 1), %r8d
> > +       shrx    %VR8, %VRCX, %VRCX
> > +# else
> > +       shrx    %VRSI, %VRCX, %VRCX
> > +# endif
> > +
> > +       /* Compute the number of bytes we checked.  */
> > +       subl    %esi, %eax
> > +       andl    $(VEC_SIZE - 1), %eax
> > +# ifdef USE_AS_WCSCPY
> > +       shrl    $2, %eax
> > +# endif
> > +
> > +       /* If rax > rdx then we are finishing the copy at the end of the
> > +          page.  */
> > +       cmpq    %rax, %rdx
> > +       jb      L(page_cross_small)
> > +
> > +
> > +       /* If rcx is non-zero then continue.  */
> > +       test    %VRCX, %VRCX
> > +       jz      L(page_cross_continue)
> > +
> > +       /* We found zero-CHAR so need to copy then zfill (we know we
> > +          didn't cover all of length here).  */
> > +       bsf     %VRCX, %VRCX
> > +L(movsb_and_zfill):
> > +       incl    %ecx
> > +       subq    %rcx, %rdx
> > +# ifdef USE_AS_STPCPY
> > +       leaq    -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
> > +# else
> > +       movq    %rdi, %rax
> > +# endif
> > +
> > +       REP_MOVS
> > +# ifdef USE_AS_WCSCPY
> > +       movl    $0, (%rdi)
> > +# else
> > +       movb    $0, (%rdi)
> > +# endif
> > +       jmp     L(zfill_from_page_cross)
> > +
> > +L(page_cross_small):
> > +       tzcnt   %VRCX, %VRCX
> > +       cmpl    %ecx, %edx
> > +       jbe     L(page_cross_copy_only)
> > +
> > +       /* Do a zfill of the tail before copying.  */
> > +       movq    %rdi, %r9
> > +       xorl    %eax, %eax
> > +
> > +       movl    %ecx, %r8d
> > +
> > +       subl    %ecx, %edx
> > +       leaq    CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> > +       movl    %edx, %ecx
> > +       REP_STOS
> > +       movq    %r9, %rdi
> > +       movl    %r8d, %edx
> > +L(page_cross_copy_only):
> > +       leal    1(%rdx), %ecx
> > +# ifdef USE_AS_STPCPY
> > +#  ifdef USE_AS_WCSCPY
> > +       adcl    $0, %edx
> > +       leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#  else
> > +       movl    %edx, %eax
> > +       adcq    %rdi, %rax
> > +#  endif
> > +# else
> > +       movq    %rdi, %rax
> > +# endif
> > +       REP_MOVS
> > +       ret
> > +
> > +
> > +L(best_effort_strncpy):
> > +       movq    %rdx, %rcx
> > +       xorl    %eax, %eax
> > +       movq    %rdi, %r8
> > +       /* The length is >= 2^63. We very much expect to segfault at
> > +          rep stos. If that doesn't happen then just strcpy to finish.
> > +        */
> > +       REP_STOS
> > +       movq    %r8, %rdi
> > +       jmp     OVERFLOW_STRCPY
> > +END(STRNCPY)
> > +#endif
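All of the L(zfill*) paths above implement the padding half of strncpy's contract: once the terminating null is found at some offset pos < n, the remaining n - pos destination characters are zero-filled, exactly as the "From here on out it's just memset(rdi, 0, rdx)" comment says. As a semantic reference (not the optimized algorithm), the byte variant of what this file computes is:

    #include <stddef.h>
    #include <string.h>

    /* Reference behaviour for the code in strncpy-evex.S.  */
    static char *
    strncpy_ref (char *dst, const char *src, size_t n)
    {
      size_t pos = 0;
      while (pos < n && src[pos] != '\0')  /* copy up to the null or n */
        {
          dst[pos] = src[pos];
          pos++;
        }
      memset (dst + pos, 0, n - pos);      /* zero-fill the remainder */
      return dst;                          /* stpncpy returns dst + pos */
    }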
> > diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> > new file mode 100644
> > index 0000000000..d5ff4cbe50
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> > @@ -0,0 +1,65 @@
>
> Copyright notice is missing.
>
> > +#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
> > +#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
> > +
> > +#if defined USE_MULTIARCH && IS_IN(libc)
> > +#  define UNDERSCORES __
> > +#  ifdef USE_WITH_SSE2
> > +#    define ISA_EXT _sse2
> > +#  elif defined USE_WITH_AVX
> > +#    ifdef USE_WITH_RTM
> > +#      define ISA_EXT _avx_rtm
> > +#    else
> > +#      define ISA_EXT _avx
> > +#    endif
> > +#  elif defined USE_WITH_AVX2
> > +#    ifdef USE_WITH_RTM
> > +#      define ISA_EXT _avx2_rtm
> > +#    else
> > +#      define ISA_EXT _avx2
> > +#    endif
> > +
> > +#  elif defined USE_WITH_EVEX256
> > +#    define ISA_EXT _evex
> > +#  elif defined USE_WITH_EVEX512
> > +#    define ISA_EXT _evex512
> > +#  endif
> > +#else
> > +#  define UNDERSCORES
> > +#  define ISA_EXT
> > +#endif
> > +
> > +#ifdef USE_AS_WCSCPY
> > +#  define STRCPY_PREFIX wc
> > +#  define STRCAT_PREFIX wcs
> > +#  ifdef USE_AS_STPCPY
> > +#    define STRCPY_POSTFIX pcpy
> > +#  else
> > +#    define STRCPY_POSTFIX scpy
> > +#  endif
> > +#else
> > +#  define STRCPY_PREFIX st
> > +#  define STRCAT_PREFIX str
> > +#  ifdef USE_AS_STPCPY
> > +#    define STRCPY_POSTFIX pcpy
> > +#  else
> > +#    define STRCPY_POSTFIX rcpy
> > +#  endif
> > +#endif
> > +#define STRCAT_POSTFIX cat
> > +
> > +#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
> > +  underscores##prefix##postfix##ext
> > +
> > +#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
> > +
> > +#ifndef OVERFLOW_STRCPY
> > +#  define OVERFLOW_STRCPY                                                     \
> > +    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
> > +#endif
> > +
> > +#ifndef OVERFLOW_STRCAT
> > +#  define OVERFLOW_STRCAT                                                     \
> > +    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
> > +#endif
> > +
> > +#endif
> > --
> > 2.34.1
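The OF_NAMER machinery in the new header simply token-pastes the pieces selected by the USE_AS_* / USE_WITH_* macros. For instance, in a multiarch libc build of strncpy-evex.S with the EVEX256 configuration and neither USE_AS_WCSCPY nor USE_AS_STPCPY defined, the pieces are __, st, rcpy and _evex, so OVERFLOW_STRCPY expands to __strcpy_evex. A stand-alone illustration of the same pattern (inputs hard-coded for the example):

    #include <stdio.h>

    #define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \
      underscores##prefix##postfix##ext
    #define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)

    #define STRINGIFY1(x) #x
    #define STRINGIFY(x) STRINGIFY1 (x)

    int
    main (void)
    {
      /* Prints "__strcpy_evex".  */
      puts (STRINGIFY (OF_NAMER (__, st, rcpy, _evex)));
      return 0;
    }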
> >
>
> OK with copyright notices added.
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions
  2022-11-04 16:45     ` H.J. Lu
@ 2022-11-04 20:21       ` Noah Goldstein
  0 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 20:21 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Nov 4, 2022 at 9:45 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Nov 4, 2022 at 1:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Optimizations are:
> >     1. Use more overlapping stores to avoid branches.
> >     2. Reduce how unrolled the aligning copies are (this is more of a
> >        code-size save, it's a negative for some sizes in terms of
> >        perf).
> >     3. For st{r|p}n{cat|cpy} re-order the branches to minimize the
> >        number that are taken.
> >
> > Performance Changes:
> >
> >     Times are from N = 10 runs of the benchmark suite and are
> >     reported as geometric mean of all ratios of
> >     New Implementation / Old Implementation.
> >
> >     strcat-avx2      -> 0.998
> >     strcpy-avx2      -> 0.937
> >     stpcpy-avx2      -> 0.971
> >
> >     strncpy-avx2     -> 0.793
> >     stpncpy-avx2     -> 0.775
> >
> >     strncat-avx2     -> 0.962
> >
> > Code Size Changes:
> >     function         -> Bytes New / Bytes Old -> Ratio
> >
> >     strcat-avx2      -> 685  / 1639 -> 0.418
> >     strcpy-avx2      -> 560  / 903  -> 0.620
> >     stpcpy-avx2      -> 592  / 939  -> 0.630
> >
> >     strncpy-avx2     -> 1176 / 2390 -> 0.492
> >     stpncpy-avx2     -> 1268 / 2438 -> 0.520
> >
> >     strncat-avx2     -> 1042 / 2563 -> 0.407
> >
> > Notes:
> >     1. Because of the significant difference between the
> >        implementations they are split into three files.
> >
> >            strcpy-evex.S    -> strcpy, stpcpy, strcat
> >            strncpy-evex.S   -> strncpy
> >            strncat-evex.S   -> strncat
> >
> >        I couldn't find a way to merge them without making the
> >        ifdefs incredibly difficult to follow.
> >
> >     2. All implementations can be made evex512 by including
> >        "x86-evex512-vecs.h" at the top.
>
> These comments are wrong for AVX2 implementations.

Sorry, fixed in V3.
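For context, the "geometric mean of all ratios of New Implementation / Old Implementation" quoted above is the usual way to aggregate per-benchmark speedups so that no single large or small case dominates. A quick C sketch of that aggregation (the ratio values are made-up placeholders, not benchmark output):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Hypothetical per-benchmark time ratios (new / old); < 1 is faster.  */
      double ratios[] = { 0.81, 0.79, 0.77, 0.80 };
      size_t n = sizeof ratios / sizeof ratios[0];
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (ratios[i]);
      /* Geometric mean = exp of the mean of the logs.  */
      printf ("geomean ratio: %.3f\n", exp (log_sum / n));
      return 0;
    }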
>
> > Full check passes on x86-64 and build succeeds for all ISA levels w/
> > and w/o multiarch.
> >
> > Fix avx2
>
> Strayed comments?
>

Yes. Fixed in V3.
> > ---
> >  sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S    |    6 +-
> >  sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S   |    7 +-
> >  sysdeps/x86_64/multiarch/stpncpy-avx2.S       |    5 +-
> >  sysdeps/x86_64/multiarch/strcat-avx2-rtm.S    |   13 +-
> >  sysdeps/x86_64/multiarch/strcat-avx2.S        |  268 +---
> >  sysdeps/x86_64/multiarch/strcat-strlen-avx2.S |   76 +
> >  sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S    |   13 +-
> >  sysdeps/x86_64/multiarch/strcpy-avx2.S        | 1236 +++++------------
> >  sysdeps/x86_64/multiarch/strncat-avx2-rtm.S   |    6 +-
> >  sysdeps/x86_64/multiarch/strncat-avx2.S       |  424 +++++-
> >  sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S   |    6 +-
> >  sysdeps/x86_64/multiarch/strncpy-avx2.S       |  740 +++++++++-
> >  sysdeps/x86_64/multiarch/x86-avx-vecs.h       |    5 +-
> >  sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h  |   26 +
> >  sysdeps/x86_64/multiarch/x86-avx2-vecs.h      |   27 +
> >  15 files changed, 1624 insertions(+), 1234 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
> >  create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
> >  create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-vecs.h
> >
> > diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> > index 2b9c07a59f..189a288053 100644
> > --- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> > @@ -1,3 +1,3 @@
> > -#define USE_AS_STPCPY
> > -#define STRCPY __stpcpy_avx2_rtm
> > -#include "strcpy-avx2-rtm.S"
> > +#define STPCPY __stpcpy_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "stpcpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> > index 60a2ccfe53..1b252985e7 100644
> > --- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> > @@ -1,4 +1,3 @@
> > -#define USE_AS_STPCPY
> > -#define USE_AS_STRNCPY
> > -#define STRCPY __stpncpy_avx2_rtm
> > -#include "strcpy-avx2-rtm.S"
> > +#define STPNCPY        __stpncpy_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "stpncpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> > index b2f8c19143..a46a8edbe2 100644
> > --- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> > @@ -3,6 +3,5 @@
> >  #endif
> >
> >  #define USE_AS_STPCPY
> > -#define USE_AS_STRNCPY
> > -#define STRCPY STPNCPY
> > -#include "strcpy-avx2.S"
> > +#define STRNCPY        STPNCPY
> > +#include "strncpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> > index 637fb557c4..94d51d10bd 100644
> > --- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> > @@ -1,12 +1,3 @@
> > -#ifndef STRCAT
> > -# define STRCAT __strcat_avx2_rtm
> > -#endif
> > -
> > -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> > -  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> > -
> > -#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
> > -
> > -#define SECTION(p) p##.avx.rtm
> > -
> > +#define STRCAT __strcat_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> >  #include "strcat-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
> > index d9b7fb2a43..3f914fa342 100644
> > --- a/sysdeps/x86_64/multiarch/strcat-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
> > @@ -16,266 +16,10 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#include <isa-level.h>
> > -
> > -#if ISA_SHOULD_BUILD (3)
> > -
> > -
> > -# include <sysdep.h>
> > -
> > -# ifndef STRCAT
> > -#  define STRCAT  __strcat_avx2
> > -# endif
> > -
> > -# define USE_AS_STRCAT
> > -
> > -/* Number of bytes in a vector register */
> > -# define VEC_SIZE      32
> > -
> > -# ifndef SECTION
> > -#  define SECTION(p)   p##.avx
> > -# endif
> > -
> > -       .section SECTION(.text),"ax",@progbits
> > -ENTRY (STRCAT)
> > -       mov     %rdi, %r9
> > -# ifdef USE_AS_STRNCAT
> > -       mov     %rdx, %r8
> > -# endif
> > -
> > -       xor     %eax, %eax
> > -       mov     %edi, %ecx
> > -       and     $((VEC_SIZE * 4) - 1), %ecx
> > -       vpxor   %xmm6, %xmm6, %xmm6
> > -       cmp     $(VEC_SIZE * 3), %ecx
> > -       ja      L(fourth_vector_boundary)
> > -       vpcmpeqb (%rdi), %ymm6, %ymm0
> > -       vpmovmskb %ymm0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_first_vector)
> > -       mov     %rdi, %rax
> > -       and     $-VEC_SIZE, %rax
> > -       jmp     L(align_vec_size_start)
> > -L(fourth_vector_boundary):
> > -       mov     %rdi, %rax
> > -       and     $-VEC_SIZE, %rax
> > -       vpcmpeqb        (%rax), %ymm6, %ymm0
> > -       mov     $-1, %r10d
> > -       sub     %rax, %rcx
> > -       shl     %cl, %r10d
> > -       vpmovmskb %ymm0, %edx
> > -       and     %r10d, %edx
> > -       jnz     L(exit)
> > -
> > -L(align_vec_size_start):
> > -       vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
> > -       vpmovmskb %ymm0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_second_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> > -       vpmovmskb %ymm1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_third_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> > -       vpmovmskb %ymm2, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fourth_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> > -       vpmovmskb %ymm3, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fifth_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> > -       add     $(VEC_SIZE * 4), %rax
> > -       vpmovmskb %ymm0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_second_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> > -       vpmovmskb %ymm1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_third_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> > -       vpmovmskb %ymm2, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fourth_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> > -       vpmovmskb %ymm3, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fifth_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> > -       add     $(VEC_SIZE * 4), %rax
> > -       vpmovmskb %ymm0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_second_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> > -       vpmovmskb %ymm1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_third_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> > -       vpmovmskb %ymm2, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fourth_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> > -       vpmovmskb %ymm3, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fifth_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> > -       add     $(VEC_SIZE * 4), %rax
> > -       vpmovmskb %ymm0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_second_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> > -       vpmovmskb %ymm1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_third_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> > -       vpmovmskb %ymm2, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fourth_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> > -       vpmovmskb %ymm3, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fifth_vector)
> > -
> > -       test    $((VEC_SIZE * 4) - 1), %rax
> > -       jz      L(align_four_vec_loop)
> > -
> > -       vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> > -       add     $(VEC_SIZE * 5), %rax
> > -       vpmovmskb %ymm0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit)
> > -
> > -       test    $((VEC_SIZE * 4) - 1), %rax
> > -       jz      L(align_four_vec_loop)
> > -
> > -       vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
> > -       add     $VEC_SIZE, %rax
> > -       vpmovmskb %ymm1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit)
> > -
> > -       test    $((VEC_SIZE * 4) - 1), %rax
> > -       jz      L(align_four_vec_loop)
> > -
> > -       vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
> > -       add     $VEC_SIZE, %rax
> > -       vpmovmskb %ymm2, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit)
> > -
> > -       test    $((VEC_SIZE * 4) - 1), %rax
> > -       jz      L(align_four_vec_loop)
> > -
> > -       vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
> > -       add     $VEC_SIZE, %rax
> > -       vpmovmskb %ymm3, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit)
> > -
> > -       add     $VEC_SIZE, %rax
> > -
> > -       .p2align 4
> > -L(align_four_vec_loop):
> > -       vmovaps (%rax), %ymm4
> > -       vpminub VEC_SIZE(%rax), %ymm4, %ymm4
> > -       vmovaps (VEC_SIZE * 2)(%rax),   %ymm5
> > -       vpminub (VEC_SIZE * 3)(%rax),   %ymm5, %ymm5
> > -       add     $(VEC_SIZE * 4),        %rax
> > -       vpminub %ymm4,  %ymm5, %ymm5
> > -       vpcmpeqb %ymm5, %ymm6, %ymm5
> > -       vpmovmskb %ymm5,        %edx
> > -       test    %edx,   %edx
> > -       jz      L(align_four_vec_loop)
> > -
> > -       vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
> > -       sub     $(VEC_SIZE * 5),        %rax
> > -       vpmovmskb %ymm0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_second_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> > -       vpmovmskb %ymm1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_third_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> > -       vpmovmskb %ymm2, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_null_on_fourth_vector)
> > -
> > -       vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> > -       vpmovmskb %ymm3, %edx
> > -       sub     %rdi, %rax
> > -       bsf     %rdx, %rdx
> > -       add     %rdx, %rax
> > -       add     $(VEC_SIZE * 4), %rax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit):
> > -       sub     %rdi, %rax
> > -L(exit_null_on_first_vector):
> > -       bsf     %rdx, %rdx
> > -       add     %rdx, %rax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit_null_on_second_vector):
> > -       sub     %rdi, %rax
> > -       bsf     %rdx, %rdx
> > -       add     %rdx, %rax
> > -       add     $VEC_SIZE, %rax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit_null_on_third_vector):
> > -       sub     %rdi, %rax
> > -       bsf     %rdx, %rdx
> > -       add     %rdx, %rax
> > -       add     $(VEC_SIZE * 2), %rax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit_null_on_fourth_vector):
> > -       sub     %rdi, %rax
> > -       bsf     %rdx, %rdx
> > -       add     %rdx, %rax
> > -       add     $(VEC_SIZE * 3), %rax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit_null_on_fifth_vector):
> > -       sub     %rdi, %rax
> > -       bsf     %rdx, %rdx
> > -       add     %rdx, %rax
> > -       add     $(VEC_SIZE * 4), %rax
> > -
> > -       .p2align 4
> > -L(StartStrcpyPart):
> > -       lea     (%r9, %rax), %rdi
> > -       mov     %rsi, %rcx
> > -       mov     %r9, %rax      /* save result */
> > -
> > -# ifdef USE_AS_STRNCAT
> > -       test    %r8, %r8
> > -       jz      L(ExitZero)
> > -#  define USE_AS_STRNCPY
> > -# endif
> > -
> > -# include "strcpy-avx2.S"
> > +#ifndef STRCAT
> > +# define STRCAT        __strcat_avx2
> >  #endif
> > +
> > +#define USE_AS_STRCAT
> > +#define STRCPY STRCAT
> > +#include "strcpy-avx2.S"
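For readers following along: after this change strcat-avx2.S no longer
carries its own strlen loop; it just defines STRCPY as STRCAT plus
USE_AS_STRCAT and pulls in strcpy-avx2.S, which in turn includes
strcat-strlen-avx2.S (below) to find the end of dst.  Roughly, in C
(strcat_sketch is my name, nothing in the patch):

#include <string.h>

char *
strcat_sketch (char *dst, const char *src)
{
  /* strcat-strlen-avx2.S advances the destination to its terminating
     null; the strcpy-avx2.S body then does the copy.  */
  strcpy (dst + strlen (dst), src);
  return dst;
}
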
> > diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
> > new file mode 100644
> > index 0000000000..128a45b6ff
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
>
> Missing copyright notice.

Fixed in V3.
>
> > @@ -0,0 +1,76 @@
> > +    /* Simple strlen implementation that ends at L(strcat_strlen_done).  */
> > +       movq    %rdi, %r8
> > +       andq    $(VEC_SIZE * -1), %r8
> > +       VPCMPEQ (%r8), %VZERO, %VMM(0)
>
>
> > +       vpmovmskb %VMM(0), %ecx
> > +       shrxl   %edi, %ecx, %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(bsf_and_done_v0)
> > +
> > +       VPCMPEQ VEC_SIZE(%r8), %VZERO, %VMM(0)
> > +       vpmovmskb %VMM(0), %ecx
> > +       leaq    (VEC_SIZE)(%r8), %rdi
> > +       testl   %ecx, %ecx
> > +       jnz     L(bsf_and_done_v0)
> > +
> > +       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0)
> > +       vpmovmskb %VMM(0), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(bsf_and_done_v1)
> > +
> > +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0)
> > +       vpmovmskb %VMM(0), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(bsf_and_done_v2)
> > +
> > +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0)
> > +       vpmovmskb %VMM(0), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(bsf_and_done_v3)
> > +
> > +       orq     $(VEC_SIZE * 4 - 1), %rdi
> > +       .p2align 4,, 8
> > +L(loop_2x_vec):
> > +       VMOVA   (VEC_SIZE * 0 + 1)(%rdi), %VMM(0)
> > +       VPMIN   (VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 2 + 1)(%rdi), %VMM(2)
> > +       VPMIN   (VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3)
> > +       VPMIN   %VMM(1), %VMM(3), %VMM(3)
> > +       VPCMPEQ %VMM(3), %VZERO, %VMM(3)
> > +       vpmovmskb %VMM(3), %r8d
> > +       subq    $(VEC_SIZE * -4), %rdi
> > +       testl   %r8d, %r8d
> > +       jz      L(loop_2x_vec)
> > +
> > +       addq    $(VEC_SIZE * -4 + 1), %rdi
> > +
> > +       VPCMPEQ %VMM(0), %VZERO, %VMM(0)
> > +       vpmovmskb %VMM(0), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(bsf_and_done_v0)
> > +
> > +       VPCMPEQ %VMM(1), %VZERO, %VMM(1)
> > +       vpmovmskb %VMM(1), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(bsf_and_done_v1)
> > +
> > +       VPCMPEQ %VMM(2), %VZERO, %VMM(2)
> > +       vpmovmskb %VMM(2), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(bsf_and_done_v2)
> > +
> > +       movl    %r8d, %ecx
> > +L(bsf_and_done_v3):
> > +       addq    $VEC_SIZE, %rdi
> > +L(bsf_and_done_v2):
> > +       bsfl    %ecx, %ecx
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rcx), %rdi
> > +       jmp     L(strcat_strlen_done)
> > +
> > +       .p2align 4,, 4
> > +L(bsf_and_done_v1):
> > +       addq    $VEC_SIZE, %rdi
> > +L(bsf_and_done_v0):
> > +       bsfl    %ecx, %ecx
> > +       addq    %rcx, %rdi
> > +L(strcat_strlen_done):
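To spell out what this fragment computes: it is an inlined strlen that
leaves %rdi pointing at dst's terminating null so the including strcpy
code can continue from there.  A rough C equivalent of the core idea
(byte variant, VEC_SIZE == 32; the real fragment unrolls 4x and merges
the checks with vpminub, and the names below are mine):

#include <immintrin.h>
#include <stdint.h>

static char *
find_null_sketch (char *s)
{
  const __m256i zero = _mm256_setzero_si256 ();
  /* Align down so the first (aligned) load cannot cross a page.  */
  char *p = (char *) ((uintptr_t) s & ~(uintptr_t) 31);
  uint32_t m = (uint32_t) _mm256_movemask_epi8
    (_mm256_cmpeq_epi8 (_mm256_load_si256 ((const __m256i *) p), zero));
  m >>= (uintptr_t) s & 31;	/* the shrx: ignore bytes before s */
  while (m == 0)
    {
      p += 32;
      s = p;
      m = (uint32_t) _mm256_movemask_epi8
	(_mm256_cmpeq_epi8 (_mm256_load_si256 ((const __m256i *) p), zero));
    }
  return s + __builtin_ctz (m);	/* address of the null byte */
}
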
> > diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> > index c2c581ecf7..fe80ffd265 100644
> > --- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> > @@ -1,12 +1,3 @@
> > -#ifndef STRCPY
> > -# define STRCPY __strcpy_avx2_rtm
> > -#endif
> > -
> > -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> > -  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> > -
> > -#define VZEROUPPER_RETURN jmp   L(return_vzeroupper)
> > -
> > -#define SECTION(p) p##.avx.rtm
> > -
> > +#define STRCPY __strcpy_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> >  #include "strcpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
> > index c725834929..b87a1722d5 100644
> > --- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
> > @@ -20,984 +20,378 @@
> >
> >  #if ISA_SHOULD_BUILD (3)
> >
> > +# include <sysdep.h>
> >
> > -# ifndef USE_AS_STRCAT
> > -#  include <sysdep.h>
> > -
> > -#  ifndef STRCPY
> > -#   define STRCPY  __strcpy_avx2
> > -#  endif
> > -
> > -# endif
> > -
> > -/* Number of bytes in a vector register */
> >  # ifndef VEC_SIZE
> > -#  define VEC_SIZE     32
> > -# endif
> > -
> > -# ifndef VZEROUPPER
> > -#  define VZEROUPPER   vzeroupper
> > -# endif
> > -
> > -# ifndef SECTION
> > -#  define SECTION(p)   p##.avx
> > -# endif
> > -
> > -/* zero register */
> > -#define xmmZ   xmm0
> > -#define ymmZ   ymm0
> > -
> > -/* mask register */
> > -#define ymmM   ymm1
> > -
> > -# ifndef USE_AS_STRCAT
> > -
> > -       .section SECTION(.text),"ax",@progbits
> > -ENTRY (STRCPY)
> > -#  ifdef USE_AS_STRNCPY
> > -       mov     %RDX_LP, %R8_LP
> > -       test    %R8_LP, %R8_LP
> > -       jz      L(ExitZero)
> > -#  endif
> > -       mov     %rsi, %rcx
> > -#  ifndef USE_AS_STPCPY
> > -       mov     %rdi, %rax      /* save result */
> > -#  endif
> > -
> > +#  include "x86-avx2-vecs.h"
> >  # endif
> >
> > -       vpxor   %xmmZ, %xmmZ, %xmmZ
> > -
> > -       and     $((VEC_SIZE * 4) - 1), %ecx
> > -       cmp     $(VEC_SIZE * 2), %ecx
> > -       jbe     L(SourceStringAlignmentLessTwoVecSize)
> > -
> > -       and     $-VEC_SIZE, %rsi
> > -       and     $(VEC_SIZE - 1), %ecx
> > -
> > -       vpcmpeqb (%rsi), %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -       shr     %cl, %rdx
> > -
> > -# ifdef USE_AS_STRNCPY
> > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > -       mov     $VEC_SIZE, %r10
> > -       sub     %rcx, %r10
> > -       cmp     %r10, %r8
> > -#  else
> > -       mov     $(VEC_SIZE + 1), %r10
> > -       sub     %rcx, %r10
> > -       cmp     %r10, %r8
> > -#  endif
> > -       jbe     L(CopyVecSizeTailCase2OrCase3)
> > +# ifndef STRCPY
> > +#  define STRCPY       __strcpy_avx2
> >  # endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyVecSizeTail)
> >
> > -       vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
> > -       vpmovmskb %ymm2, %edx
> > +       /* Use movsb in page cross case to save code size.  */
> > +# define USE_MOVSB_IN_PAGE_CROSS       1
> >
> > -# ifdef USE_AS_STRNCPY
> > -       add     $VEC_SIZE, %r10
> > -       cmp     %r10, %r8
> > -       jbe     L(CopyTwoVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyTwoVecSize)
> > -
> > -       vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
> > -       vmovdqu %ymm2, (%rdi)
> > -
> > -/* If source address alignment != destination address alignment */
> > -       .p2align 4
> > -L(UnalignVecSizeBoth):
> > -       sub     %rcx, %rdi
> > -# ifdef USE_AS_STRNCPY
> > -       add     %rcx, %r8
> > -       sbb     %rcx, %rcx
> > -       or      %rcx, %r8
> > -# endif
> > -       mov     $VEC_SIZE, %rcx
> > -       vmovdqa (%rsi, %rcx), %ymm2
> > -       vmovdqu %ymm2, (%rdi, %rcx)
> > -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> > -       vpcmpeqb %ymm2, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -       add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $(VEC_SIZE * 3), %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec2)
> > +# ifdef USE_AS_WCSCPY
> > +#  define VPCMPEQ      vpcmpeqd
> > +#  define VPMIN        vpminud
> > +#  define CHAR_SIZE    4
> >  # else
> > -       jnz     L(CopyVecSize)
> > +#  define VPCMPEQ      vpcmpeqb
> > +#  define VPMIN        vpminub
> > +#  define CHAR_SIZE    1
> >  # endif
> >
> > -       vmovdqu %ymm2, (%rdi, %rcx)
> > -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
> > -       vpcmpeqb %ymm3, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -       add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec3)
> > -# else
> > -       jnz     L(CopyVecSize)
> > -# endif
> > +# define PAGE_SIZE     4096
> >
> > -       vmovdqu %ymm3, (%rdi, %rcx)
> > -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
> > -       vpcmpeqb %ymm4, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -       add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec4)
> > +# ifdef USE_AS_STPCPY
> > +#  define END_REG      rax
> >  # else
> > -       jnz     L(CopyVecSize)
> > +#  define END_REG      rdi, %rdx
> >  # endif
> >
> > -       vmovdqu %ymm4, (%rdi, %rcx)
> > -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> > -       vpcmpeqb %ymm2, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -       add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec2)
> > +# ifdef USE_AS_STRCAT
> > +#  define PAGE_ALIGN_REG       ecx
> >  # else
> > -       jnz     L(CopyVecSize)
> > +#  define PAGE_ALIGN_REG       eax
> >  # endif
> >
> > -       vmovdqu %ymm2, (%rdi, %rcx)
> > -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> > -       vpcmpeqb %ymm2, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -       add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec2)
> > -# else
> > -       jnz     L(CopyVecSize)
> > -# endif
> > +# define VZERO VMM(7)
> > +# define VZERO_128     VMM_128(7)
> >
> > -       vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
> > -       vmovdqu %ymm2, (%rdi, %rcx)
> > -       vpcmpeqb %ymm3, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -       add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec3)
> > -# else
> > -       jnz     L(CopyVecSize)
> > -# endif
> > +       .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRCPY)
> > +       vpxor   %VZERO_128, %VZERO_128, %VZERO_128
> >
> > -       vmovdqu %ymm3, (%rdi, %rcx)
> > -       mov     %rsi, %rdx
> > -       lea     VEC_SIZE(%rsi, %rcx), %rsi
> > -       and     $-(VEC_SIZE * 4), %rsi
> > -       sub     %rsi, %rdx
> > -       sub     %rdx, %rdi
> > -# ifdef USE_AS_STRNCPY
> > -       lea     (VEC_SIZE * 8)(%r8, %rdx), %r8
> > -# endif
> > -L(UnalignedFourVecSizeLoop):
> > -       vmovdqa (%rsi), %ymm4
> > -       vmovdqa VEC_SIZE(%rsi), %ymm5
> > -       vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
> > -       vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
> > -       vpminub %ymm5, %ymm4, %ymm2
> > -       vpminub %ymm7, %ymm6, %ymm3
> > -       vpminub %ymm2, %ymm3, %ymm3
> > -       vpcmpeqb %ymmM, %ymm3, %ymm3
> > -       vpmovmskb %ymm3, %edx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $(VEC_SIZE * 4), %r8
> > -       jbe     L(UnalignedLeaveCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(UnalignedFourVecSizeLeave)
> > -
> > -L(UnalignedFourVecSizeLoop_start):
> > -       add     $(VEC_SIZE * 4), %rdi
> > -       add     $(VEC_SIZE * 4), %rsi
> > -       vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
> > -       vmovdqa (%rsi), %ymm4
> > -       vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
> > -       vmovdqa VEC_SIZE(%rsi), %ymm5
> > -       vpminub %ymm5, %ymm4, %ymm2
> > -       vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
> > -       vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
> > -       vmovdqu %ymm7, -VEC_SIZE(%rdi)
> > -       vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
> > -       vpminub %ymm7, %ymm6, %ymm3
> > -       vpminub %ymm2, %ymm3, %ymm3
> > -       vpcmpeqb %ymmM, %ymm3, %ymm3
> > -       vpmovmskb %ymm3, %edx
> > -# ifdef USE_AS_STRNCPY
> > -       sub     $(VEC_SIZE * 4), %r8
> > -       jbe     L(UnalignedLeaveCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jz      L(UnalignedFourVecSizeLoop_start)
> > -
> > -L(UnalignedFourVecSizeLeave):
> > -       vpcmpeqb %ymm4, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -       test    %edx, %edx
> > -       jnz     L(CopyVecSizeUnaligned_0)
> > -
> > -       vpcmpeqb %ymm5, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %ecx
> > -       test    %ecx, %ecx
> > -       jnz     L(CopyVecSizeUnaligned_16)
> > -
> > -       vpcmpeqb %ymm6, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -       test    %edx, %edx
> > -       jnz     L(CopyVecSizeUnaligned_32)
> > -
> > -       vpcmpeqb %ymm7, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %ecx
> > -       bsf     %ecx, %edx
> > -       vmovdqu %ymm4, (%rdi)
> > -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> > -       vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > -       lea     (VEC_SIZE * 3)(%rdi, %rdx), %rax
> > -# endif
> > -       vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
> > -       add     $(VEC_SIZE - 1), %r8
> > -       sub     %rdx, %r8
> > -       lea     ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> > -       jmp     L(StrncpyFillTailWithZero)
> > -# else
> > -       add     $(VEC_SIZE * 3), %rsi
> > -       add     $(VEC_SIZE * 3), %rdi
> > -       jmp     L(CopyVecSizeExit)
> > +# ifdef USE_AS_STRCAT
> > +       movq    %rdi, %rax
> > +#  include "strcat-strlen-avx2.S"
> >  # endif
> >
> > -/* If source address alignment == destination address alignment */
> > -
> > -L(SourceStringAlignmentLessTwoVecSize):
> > -       vmovdqu (%rsi), %ymm3
> > -       vmovdqu VEC_SIZE(%rsi), %ymm2
> > -       vpcmpeqb %ymm3, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -
> > -# ifdef USE_AS_STRNCPY
> > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > -       cmp     $VEC_SIZE, %r8
> > -#  else
> > -       cmp     $(VEC_SIZE + 1), %r8
> > -#  endif
> > -       jbe     L(CopyVecSizeTail1Case2OrCase3)
> > +       movl    %esi, %PAGE_ALIGN_REG
> > +       andl    $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> > +       ja      L(page_cross)
> > +L(page_cross_continue):
> > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > +       movq    %rdi, %rax
> >  # endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyVecSizeTail1)
> > -
> > -       vmovdqu %ymm3, (%rdi)
> > -       vpcmpeqb %ymm2, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -
> > -# ifdef USE_AS_STRNCPY
> > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > -       cmp     $(VEC_SIZE * 2), %r8
> > -#  else
> > -       cmp     $((VEC_SIZE * 2) + 1), %r8
> > -#  endif
> > -       jbe     L(CopyTwoVecSize1Case2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyTwoVecSize1)
> > -
> > -       and     $-VEC_SIZE, %rsi
> > -       and     $(VEC_SIZE - 1), %ecx
> > -       jmp     L(UnalignVecSizeBoth)
> > +       VMOVU   (%rsi), %VMM(0)
> > +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> >
> > -/*------End of main part with loops---------------------*/
> > +       testl   %ecx, %ecx
> > +       jz      L(more_1x_vec)
> >
> > -/* Case1 */
> > +       /* The ymm registers are no longer needed, so vzeroupper now rather
> > +          than duplicating it at each return statement.  */
> > +       COND_VZEROUPPER
> >
> > -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> > -       .p2align 4
> > -L(CopyVecSize):
> > -       add     %rcx, %rdi
> > -# endif
> > -L(CopyVecSizeTail):
> > -       add     %rcx, %rsi
> > -L(CopyVecSizeTail1):
> > -       bsf     %edx, %edx
> > -L(CopyVecSizeExit):
> > -       cmp     $32, %edx
> > -       jae     L(Exit32_63)
> > -       cmp     $16, %edx
> > -       jae     L(Exit16_31)
> > -       cmp     $8, %edx
> > -       jae     L(Exit8_15)
> > -       cmp     $4, %edx
> > -       jae     L(Exit4_7)
> > -       cmp     $3, %edx
> > -       je      L(Exit3)
> > -       cmp     $1, %edx
> > -       ja      L(Exit2)
> > -       je      L(Exit1)
> > -       movb    $0, (%rdi)
> > +       xorl    %edx, %edx
> > +       bsfl    %ecx, %edx
> >  # ifdef USE_AS_STPCPY
> > -       lea     (%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     $1, %r8
> > -       lea     1(%rdi), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > -# endif
> > -L(return_vzeroupper):
> > -       ZERO_UPPER_VEC_REGISTERS_RETURN
> > -
> > -       .p2align 4
> > -L(CopyTwoVecSize1):
> > -       add     $VEC_SIZE, %rsi
> > -       add     $VEC_SIZE, %rdi
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     $VEC_SIZE, %r8
> > -# endif
> > -       jmp     L(CopyVecSizeTail1)
> > -
> > -       .p2align 4
> > -L(CopyTwoVecSize):
> > -       bsf     %edx, %edx
> > -       add     %rcx, %rsi
> > -       add     $VEC_SIZE, %edx
> > -       sub     %ecx, %edx
> > -       jmp     L(CopyVecSizeExit)
> > -
> > -       .p2align 4
> > -L(CopyVecSizeUnaligned_0):
> > -       bsf     %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %rdx), %rax
> > -# endif
> > -       vmovdqu %ymm4, (%rdi)
> > -       add     $((VEC_SIZE * 4) - 1), %r8
> > -       sub     %rdx, %r8
> > -       lea     1(%rdi, %rdx), %rdi
> > -       jmp     L(StrncpyFillTailWithZero)
> > -# else
> > -       jmp     L(CopyVecSizeExit)
> > -# endif
> > -
> > -       .p2align 4
> > -L(CopyVecSizeUnaligned_16):
> > -       bsf     %ecx, %edx
> > -       vmovdqu %ymm4, (%rdi)
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > -       lea     VEC_SIZE(%rdi, %rdx), %rax
> > -# endif
> > -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> > -       add     $((VEC_SIZE * 3) - 1), %r8
> > -       sub     %rdx, %r8
> > -       lea     (VEC_SIZE + 1)(%rdi, %rdx), %rdi
> > -       jmp     L(StrncpyFillTailWithZero)
> > +       leaq    (%rdi, %rdx), %rax
> > +# endif
> > +
> > +       /* Use the mask bits in rcx to detect which copy we need.  If the
> > +          low half of the mask is zero then there must be a bit set in the
> > +          upper half.  I.e. if ecx != 0 and cx == 0, the match must be in
> > +          the upper 16 bits, so we use L(copy_16_31).  */
> > +       testw   %cx, %cx
> > +       jz      L(copy_16_31)
> > +
> > +       testb   %cl, %cl
> > +       jz      L(copy_8_15)
> > +# ifdef USE_AS_WCSCPY
> > +       vmovd   %xmm0, (%rdi)
> > +       movl    $0, (%END_REG)
> > +       ret
> >  # else
> > -       add     $VEC_SIZE, %rsi
> > -       add     $VEC_SIZE, %rdi
> > -       jmp     L(CopyVecSizeExit)
> > -# endif
> > -
> > -       .p2align 4
> > -L(CopyVecSizeUnaligned_32):
> > -       bsf     %edx, %edx
> > -       vmovdqu %ymm4, (%rdi)
> > -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > -       lea     (VEC_SIZE * 2)(%rdi, %rdx), %rax
> > -# endif
> > -       vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> > -       add     $((VEC_SIZE * 2) - 1), %r8
> > -       sub     %rdx, %r8
> > -       lea     ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> > -       jmp     L(StrncpyFillTailWithZero)
> > +       testb   $0x7, %cl
> > +       jz      L(copy_4_7)
> > +
> > +       testl   %edx, %edx
> > +       jz      L(set_null_term)
> > +       vmovd   %xmm0, %ecx
> > +       movw    %cx, (%rdi)
> > +
> > +       .p2align 4,, 2
> > +L(set_null_term):
> > +       movb    $0, (%END_REG)
> > +       ret
> > +
> > +       .p2align 4,, 12
> > +L(copy_4_7):
> > +       movl    -3(%rsi, %rdx), %ecx
> > +       vmovd   %xmm0, (%rdi)
> > +       movl    %ecx, -3(%END_REG)
> > +       ret
> > +# endif
> > +
> > +       .p2align 4,, 10
> > +L(copy_16_31):
> > +       VMOVU   -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
> > +       VMOVU   %xmm0, (%rdi)
> > +       VMOVU   %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> > +       ret
> > +
> > +       .p2align 4,, 10
> > +L(copy_8_15):
> > +# ifdef USE_AS_WCSCPY
> > +       movl    -(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
> >  # else
> > -       add     $(VEC_SIZE * 2), %rsi
> > -       add     $(VEC_SIZE * 2), %rdi
> > -       jmp     L(CopyVecSizeExit)
> > -# endif
> > -
> > -# ifdef USE_AS_STRNCPY
> > -#  ifndef USE_AS_STRCAT
> > -       .p2align 4
> > -L(CopyVecSizeUnalignedVec6):
> > -       vmovdqu %ymm6, (%rdi, %rcx)
> > -       jmp     L(CopyVecSizeVecExit)
> > -
> > -       .p2align 4
> > -L(CopyVecSizeUnalignedVec5):
> > -       vmovdqu %ymm5, (%rdi, %rcx)
> > -       jmp     L(CopyVecSizeVecExit)
> > -
> > -       .p2align 4
> > -L(CopyVecSizeUnalignedVec4):
> > -       vmovdqu %ymm4, (%rdi, %rcx)
> > -       jmp     L(CopyVecSizeVecExit)
> > -
> > -       .p2align 4
> > -L(CopyVecSizeUnalignedVec3):
> > -       vmovdqu %ymm3, (%rdi, %rcx)
> > -       jmp     L(CopyVecSizeVecExit)
> > -#  endif
> > -
> > -/* Case2 */
> > -
> > -       .p2align 4
> > -L(CopyVecSizeCase2):
> > -       add     $VEC_SIZE, %r8
> > -       add     %rcx, %rdi
> > -       add     %rcx, %rsi
> > -       bsf     %edx, %edx
> > -       cmp     %r8d, %edx
> > -       jb      L(CopyVecSizeExit)
> > -       jmp     L(StrncpyExit)
> > -
> > -       .p2align 4
> > -L(CopyTwoVecSizeCase2):
> > -       add     %rcx, %rsi
> > -       bsf     %edx, %edx
> > -       add     $VEC_SIZE, %edx
> > -       sub     %ecx, %edx
> > -       cmp     %r8d, %edx
> > -       jb      L(CopyVecSizeExit)
> > -       jmp     L(StrncpyExit)
> > -
> > -L(CopyVecSizeTailCase2):
> > -       add     %rcx, %rsi
> > -       bsf     %edx, %edx
> > -       cmp     %r8d, %edx
> > -       jb      L(CopyVecSizeExit)
> > -       jmp     L(StrncpyExit)
> > -
> > -L(CopyVecSizeTail1Case2):
> > -       bsf     %edx, %edx
> > -       cmp     %r8d, %edx
> > -       jb      L(CopyVecSizeExit)
> > -       jmp     L(StrncpyExit)
> > -
> > -/* Case2 or Case3,  Case3 */
> > -
> > -       .p2align 4
> > -L(CopyVecSizeCase2OrCase3):
> > -       test    %rdx, %rdx
> > -       jnz     L(CopyVecSizeCase2)
> > -L(CopyVecSizeCase3):
> > -       add     $VEC_SIZE, %r8
> > -       add     %rcx, %rdi
> > -       add     %rcx, %rsi
> > -       jmp     L(StrncpyExit)
> > -
> > -       .p2align 4
> > -L(CopyTwoVecSizeCase2OrCase3):
> > -       test    %rdx, %rdx
> > -       jnz     L(CopyTwoVecSizeCase2)
> > -       add     %rcx, %rsi
> > -       jmp     L(StrncpyExit)
> > -
> > -       .p2align 4
> > -L(CopyVecSizeTailCase2OrCase3):
> > -       test    %rdx, %rdx
> > -       jnz     L(CopyVecSizeTailCase2)
> > -       add     %rcx, %rsi
> > -       jmp     L(StrncpyExit)
> > -
> > -       .p2align 4
> > -L(CopyTwoVecSize1Case2OrCase3):
> > -       add     $VEC_SIZE, %rdi
> > -       add     $VEC_SIZE, %rsi
> > -       sub     $VEC_SIZE, %r8
> > -L(CopyVecSizeTail1Case2OrCase3):
> > -       test    %rdx, %rdx
> > -       jnz     L(CopyVecSizeTail1Case2)
> > -       jmp     L(StrncpyExit)
> > -# endif
> > -
> > -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
> > -
> > -       .p2align 4
> > -L(Exit1):
> > -       movzwl  (%rsi), %edx
> > -       mov     %dx, (%rdi)
> > -# ifdef USE_AS_STPCPY
> > -       lea     1(%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     $2, %r8
> > -       lea     2(%rdi), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > -# endif
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(Exit2):
> > -       movzwl  (%rsi), %ecx
> > -       mov     %cx, (%rdi)
> > -       movb    $0, 2(%rdi)
> > -# ifdef USE_AS_STPCPY
> > -       lea     2(%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     $3, %r8
> > -       lea     3(%rdi), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > -# endif
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(Exit3):
> > -       mov     (%rsi), %edx
> > -       mov     %edx, (%rdi)
> > +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
> > +# endif
> > +       vmovq   %xmm0, (%rdi)
> > +       movq    %rcx, -(8 - CHAR_SIZE)(%END_REG)
> > +       ret
> > +
> > +
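If I read the small-size paths right, each bucket does one move from
the front of the string plus one overlapping move that ends exactly at
the null terminator, so short strings are covered with two stores and
no byte-exact branching.  A C sketch of the dispatch and the overlap
trick (byte variant; the names are mine, and the real code handles the
smallest bucket with even narrower stores):

#include <stdint.h>
#include <string.h>

/* len is the index of the '\0' in src, with chunk <= len + 1 <= 2 * chunk.  */
static void
copy_le_2x (char *dst, const char *src, size_t len, size_t chunk)
{
  memcpy (dst, src, chunk);
  memcpy (dst + len + 1 - chunk, src + len + 1 - chunk, chunk);
}

/* null_mask has bit i set iff src[i] == '\0' (vpcmpeqb + vpmovmskb) and
   is known to be non-zero.  */
static char *
copy_within_vec_sketch (char *dst, const char *src, uint32_t null_mask)
{
  size_t len = (size_t) __builtin_ctz (null_mask);
  if ((uint16_t) null_mask == 0)	/* the testw: '\0' in bytes 16..31 */
    copy_le_2x (dst, src, len, 16);
  else if ((uint8_t) null_mask == 0)	/* the testb: '\0' in bytes 8..15 */
    copy_le_2x (dst, src, len, 8);
  else if ((null_mask & 0x7) == 0)	/* '\0' in bytes 3..7 */
    copy_le_2x (dst, src, len, 4);
  else
    memcpy (dst, src, len + 1);		/* 0..3 byte strings */
  return dst;
}
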
> > +       .p2align 4,, 8
> > +L(more_1x_vec):
> > +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > +       VMOVU   %VMM(0), (%rdi)
> > +# endif
> > +       subq    %rsi, %rdi
> > +       orq     $(VEC_SIZE - 1), %rsi
> > +       addq    %rsi, %rdi
> > +       VMOVA   1(%rsi), %VMM(1)
> > +
> > +       /* Try to order stores after as many loads as is reasonable to
> > +          avoid potential false dependencies.  */
> > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > +       VMOVU   %VMM(0), (%rax)
> > +# endif
> > +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(ret_vec_x1)
> > +
> > +       VMOVA   (VEC_SIZE + 1)(%rsi), %VMM(2)
> > +       VMOVU   %VMM(1), 1(%rdi)
> > +
> > +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(ret_vec_x2)
> > +
> > +       VMOVA   (VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
> > +       VMOVU   %VMM(2), (VEC_SIZE + 1)(%rdi)
> > +
> > +       VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(ret_vec_x3)
> > +
> > +       VMOVA   (VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
> > +       VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %edx
> > +       testl   %edx, %edx
> > +       jnz     L(ret_vec_x4)
> > +
> > +       VMOVU   %VMM(4), (VEC_SIZE * 3 + 1)(%rdi)
> > +
> > +       /* Subtract rsi from rdi before aligning rsi.  Adding the aligned
> > +          rsi back gives the matching rdi (dst) for the new src.  */
> > +       subq    %rsi, %rdi
> > +       incq    %rsi
> > +       orq     $(VEC_SIZE * 4 - 1), %rsi
> > +
> > +       /* Do first half of loop ahead of time so loop can just start by
> > +          storing.  */
> > +       VMOVA   (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
> > +       VMOVA   (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
> > +       VMOVA   (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
> > +
> > +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> > +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %edx
> > +       addq    %rsi, %rdi
> > +
> > +       testl   %edx, %edx
> > +       jnz     L(loop_4x_done)
> > +
> > +       .p2align 4,, 11
> > +L(loop_4x_vec):
> > +
> > +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
> > +       subq    $(VEC_SIZE * -4), %rsi
> > +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 1)(%rdi)
> > +
> > +
> > +       VMOVA   (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
> > +       VMOVA   (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
> > +       VMOVA   (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
> > +
> > +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> > +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> > +
> > +       vpmovmskb %VMM(6), %edx
> > +       subq    $(VEC_SIZE * -4), %rdi
> > +       testl   %edx, %edx
> > +       jz      L(loop_4x_vec)
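The reason one VPCMPEQ per iteration is enough for four vectors: the
unsigned byte minimum has a zero byte exactly where at least one of the
inputs does, so comparing the min against zero finds a null in any of
the four loads.  In C terms (my names; compile with -mavx2):

#include <immintrin.h>

static int
zero_in_any_of_4 (__m256i v0, __m256i v1, __m256i v2, __m256i v3)
{
  __m256i m = _mm256_min_epu8 (_mm256_min_epu8 (v0, v1),
			       _mm256_min_epu8 (v2, v3));
  /* Non-zero iff some byte of v0..v3 is zero.  */
  return _mm256_movemask_epi8
    (_mm256_cmpeq_epi8 (m, _mm256_setzero_si256 ()));
}
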
> > +
> > +L(loop_4x_done):
> > +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(ret_vec_x1)
> > +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
> > +
> > +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(ret_vec_x2)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
> > +
> > +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(ret_vec_x3)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
> > +L(ret_vec_x4):
> > +       bsfl    %edx, %edx
> > +       VMOVU   ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> > +       VMOVU   %VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> >  # ifdef USE_AS_STPCPY
> > -       lea     3(%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     $4, %r8
> > -       lea     4(%rdi), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > +       leaq    (VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
> >  # endif
> > +L(return_end):
> >         VZEROUPPER_RETURN
> >
> > -       .p2align 4
> > -L(Exit4_7):
> > -       mov     (%rsi), %ecx
> > -       mov     %ecx, (%rdi)
> > -       mov     -3(%rsi, %rdx), %ecx
> > -       mov     %ecx, -3(%rdi, %rdx)
> > +       .p2align 4,, 8
> > +L(ret_vec_x1):
> > +       bsfl    %ecx, %ecx
> > +       VMOVU   (1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> > +       VMOVU   %VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
> >  # ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     %rdx, %r8
> > -       sub     $1, %r8
> > -       lea     1(%rdi, %rdx), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > +       leaq    1(%rcx, %rdi), %rax
> >  # endif
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(Exit8_15):
> > -       mov     (%rsi), %rcx
> > -       mov     -7(%rsi, %rdx), %r9
> > -       mov     %rcx, (%rdi)
> > -       mov     %r9, -7(%rdi, %rdx)
> > -# ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     %rdx, %r8
> > -       sub     $1, %r8
> > -       lea     1(%rdi, %rdx), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > -# endif
> > -       VZEROUPPER_RETURN
> > +L(return_vzeroupper):
> > +       ZERO_UPPER_VEC_REGISTERS_RETURN
> >
> > -       .p2align 4
> > -L(Exit16_31):
> > -       vmovdqu (%rsi), %xmm2
> > -       vmovdqu -15(%rsi, %rdx), %xmm3
> > -       vmovdqu %xmm2, (%rdi)
> > -       vmovdqu %xmm3, -15(%rdi, %rdx)
> > +       .p2align 4,, 8
> > +L(ret_vec_x2):
> > +       bsfl    %ecx, %ecx
> > +       VMOVU   ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> > +       VMOVU   %VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
> >  # ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub %rdx, %r8
> > -       sub $1, %r8
> > -       lea 1(%rdi, %rdx), %rdi
> > -       jnz L(StrncpyFillTailWithZero)
> > +       leaq    (VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
> >  # endif
> >         VZEROUPPER_RETURN
> >
> > -       .p2align 4
> > -L(Exit32_63):
> > -       vmovdqu (%rsi), %ymm2
> > -       vmovdqu -31(%rsi, %rdx), %ymm3
> > -       vmovdqu %ymm2, (%rdi)
> > -       vmovdqu %ymm3, -31(%rdi, %rdx)
> > +       .p2align 4,, 8
> > +L(ret_vec_x3):
> > +       bsfl    %ecx, %ecx
> > +       VMOVU   ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> > +       VMOVU   %VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
> >  # ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -       sub     %rdx, %r8
> > -       sub     $1, %r8
> > -       lea     1(%rdi, %rdx), %rdi
> > -       jnz     L(StrncpyFillTailWithZero)
> > +       leaq    (VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
> >  # endif
> >         VZEROUPPER_RETURN
> >
> > -# ifdef USE_AS_STRNCPY
> >
> > -       .p2align 4
> > -L(StrncpyExit1):
> > -       movzbl  (%rsi), %edx
> > -       mov     %dl, (%rdi)
> > +       .p2align 4,, 4
> > +L(page_cross):
> > +       movq    %rsi, %rcx
> > +       andq    $(VEC_SIZE * -1), %rcx
> > +
> > +       VPCMPEQ (%rcx), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       shrxl   %esi, %ecx, %ecx
> > +# if USE_MOVSB_IN_PAGE_CROSS
> > +       /* Optimize more aggressively for space here, as this is very cold
> > +          code.  This saves two cache lines.  */
> > +
> > +       /* This adds one char to the later bsf result, giving the correct
> > +          copy bound (the null terminator is included).  NB: this can never
> > +          zero out a non-zero RCX: in the page-cross case rsi cannot be
> > +          aligned, so rcx has already been shifted right by at least one.  */
> > +       shll    $CHAR_SIZE, %ecx
> > +       jz      L(page_cross_continue)
> > +       bsfl    %ecx, %ecx
> > +#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > +       movq    %rdi, %rax
> > +#  endif
> > +       rep     movsb
> >  #  ifdef USE_AS_STPCPY
> > -       lea     1(%rdi), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, 1(%rdi)
> > +       leaq    -CHAR_SIZE(%rdi), %rax
> >  #  endif
> > -       VZEROUPPER_RETURN
> >
> > -       .p2align 4
> > -L(StrncpyExit2):
> > -       movzwl  (%rsi), %edx
> > -       mov     %dx, (%rdi)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     2(%rdi), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, 2(%rdi)
> > -#  endif
> >         VZEROUPPER_RETURN
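To make the movsb path concrete: it recomputes the null mask from the
aligned base, shifts off the bytes before src, then shifts left by one
char so that bsf yields the length plus terminator, which becomes the
rep movsb count.  A C sketch of the count computation (byte variant,
names mine; it relies on src being misaligned, the precondition of this
path, so the final shift cannot lose a set bit):

#include <immintrin.h>
#include <stdint.h>

/* Returns the byte count (including the '\0') to copy, or 0 if the
   terminator is not in this vector and the normal path is taken.  */
static size_t
page_cross_count_sketch (const char *src)
{
  const char *base = (const char *) ((uintptr_t) src & ~(uintptr_t) 31);
  uint32_t m = (uint32_t) _mm256_movemask_epi8
    (_mm256_cmpeq_epi8 (_mm256_load_si256 ((const __m256i *) base),
			_mm256_setzero_si256 ()));
  m >>= (uintptr_t) src & 31;	/* drop bytes before src */
  m <<= 1;			/* the shll $CHAR_SIZE */
  return m ? (size_t) __builtin_ctz (m) : 0;
}

The caller then does the equivalent of memcpy (dst, src, n), which is
what the rep movsb amounts to, and the stpcpy variant returns the new
rdi minus one char, i.e. the address of the copied terminator.
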
> >
> > -       .p2align 4
> > -L(StrncpyExit3_4):
> > -       movzwl  (%rsi), %ecx
> > -       movzwl  -2(%rsi, %r8), %edx
> > -       mov     %cx, (%rdi)
> > -       mov     %dx, -2(%rdi, %r8)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %r8), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (%rdi, %r8)
> > -#  endif
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(StrncpyExit5_8):
> > -       mov     (%rsi), %ecx
> > -       mov     -4(%rsi, %r8), %edx
> > -       mov     %ecx, (%rdi)
> > -       mov     %edx, -4(%rdi, %r8)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %r8), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (%rdi, %r8)
> > -#  endif
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(StrncpyExit9_16):
> > -       mov     (%rsi), %rcx
> > -       mov     -8(%rsi, %r8), %rdx
> > -       mov     %rcx, (%rdi)
> > -       mov     %rdx, -8(%rdi, %r8)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %r8), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (%rdi, %r8)
> > -#  endif
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(StrncpyExit17_32):
> > -       vmovdqu (%rsi), %xmm2
> > -       vmovdqu -16(%rsi, %r8), %xmm3
> > -       vmovdqu %xmm2, (%rdi)
> > -       vmovdqu %xmm3, -16(%rdi, %r8)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %r8), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (%rdi, %r8)
> > -#  endif
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(StrncpyExit33_64):
> > -       /*  0/32, 31/16 */
> > -       vmovdqu (%rsi), %ymm2
> > -       vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
> > -       vmovdqu %ymm2, (%rdi)
> > -       vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %r8), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (%rdi, %r8)
> > -#  endif
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(StrncpyExit65):
> > -       /* 0/32, 32/32, 64/1 */
> > -       vmovdqu (%rsi), %ymm2
> > -       vmovdqu 32(%rsi), %ymm3
> > -       mov     64(%rsi), %cl
> > -       vmovdqu %ymm2, (%rdi)
> > -       vmovdqu %ymm3, 32(%rdi)
> > -       mov     %cl, 64(%rdi)
> > -#  ifdef USE_AS_STPCPY
> > -       lea     65(%rdi), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, 65(%rdi)
> > -#  endif
> > -       VZEROUPPER_RETURN
> > +# else
> > +       testl   %ecx, %ecx
> > +       jz      L(page_cross_continue)
> >
> > +       /* Traditional copy case, essentially the same as the non-page-
> > +          cross case, but since we can't reuse VMM(0) we need twice as
> > +          many loads from rsi.  */
> >  #  ifndef USE_AS_STRCAT
> > -
> > -       .p2align 4
> > -L(Fill1):
> > -       mov     %dl, (%rdi)
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(Fill2):
> > -       mov     %dx, (%rdi)
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(Fill3_4):
> > -       mov     %dx, (%rdi)
> > -       mov     %dx, -2(%rdi, %r8)
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(Fill5_8):
> > -       mov     %edx, (%rdi)
> > -       mov     %edx, -4(%rdi, %r8)
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(Fill9_16):
> > -       mov     %rdx, (%rdi)
> > -       mov     %rdx, -8(%rdi, %r8)
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(Fill17_32):
> > -       vmovdqu %xmmZ, (%rdi)
> > -       vmovdqu %xmmZ, -16(%rdi, %r8)
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(CopyVecSizeUnalignedVec2):
> > -       vmovdqu %ymm2, (%rdi, %rcx)
> > -
> > -       .p2align 4
> > -L(CopyVecSizeVecExit):
> > -       bsf     %edx, %edx
> > -       add     $(VEC_SIZE - 1), %r8
> > -       add     %rcx, %rdi
> > -#   ifdef USE_AS_STPCPY
> > -       lea     (%rdi, %rdx), %rax
> > -#   endif
> > -       sub     %rdx, %r8
> > -       lea     1(%rdi, %rdx), %rdi
> > -
> > -       .p2align 4
> > -L(StrncpyFillTailWithZero):
> > -       xor     %edx, %edx
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(StrncpyFillExit)
> > -
> > -       vmovdqu %ymmZ, (%rdi)
> > -       add     $VEC_SIZE, %rdi
> > -
> > -       mov     %rdi, %rsi
> > -       and     $(VEC_SIZE - 1), %esi
> > -       sub     %rsi, %rdi
> > -       add     %rsi, %r8
> > -       sub     $(VEC_SIZE * 4), %r8
> > -       jb      L(StrncpyFillLessFourVecSize)
> > -
> > -L(StrncpyFillLoopVmovdqa):
> > -       vmovdqa %ymmZ, (%rdi)
> > -       vmovdqa %ymmZ, VEC_SIZE(%rdi)
> > -       vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
> > -       vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
> > -       add     $(VEC_SIZE * 4), %rdi
> > -       sub     $(VEC_SIZE * 4), %r8
> > -       jae     L(StrncpyFillLoopVmovdqa)
> > -
> > -L(StrncpyFillLessFourVecSize):
> > -       add     $(VEC_SIZE * 2), %r8
> > -       jl      L(StrncpyFillLessTwoVecSize)
> > -       vmovdqa %ymmZ, (%rdi)
> > -       vmovdqa %ymmZ, VEC_SIZE(%rdi)
> > -       add     $(VEC_SIZE * 2), %rdi
> > -       sub     $VEC_SIZE, %r8
> > -       jl      L(StrncpyFillExit)
> > -       vmovdqa %ymmZ, (%rdi)
> > -       add     $VEC_SIZE, %rdi
> > -       jmp     L(Fill)
> > -
> > -       .p2align 4
> > -L(StrncpyFillLessTwoVecSize):
> > -       add     $VEC_SIZE, %r8
> > -       jl      L(StrncpyFillExit)
> > -       vmovdqa %ymmZ, (%rdi)
> > -       add     $VEC_SIZE, %rdi
> > -       jmp     L(Fill)
> > -
> > -       .p2align 4
> > -L(StrncpyFillExit):
> > -       add     $VEC_SIZE, %r8
> > -L(Fill):
> > -       cmp     $17, %r8d
> > -       jae     L(Fill17_32)
> > -       cmp     $9, %r8d
> > -       jae     L(Fill9_16)
> > -       cmp     $5, %r8d
> > -       jae     L(Fill5_8)
> > -       cmp     $3, %r8d
> > -       jae     L(Fill3_4)
> > -       cmp     $1, %r8d
> > -       ja      L(Fill2)
> > -       je      L(Fill1)
> > -       VZEROUPPER_RETURN
> > -
> > -/* end of ifndef USE_AS_STRCAT */
> > +       xorl    %edx, %edx
> >  #  endif
> > -
> > -       .p2align 4
> > -L(UnalignedLeaveCase2OrCase3):
> > -       test    %rdx, %rdx
> > -       jnz     L(UnalignedFourVecSizeLeaveCase2)
> > -L(UnalignedFourVecSizeLeaveCase3):
> > -       lea     (VEC_SIZE * 4)(%r8), %rcx
> > -       and     $-VEC_SIZE, %rcx
> > -       add     $(VEC_SIZE * 3), %r8
> > -       jl      L(CopyVecSizeCase3)
> > -       vmovdqu %ymm4, (%rdi)
> > -       sub     $VEC_SIZE, %r8
> > -       jb      L(CopyVecSizeCase3)
> > -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> > -       sub     $VEC_SIZE, %r8
> > -       jb      L(CopyVecSizeCase3)
> > -       vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> > -       sub     $VEC_SIZE, %r8
> > -       jb      L(CopyVecSizeCase3)
> > -       vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
> > +       bsfl    %ecx, %edx
> >  #  ifdef USE_AS_STPCPY
> > -       lea     (VEC_SIZE * 4)(%rdi), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (VEC_SIZE * 4)(%rdi)
> > +       leaq    (%rdi, %rdx), %rax
> > +#  elif !defined USE_AS_STRCAT
> > +       movq    %rdi, %rax
> >  #  endif
> > -       VZEROUPPER_RETURN
> >
> > -       .p2align 4
> > -L(UnalignedFourVecSizeLeaveCase2):
> > -       xor     %ecx, %ecx
> > -       vpcmpeqb %ymm4, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -       add     $(VEC_SIZE * 3), %r8
> > -       jle     L(CopyVecSizeCase2OrCase3)
> > -       test    %edx, %edx
> > -#  ifndef USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec4)
> > -#  else
> > -       jnz     L(CopyVecSize)
> > -#  endif
> > -       vpcmpeqb %ymm5, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -       vmovdqu %ymm4, (%rdi)
> > -       add     $VEC_SIZE, %rcx
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -       test    %edx, %edx
> > -#  ifndef USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec5)
> > -#  else
> > -       jnz     L(CopyVecSize)
> > -#  endif
> > +       /* vzeroupper early to avoid duplicating at each return.  */
> > +       COND_VZEROUPPER
> >
> > -       vpcmpeqb %ymm6, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -       vmovdqu %ymm5, VEC_SIZE(%rdi)
> > -       add     $VEC_SIZE, %rcx
> > -       sub     $VEC_SIZE, %r8
> > -       jbe     L(CopyVecSizeCase2OrCase3)
> > -       test    %edx, %edx
> > -#  ifndef USE_AS_STRCAT
> > -       jnz     L(CopyVecSizeUnalignedVec6)
> > -#  else
> > -       jnz     L(CopyVecSize)
> > -#  endif
> > +       testw   %cx, %cx
> > +       jz      L(page_cross_copy_16_31)
> >
> > -       vpcmpeqb %ymm7, %ymmZ, %ymmM
> > -       vpmovmskb %ymmM, %edx
> > -       vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> > -       lea     VEC_SIZE(%rdi, %rcx), %rdi
> > -       lea     VEC_SIZE(%rsi, %rcx), %rsi
> > -       bsf     %edx, %edx
> > -       cmp     %r8d, %edx
> > -       jb      L(CopyVecSizeExit)
> > -L(StrncpyExit):
> > -       cmp     $65, %r8d
> > -       je      L(StrncpyExit65)
> > -       cmp     $33, %r8d
> > -       jae     L(StrncpyExit33_64)
> > -       cmp     $17, %r8d
> > -       jae     L(StrncpyExit17_32)
> > -       cmp     $9, %r8d
> > -       jae     L(StrncpyExit9_16)
> > -       cmp     $5, %r8d
> > -       jae     L(StrncpyExit5_8)
> > -       cmp     $3, %r8d
> > -       jae     L(StrncpyExit3_4)
> > -       cmp     $1, %r8d
> > -       ja      L(StrncpyExit2)
> > -       je      L(StrncpyExit1)
> > -#  ifdef USE_AS_STPCPY
> > -       mov     %rdi, %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -       movb    $0, (%rdi)
> > -#  endif
> > -       VZEROUPPER_RETURN
> > -
> > -       .p2align 4
> > -L(ExitZero):
> > -#  ifndef USE_AS_STRCAT
> > -       mov     %rdi, %rax
> > -#  endif
> > -       VZEROUPPER_RETURN
> > +       testb   %cl, %cl
> > +       jz      L(page_cross_copy_8_15)
> >
> > -# endif
> > +       testl   $0x7, %cl
> > +       jz      L(page_cross_copy_4_7)
> >
> > -# ifndef USE_AS_STRCAT
> > -END (STRCPY)
> > -# else
> > -END (STRCAT)
> > -# endif
> > +       testl   %edx, %edx
> > +       jz      L(page_cross_set_null_term)
> > +       movzwl  (%rsi), %ecx
> > +       movw    %cx, (%rdi)
> > +L(page_cross_set_null_term):
> > +       movb    $0, (%END_REG)
> > +       ret
> > +
> > +       .p2align 4,, 4
> > +L(page_cross_copy_4_7):
> > +       movl    (%rsi), %ecx
> > +       movl    -3(%rsi, %rdx), %esi
> > +       movl    %ecx, (%rdi)
> > +       movl    %esi, -3(%END_REG)
> > +       ret
> > +
> > +       .p2align 4,, 4
> > +L(page_cross_copy_8_15):
> > +       movq    (%rsi), %rcx
> > +       movq    -7(%rsi, %rdx), %rsi
> > +       movq    %rcx, (%rdi)
> > +       movq    %rsi, -7(%END_REG)
> > +       ret
> > +
> > +
> > +       .p2align 4,, 3
> > +L(page_cross_copy_16_31):
> > +       VMOVU   (%rsi), %xmm0
> > +       VMOVU   -15(%rsi, %rdx), %xmm1
> > +       VMOVU   %xmm0, (%rdi)
> > +       VMOVU   %xmm1, -15(%END_REG)
> > +       ret
> > +# endif
> > +
> > +END(STRCPY)
> >  #endif
> > diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> > index 0dcea18dbb..2bbdbb91ab 100644
> > --- a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> > @@ -1,3 +1,3 @@
> > -#define USE_AS_STRNCAT
> > -#define STRCAT __strncat_avx2_rtm
> > -#include "strcat-avx2-rtm.S"
> > +#define STRNCAT        __strncat_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "strncat-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
> > index 52ecbca943..547cef9486 100644
> > --- a/sysdeps/x86_64/multiarch/strncat-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
> > @@ -1,7 +1,419 @@
> > -#ifndef STRNCAT
> > -# define STRNCAT       __strncat_avx2
> > -#endif
> > +/* strncat with AVX2
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if ISA_SHOULD_BUILD (3)
> > +
> > +# include <sysdep.h>
> > +
> > +# ifndef VEC_SIZE
> > +#  include "x86-avx2-vecs.h"
> > +# endif
> > +
> > +# ifndef STRNCAT
> > +#  define STRNCAT      __strncat_avx2
> > +# endif
> > +
> > +# ifdef USE_AS_WCSCPY
> > +#  define movNULL      movl
> > +#  define VPCMPEQ      vpcmpeqd
> > +#  define VPMIN        vpminud
> > +#  define CHAR_SIZE    4
> > +# else
> > +#  define movNULL      movb
> > +#  define VPCMPEQ      vpcmpeqb
> > +#  define VPMIN        vpminub
> > +#  define CHAR_SIZE    1
> > +# endif
> > +
> > +# include "strncpy-or-cat-overflow-def.h"
> > +
> > +# define PAGE_SIZE     4096
> > +
> > +# define VZERO VMM(7)
> > +# define VZERO_128     VMM_128(7)
> > +
> > +       .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRNCAT)
> > +       /* Filter zero-length strings and very long strings.  Zero-length
> > +          strings just return; very long strings are handled by using the
> > +          non-length variant {wcs|str}cat.  */
> > +       movq    %rdi, %rax
> > +# ifdef USE_AS_WCSCPY
> > +       leaq    -1(%rdx), %rcx
> > +       shr     $56, %rcx
> > +       jnz     L(zero_len)
> > +       salq    $2, %rdx
> > +# else
> > +       test    %rdx, %rdx
> > +       jl      L(zero_len)
> > +# endif
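For anyone puzzling over the (len - 1) >> 56 test: it is non-zero only
when the length is zero (the subtraction wraps) or at least 2^56, a
bound no real object can reach, so both oddball cases get diverted and,
per the comment above, the huge-length case falls back to the plain
{wcs|str}cat code.  A sketch of the decision for the wide-char variant
(my names; L(zero_len) itself is not in this hunk, so this is how I
read the comment):

#include <stddef.h>

enum ncat_path { NCAT_RETURN_DST, NCAT_UNBOUNDED, NCAT_BOUNDED };

static enum ncat_path
wcsncat_filter_sketch (size_t n)
{
  if (((n - 1) >> 56) != 0)
    return n == 0 ? NCAT_RETURN_DST : NCAT_UNBOUNDED;
  return NCAT_BOUNDED;
}
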
> > +       vpxor   %VZERO_128, %VZERO_128, %VZERO_128
> > +
> > +# include "strcat-strlen-avx2.S"
> > +
> > +       movl    %esi, %ecx
> > +       andl    $(PAGE_SIZE - 1), %ecx
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > +       ja      L(page_cross)
> > +L(page_cross_continue):
> > +       VMOVU   (%rsi), %VMM(0)
> > +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +
> > +       tzcnt   %ecx, %r8d
> > +       cmpq    %r8, %rdx
> > +       jbe     L(less_1x_vec)
> > +
> > +       testl   %ecx, %ecx
> > +       jz      L(more_1x_vec)
> > +
> > +       /* Hoist this to save code size.  */
> > +
> > +       movl    %r8d, %edx
> > +
> > +L(less_1x_vec):
> > +       COND_VZEROUPPER
> > +
> > +       cmpl    $16, %edx
> > +       jae     L(copy_16_31)
> > +       cmpl    $8, %edx
> > +       jae     L(copy_8_15)
> > +
> > +
> > +# ifdef USE_AS_WCSCPY
> > +       vmovd   %VMM_128(0), (%rdi)
> > +       movNULL $0, (%rdi, %rdx)
> > +       ret
> > +# else
> > +       cmpl    $4, %edx
> > +       jae     L(copy_4_7)
> > +
> > +       movzbl  (%rsi), %ecx
> > +       cmpl    $1, %edx
> > +       jbe     L(set_null_term)
> > +
> > +       /* NB: make this `vmovw` if support for AVX512-FP16 is added.
> > +        */
> > +       movzwl  1(%rsi), %esi
> > +       movw    %si, 1(%rdi)
> > +
> > +       .p2align 4,, 1
> > +L(set_null_term):
> > +       movb    %cl, (%rdi)
> > +       movNULL $0, (%rdi, %rdx)
> > +       ret
> > +
> > +       .p2align 4,, 11
> > +L(copy_4_7):
> > +       movl    -(4)(%rsi, %rdx), %ecx
> > +       vmovd   %xmm0, (%rdi)
> > +       movl    %ecx, -(4)(%rdi, %rdx)
> > +       movNULL $0, (%rdi, %rdx)
> > +       ret
> > +# endif
> > +
> > +
> > +       .p2align 4,, 10
> > +L(copy_16_31):
> > +       VMOVU   -(16)(%rsi, %rdx), %xmm1
> > +       VMOVU   %xmm0, (%rdi)
> > +       VMOVU   %xmm1, -(16)(%rdi, %rdx)
> > +       movNULL $0, (%rdi, %rdx)
> > +       ret
> > +
> > +       .p2align 4,, 10
> > +L(copy_8_15):
> > +       movq    -(8)(%rsi, %rdx), %rcx
> > +       vmovq   %xmm0, (%rdi)
> > +       movq    %rcx, -(8)(%rdi, %rdx)
> > +       movNULL $0, (%rdi, %rdx)
> > +       ret
> > +
> > +       .p2align 4,, 8
> > +       .p2align 6,, 14
> > +L(more_1x_vec):
> > +       VMOVU   %VMM(0), (%rdi)
> > +
> > +       /* Align rsi (src) and adjust rdx/rdi (length/dst).  */
> > +       addq    %rsi, %rdx
> > +       subq    %rsi, %rdi
> > +       orq     $(VEC_SIZE - 1), %rsi
> > +       incq    %rsi
> > +       addq    %rsi, %rdi
> > +L(loop_last_4x_vec):
> > +       subq    %rsi, %rdx
> > +       VMOVA   0(%rsi), %VMM(1)
> > +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       cmpq    $(VEC_SIZE * 2), %rdx
> > +       ja      L(more_2x_vec)
> > +L(last_2x_vec):
> > +       tzcnt   %ecx, %ecx
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_vec_x1_len)
> > +
> > +       cmpl    $VEC_SIZE, %ecx
> > +       jnz     L(ret_vec_x1)
> > +
> > +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(2)
> > +       VMOVU   %VMM(1), (%rdi)
> > +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       addl    $-VEC_SIZE, %edx
> > +       bzhil   %edx, %ecx, %r8d
> > +       jz      L(ret_vec_x2_len)
> > +L(ret_vec_x2):
> > +       bsfl    %ecx, %edx
> > +L(ret_vec_x2_len):
> > +       VMOVU   (%rsi, %rdx), %VMM(0)
> > +       movNULL $0, (VEC_SIZE)(%rdi, %rdx)
> > +       VMOVU   %VMM(0), (%rdi, %rdx)
> > +L(return_vzeroupper):
> > +       ZERO_UPPER_VEC_REGISTERS_RETURN
> > +
> > +
> > +       .p2align 4,, 12
> > +L(ret_vec_x1_len):
> > +       movl    %edx, %ecx
> > +L(ret_vec_x1):
> > +       VMOVU   -(VEC_SIZE)(%rsi, %rcx), %VMM(1)
> > +       movNULL $0, (%rdi, %rcx)
> > +       VMOVU   %VMM(1), -VEC_SIZE(%rdi, %rcx)
> > +       VZEROUPPER_RETURN
> > +
> > +       .p2align 4,, 8
> > +L(last_4x_vec):
> > +       subq    $-(VEC_SIZE * 4), %rsi
> > +       VMOVA   0(%rsi), %VMM(1)
> > +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       subq    $-(VEC_SIZE * 4), %rdi
> > +       addl    $-(VEC_SIZE * 4), %edx
> > +       cmpl    $(VEC_SIZE * 2), %edx
> > +       jbe     L(last_2x_vec)
> > +       .p2align 4,, 8
> > +L(more_2x_vec):
> > +       /* L(ret_vec_x1) expects ecx to have position of first match so
> > +          test with bsf.  */
> > +       bsfl    %ecx, %ecx
> > +       jnz     L(ret_vec_x1)
> > +
> > +       VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(2)
> > +       VMOVU   %VMM(1), (%rdi)
> > +
> > +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(ret_vec_x2)
> > +
> >
> > -#define USE_AS_STRNCAT
> > -#define STRCAT STRNCAT
> > -#include "strcat-avx2.S"
> > +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(3)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 1)(%rdi)
> > +
> > +       VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +
> > +       /* Check if length is greater than 4x VEC.  */
> > +       cmpq    $(VEC_SIZE * 4), %rdx
> > +       ja      L(more_4x_vec)
> > +
> > +       addl    $(VEC_SIZE * -2), %edx
> > +
> > +       tzcnt   %ecx, %ecx
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_vec_x3_len)
> > +
> > +       cmpl    $VEC_SIZE, %ecx
> > +       jnz     L(ret_vec_x3)
> > +
> > +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
> > +       VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       addl    $-VEC_SIZE, %edx
> > +       bzhil   %edx, %ecx, %r8d
> > +       jz      L(ret_vec_x4_len)
> > +L(ret_vec_x4):
> > +       bsfl    %ecx, %edx
> > +L(ret_vec_x4_len):
> > +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
> > +       movNULL $0, (VEC_SIZE * 3)(%rdi, %rdx)
> > +       VMOVU   %VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
> > +       VZEROUPPER_RETURN
> > +
> > +       .p2align 4,, 4
> > +L(ret_vec_x3_len):
> > +       movl    %edx, %ecx
> > +L(ret_vec_x3):
> > +       VMOVU   (VEC_SIZE)(%rsi, %rcx), %VMM(0)
> > +       movNULL $0, (VEC_SIZE * 2)(%rdi, %rcx)
> > +       VMOVU   %VMM(0), (VEC_SIZE)(%rdi, %rcx)
> > +       VZEROUPPER_RETURN
> > +
> > +
> > +       .p2align 4,, 8
> > +L(more_4x_vec):
> > +       bsfl    %ecx, %ecx
> > +       jnz     L(ret_vec_x3)
> > +
> > +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(4)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi)
> > +       VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(ret_vec_x4)
> > +
> > +       VMOVU   %VMM(4), (VEC_SIZE * 3)(%rdi)
> > +
> > +
> > +       /* Recheck length before aligning.  */
> > +       cmpq    $(VEC_SIZE * 8), %rdx
> > +       jbe     L(last_4x_vec)
> > +
> > +       /* Align rsi (src) and adjust rdx/rdi (length/dst).  */
> > +       addq    %rsi, %rdx
> > +       subq    %rsi, %rdi
> > +       subq    $-(VEC_SIZE * 4), %rsi
> > +       andq    $(VEC_SIZE * -4), %rsi
> > +
> > +       /* Do first half of loop ahead of time so loop can just start by
> > +          storing.  */
> > +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> > +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %r8d
> > +       addq    %rsi, %rdi
> > +       testl   %r8d, %r8d
> > +       jnz     L(loop_4x_done)
> > +
> > +       /* Use r9 for end of region before handling last 4x VEC
> > +          specially.  */
> > +       leaq    -(VEC_SIZE * 4)(%rdx), %r9
> > +
> > +       .p2align 4,, 11
> > +L(loop_4x_vec):
> > +
> > +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> > +       subq    $(VEC_SIZE * -4), %rsi
> > +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> > +
> > +       subq    $(VEC_SIZE * -4), %rdi
> > +       cmpq    %rsi, %r9
> > +       jbe     L(loop_last_4x_vec)
> > +
> > +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> > +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> > +
> > +       vpmovmskb %VMM(6), %r8d
> > +
> > +       testl   %r8d, %r8d
> > +       jz      L(loop_4x_vec)
> > +
> > +L(loop_4x_done):
> > +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       /* L(ret_vec_x1) expects ecx to have position of first match so
> > +          test with bsf.  */
> > +       bsfl    %ecx, %ecx
> > +       jnz     L(ret_vec_x1)
> > +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> > +
> > +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +
> > +       testl   %ecx, %ecx
> > +       jnz     L(ret_vec_x2)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> > +
> > +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       bsfl    %ecx, %ecx
> > +       jnz     L(ret_vec_x3)
> > +
> > +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> > +       bsfl    %r8d, %r8d
> > +       VMOVU   (VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
> > +       VZEROUPPER_RETURN
> > +
> > +
> > +
> > +       .p2align 4,, 4
> > +L(page_cross):
> > +       movq    %rsi, %r8
> > +       andq    $(VEC_SIZE * -1), %r8
> > +
> > +       VPCMPEQ (%r8), %VZERO, %VMM(6)
> > +
> > +       vpmovmskb %VMM(6), %ecx
> > +       shrxl   %esi, %ecx, %ecx
> > +
> > +       subl    %esi, %r8d
> > +       andl    $(VEC_SIZE - 1), %r8d
> > +       cmpq    %r8, %rdx
> > +       jb      L(page_cross_small)
> > +
> > +       /* Optimizing more aggressively for space as this is very cold
> > +          code. This saves 2x cache lines.  */
> > +
> > +       /* This shifts the mask left by CHAR_SIZE, adding CHAR_SIZE to
> > +          the later bsf result so the copy bounds include the null
> > +          terminator.  NB: this can never zero out a non-zero RCX
> > +          because, to be in the page-cross case, rsi cannot be aligned
> > +          and we have already right-shifted rcx by the misalignment.  */
> > +       shll    $CHAR_SIZE, %ecx
> > +       jz      L(page_cross_continue)
> > +       bsfl    %ecx, %ecx
> > +       rep     movsb
> > +       VZEROUPPER_RETURN
> > +
> > +L(page_cross_small):
> > +       tzcntl  %ecx, %ecx
> > +       jz      L(page_cross_setz)
> > +       cmpl    %edx, %ecx
> > +       cmova   %edx, %ecx
> > +       rep     movsb
> > +L(page_cross_setz):
> > +       movNULL $0, (%rdi)
> > +       VZEROUPPER_RETURN
> > +L(zero_len):
> > +# ifdef USE_AS_WCSCPY
> > +       test    %rdx, %rdx
> > +# endif
> > +       jnz     OVERFLOW_STRCAT
> > +       ret
> > +
> > +
> > +END(STRNCAT)
> > +#endif
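
(Aside, not part of the patch: the zero-length / very-long-length filter at
the top of ENTRY(STRNCAT) above is compact but a little subtle.  A rough C
model of the decision it makes is below; the function and enum names are
illustrative only, not code from this series.)

  #include <stdint.h>

  enum action { DO_COPY, DO_RETURN, DO_FALLBACK_STRCAT };

  /* wcsncat flavor: n is a wchar_t count.  (n - 1) >> 56 is non-zero
     when n == 0 (the subtraction wraps) or when n - 1 >= 2^56, i.e. far
     beyond any supported object size, so those cases leave the fast
     path; otherwise n is scaled to bytes (the salq $2).  */
  enum action
  wide_len_filter (uint64_t n)
  {
    if (((n - 1) >> 56) != 0)
      return n == 0 ? DO_RETURN : DO_FALLBACK_STRCAT;
    return DO_COPY;
  }

  /* strncat flavor: n is a byte count.  Only n >= 2^63 (negative when
     treated as signed) is filtered; such a length can never be reached
     before the source null terminator, so it goes to plain strcat.  */
  enum action
  byte_len_filter (uint64_t n)
  {
    return ((int64_t) n < 0) ? DO_FALLBACK_STRCAT : DO_COPY;
  }
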
> > diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> > index 79e7083299..b582a4a7a1 100644
> > --- a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> > @@ -1,3 +1,3 @@
> > -#define USE_AS_STRNCPY
> > -#define STRCPY __strncpy_avx2_rtm
> > -#include "strcpy-avx2-rtm.S"
> > +#define STRNCPY        __strncpy_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "strncpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
> > index ce634e94fa..d1b25b7a42 100644
> > --- a/sysdeps/x86_64/multiarch/strncpy-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
> > @@ -1,7 +1,735 @@
> > -#ifndef STRNCPY
> > -# define STRNCPY       __strncpy_avx2
> > -#endif
> > +/* strncpy with AVX2
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if ISA_SHOULD_BUILD (3)
> > +
> > +# include <sysdep.h>
> > +
> > +
> > +# ifndef VEC_SIZE
> > +#  include "x86-avx2-vecs.h"
> > +# endif
> > +
> > +# ifndef STRNCPY
> > +#  define STRNCPY      __strncpy_avx2
> > +# endif
> > +
> > +
> > +# ifdef USE_AS_WCSCPY
> > +#  define VPCMPEQ      vpcmpeqd
> > +#  define VPMIN        vpminud
> > +#  define CHAR_SIZE    4
> > +# else
> > +#  define VPCMPEQ      vpcmpeqb
> > +#  define VPMIN        vpminub
> > +#  define CHAR_SIZE    1
> > +# endif
> > +
> > +# include "strncpy-or-cat-overflow-def.h"
> > +
> > +# define PAGE_SIZE     4096
> > +
> > +# define VZERO VMM(7)
> > +# define VZERO_128     VMM_128(7)
> > +
> > +
> > +       .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRNCPY)
> > +       /* Filter zero-length strings and very long strings.  Zero-
> > +          length strings just return; very long strings are handled by
> > +          running rep stos{b|l} to zero-fill the destination (which
> > +          will almost certainly segfault), and if that succeeds then
> > +          calling OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy) to
> > +          finish.  */
> > +# ifdef USE_AS_WCSCPY
> > +       decq    %rdx
> > +       movq    %rdx, %rax
> > +       /* Lengths of 2^56 or more are past the end of the maximum
> > +          supported address space.  */
> > +       shr     $56, %rax
> > +       jnz     L(zero_len)
> > +       salq    $2, %rdx
> > +# else
> > +       decq    %rdx
> > +       /* `dec` can macro-fuse with `jl`.  If the branch ever needs to
> > +          become `jb`, replace `dec` with `sub` (`dec` does not set the
> > +          carry flag).  */
> > +       jl      L(zero_len)
> > +# endif
> > +
> > +       vpxor   %VZERO_128, %VZERO_128, %VZERO_128
> > +       movl    %esi, %eax
> > +       andl    $(PAGE_SIZE - 1), %eax
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +       ja      L(page_cross)
> > +
> > +L(page_cross_continue):
> > +       VMOVU   (%rsi), %VMM(0)
> > +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +
> > +       /* If not STPCPY, just set up the return value ahead of time.  */
> > +# ifndef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +# elif defined USE_AS_WCSCPY
> > +       /* Clear the dependency on %eax, as nearly all of the wcpncpy
> > +          return paths use `setc %al`.  */
> > +       xorl    %eax, %eax
> > +# endif
> > +
> > +       cmpq    $(VEC_SIZE - CHAR_SIZE), %rdx
> > +       /* `jb` because length rdx is now length - CHAR_SIZE.  */
> > +       jbe     L(less_1x_vec)
> > +
> > +       /* This may store more than necessary, but that's fine because
> > +          we still need to zero-fill.  */
> > +       VMOVU   %VMM(0), (%rdi)
> > +
> > +       testl   %ecx, %ecx
> > +       jnz     L(zfill)
> > +
> > +       /* Align.  */
> > +       addq    %rsi, %rdx
> > +       subq    %rsi, %rdi
> > +       orq     $(VEC_SIZE - 1), %rsi
> > +       incq    %rsi
> > +L(last_4x_vec):
> > +       addq    %rsi, %rdi
> > +L(loop_last_4x_vec):
> > +       subq    %rsi, %rdx
> > +
> > +
> > +       VMOVA   0(%rsi), %VMM(1)
> > +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +
> > +       cmpq    $(VEC_SIZE * 2), %rdx
> > +       jae     L(more_2x_vec)
> > +
> > +       cmpl    $(VEC_SIZE), %edx
> > +       jb      L(ret_vec_x1_len)
> > +
> > +       testl   %ecx, %ecx
> > +       jnz     L(ret_vec_x1)
> > +
> > +       VPCMPEQ VEC_SIZE(%rsi), %VZERO, %VMM(6)
> > +       VMOVU   %VMM(1), (%rdi)
> > +       vpmovmskb %VMM(6), %ecx
> > +       shlq    $VEC_SIZE, %rcx
> > +L(ret_vec_x1_len):
> > +       tzcntq  %rcx, %rcx
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_vec_x1_len_no_zfill)
> > +       /* Fall through (expectation) is copy len < buffer len.  */
> > +       VMOVU   %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > +L(ret_vec_x1_len_no_zfill_mov):
> > +       movl    %ecx, %edx
> > +# ifdef USE_AS_STPCPY
> > +       /* clear flags.  */
> > +       xorl    %ecx, %ecx
> > +# endif
> > +L(ret_vec_x1_len_no_zfill):
> > +       VMOVU   ((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> > +       VMOVU   %VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > +# ifdef USE_AS_STPCPY
> > +#  ifdef USE_AS_WCSCPY
> > +       setc    %al
> > +       addq    %rdx, %rdi
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +#  else
> > +       movl    %edx, %eax
> > +       adcq    %rdi, %rax
> > +#  endif
> > +# endif
> > +L(return_vzeroupper):
> > +       ZERO_UPPER_VEC_REGISTERS_RETURN
> > +
> > +       .p2align 4,, 6
> > +L(ret_vec_x1):
> > +       bsfl    %ecx, %ecx
> > +       VMOVU   %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > +       subl    %ecx, %edx
> > +       /* Check if we need to reload/store.  */
> > +       cmpl    $VEC_SIZE, %edx
> > +       jb      L(ret_vec_x1_len_no_zfill_mov)
> > +       /* Otherwise safe to just store directly.  */
> > +       VMOVU   %VMM(1), (%rdi)
> > +       VMOVU   %VZERO, (%rdi, %rcx)
> > +# ifdef USE_AS_STPCPY
> > +       leaq    (%rdi, %rcx), %rax
> > +# endif
> > +       VZEROUPPER_RETURN
> > +
> > +       .p2align 4,, 12
> > +L(more_2x_vec):
> > +       VMOVU   %VMM(1), (%rdi)
> > +       testl   %ecx, %ecx
> > +       /* Must fill at least 2x VEC.  */
> > +       jnz     L(zfill_vec1)
> > +
> > +       VMOVA   VEC_SIZE(%rsi), %VMM(2)
> > +       VMOVU   %VMM(2), VEC_SIZE(%rdi)
> > +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       testl   %ecx, %ecx
> > +       /* Must fill at least 1x VEC.  */
> > +       jnz     L(zfill_vec2)
> > +
> > +       VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(3)
> > +       VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +
> > +       /* Check if len is more than 4x VEC.  -CHAR_SIZE because rdx is
> > +          len - CHAR_SIZE.  */
> > +       cmpq    $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
> > +       ja      L(more_4x_vec)
> > +
> > +       subl    $(VEC_SIZE * 3), %edx
> > +       jb      L(ret_vec_x3_len)
> > +
> > +       testl   %ecx, %ecx
> > +       jnz     L(ret_vec_x3)
> > +
> > +       VPCMPEQ (VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi)
> > +       vpmovmskb %VMM(6), %ecx
> > +       tzcntl  %ecx, %ecx
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_vec_x4_len_no_zfill)
> > +       /* Fall through (expectation) is copy len < buffer len.  */
> > +       VMOVU   %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > +       movl    %ecx, %edx
> > +L(ret_vec_x4_len_no_zfill):
> > +       VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> > +       VMOVU   %VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > +# ifdef USE_AS_STPCPY
> > +#  ifdef USE_AS_WCSCPY
> > +       setc    %al
> > +       addq    %rdx, %rdi
> > +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +#  else
> > +       leal    (VEC_SIZE * 3 + 0)(%edx), %eax
> > +       adcq    %rdi, %rax
> > +#  endif
> > +# endif
> > +       VZEROUPPER_RETURN
> > +
> > +
> > +L(ret_vec_x3_len):
> > +       addl    $(VEC_SIZE * 1), %edx
> > +       tzcntl  %ecx, %ecx
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_vec_x3_len_no_zfill)
> > +       /* Fall through (expectation) is copy len < buffer len.  */
> > +       VMOVU   %VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > +L(ret_vec_x3_len_no_zfill_mov):
> > +       movl    %ecx, %edx
> > +# ifdef USE_AS_STPCPY
> > +       /* clear flags.  */
> > +       xorl    %ecx, %ecx
> > +# endif
> > +       .p2align 4,, 4
> > +L(ret_vec_x3_len_no_zfill):
> > +       VMOVU   ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> > +       VMOVU   %VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > +# ifdef USE_AS_STPCPY
> > +#  ifdef USE_AS_WCSCPY
> > +       setc    %al
> > +       addq    %rdx, %rdi
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +#  else
> > +       leal    (VEC_SIZE * 2 + 0)(%rdx), %eax
> > +       adcq    %rdi, %rax
> > +#  endif
> > +# endif
> > +       VZEROUPPER_RETURN
> > +
> > +
> > +       .p2align 4,, 8
> > +L(ret_vec_x3):
> > +       bsfl    %ecx, %ecx
> > +       VMOVU   %VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
> > +       subl    %ecx, %edx
> > +       jl      L(ret_vec_x3_len_no_zfill_mov)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi)
> > +       VMOVU   %VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
> > +# ifdef USE_AS_STPCPY
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rcx), %rax
> > +# endif
> > +       VZEROUPPER_RETURN
> > +
> > +       .p2align 4,, 8
> > +L(more_4x_vec):
> > +
> > +       VMOVU   %VMM(3), (VEC_SIZE * 2)(%rdi)
> > +       testl   %ecx, %ecx
> > +       jnz     L(zfill_vec3)
> > +
> > +       VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(4)
> > +       VMOVU   %VMM(4), (VEC_SIZE * 3)(%rdi)
> > +       VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(zfill_vec4)
> > +
> > +       movq    %rdx, %rcx
> > +       addq    %rsi, %rdx
> > +       subq    %rsi, %rdi
> > +       subq    $-(VEC_SIZE * 4), %rsi
> > +       /* Recheck length before aligning.  */
> > +       cmpq    $(VEC_SIZE * 8 - CHAR_SIZE), %rcx
> > +       jbe     L(last_4x_vec)
> > +
> > +       andq    $(VEC_SIZE * -4), %rsi
> > +
> > +       /* Do first half of loop ahead of time so loop can just start by
> > +          storing.  */
> > +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> > +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %r8d
> > +       addq    %rsi, %rdi
> > +       testl   %r8d, %r8d
> > +       jnz     L(loop_4x_done)
> > +
> > +       /* Use r9 as end register.  */
> > +       leaq    -(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
> >
> > -#define USE_AS_STRNCPY
> > -#define STRCPY STRNCPY
> > -#include "strcpy-avx2.S"
> > +       .p2align 4,, 11
> > +L(loop_4x_vec):
> > +
> > +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> > +       subq    $(VEC_SIZE * -4), %rsi
> > +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> > +
> > +       subq    $(VEC_SIZE * -4), %rdi
> > +       cmpq    %rsi, %r9
> > +       jbe     L(loop_last_4x_vec)
> > +
> > +       VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > +       VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > +       VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > +       VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > +       VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +       VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +       VPMIN   %VMM(4), %VMM(6), %VMM(6)
> > +       VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> > +
> > +       vpmovmskb %VMM(6), %r8d
> > +
> > +       testl   %r8d, %r8d
> > +       jz      L(loop_4x_vec)
> > +
> > +L(loop_4x_done):
> > +       subq    %rsi, %rdx
> > +       VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> > +       VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(zfill_vec1)
> > +
> > +       VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> > +       VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(zfill_vec2)
> > +
> > +       VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> > +       VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > +       vpmovmskb %VMM(6), %ecx
> > +       testl   %ecx, %ecx
> > +       jnz     L(zfill_vec3)
> > +
> > +       VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> > +       movl    %r8d, %ecx
> > +
> > +       // Zfill more....
> > +
> > +       .p2align 4,, 4
> > +L(zfill_vec4):
> > +       addq    $(VEC_SIZE * 2), %rdi
> > +       subq    $(VEC_SIZE * 2), %rdx
> > +L(zfill_vec2):
> > +       shlq    $VEC_SIZE, %rcx
> > +L(zfill):
> > +       bsfq    %rcx, %rcx
> > +       subq    %rcx, %rdx
> > +       addq    %rcx, %rdi
> > +# ifdef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +# endif
> > +L(zfill_from_page_cross):
> > +       cmpq    $VEC_SIZE, %rdx
> > +       jb      L(zfill_less_vec_vzeroupper)
> > +
> > +L(zfill_more_1x_vec):
> > +       VMOVU   %VZERO, CHAR_SIZE(%rdi)
> > +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
> > +       cmpq    $(VEC_SIZE * 2), %rdx
> > +       jae     L(zfill_more_2x_vec)
> > +L(zfill_done0):
> > +       VZEROUPPER_RETURN
> > +
> > +       .p2align 4,, 8
> > +L(zfill_vec3):
> > +       addq    $(VEC_SIZE * 2), %rdi
> > +       subq    $(VEC_SIZE * 2), %rdx
> > +       .p2align 4,, 2
> > +L(zfill_vec1):
> > +       bsfl    %ecx, %ecx
> > +       addq    %rcx, %rdi
> > +       subq    %rcx, %rdx
> > +# ifdef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +# endif
> > +       /* zfill from vec1/vec3 must set at least 2x VECs.  */
> > +
> > +       VMOVU   %VZERO, CHAR_SIZE(%rdi)
> > +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
> > +       cmpq    $(VEC_SIZE * 2), %rdx
> > +       jb      L(zfill_done0)
> > +L(zfill_more_2x_vec):
> > +       VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
> > +       VMOVU   %VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
> > +       subq    $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
> > +       jbe     L(zfill_done)
> > +
> > +       addq    %rdi, %rdx
> > +       VMOVU   %VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
> > +       VMOVU   %VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
> > +
> > +
> > +       VMOVU   %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> > +       VMOVU   %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> > +
> > +       subq    $-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
> > +       cmpq    %rdi, %rdx
> > +       jbe     L(zfill_done)
> > +
> > +       andq    $-(VEC_SIZE), %rdi
> > +       .p2align 4,, 12
> > +L(zfill_loop_4x_vec):
> > +       VMOVA   %VZERO, (VEC_SIZE * 0)(%rdi)
> > +       VMOVA   %VZERO, (VEC_SIZE * 1)(%rdi)
> > +       VMOVA   %VZERO, (VEC_SIZE * 2)(%rdi)
> > +       VMOVA   %VZERO, (VEC_SIZE * 3)(%rdi)
> > +       subq    $-(VEC_SIZE * 4), %rdi
> > +       cmpq    %rdi, %rdx
> > +       ja      L(zfill_loop_4x_vec)
> > +L(zfill_done):
> > +       VZEROUPPER_RETURN
> > +
> > +
> > +       .p2align 4,, 8
> > +L(copy_1x):
> > +       VMOVU   %VMM(0), (%rdi)
> > +       testl   %ecx, %ecx
> > +       jz      L(ret_32_32)
> > +L(zfill_less_vec):
> > +       bsfl    %ecx, %ecx
> > +L(zfill_less_vec_no_bsf):
> > +       subq    %rcx, %rdx
> > +       addq    %rcx, %rdi
> > +# ifdef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +# endif
> > +L(zfill_less_vec_vzeroupper):
> > +       COND_VZEROUPPER
> > +       /* We are taking advantage of the fact that to be here we must
> > +          be writing the null terminator at (%rdi, %rcx), so we have a
> > +          byte of leeway for overwriting.  */
> > +       cmpl    $16, %edx
> > +       jb      L(zfill_less_16)
> > +       VMOVU   %VZERO_128, (%rdi)
> > +       VMOVU   %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
> > +       ret
> > +# ifdef USE_AS_STPCPY
> > +L(ret_32_32):
> > +       leaq    CHAR_SIZE(%rdi, %rdx), %rax
> > +       VZEROUPPER_RETURN
> > +# endif
> > +
> > +       .p2align 4,, 4
> > +L(copy_16_31):
> > +       /* Overfill to avoid branches.  */
> > +       vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
> > +       vmovdqu %xmm0, (%rdi)
> > +       vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
> > +       cmpl    %ecx, %edx
> > +       ja      L(zfill_less_vec_no_bsf)
> > +# ifndef USE_AS_STPCPY
> > +L(ret_32_32):
> > +# else
> > +#  ifdef USE_AS_WCSCPY
> > +       setc    %al
> > +       addq    %rdx, %rdi
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +#  else
> > +       movl    %edx, %eax
> > +       adcq    %rdi, %rax
> > +#  endif
> > +# endif
> > +       VZEROUPPER_RETURN
> > +
> > +       .p2align 4,, 4
> > +L(copy_8_15):
> > +       /* Overfill to avoid branches.  */
> > +       movq    -(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
> > +       vmovq   %xmm0, (%rdi)
> > +       movq    %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_8_15)
> > +       subq    %rcx, %rdx
> > +       addq    %rcx, %rdi
> > +# ifdef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +# endif
> > +       .p2align 4,, 8
> > +L(zfill_less_16):
> > +       xorl    %ecx, %ecx
> > +       cmpl    $8, %edx
> > +       jb      L(zfill_less_8)
> > +       movq    %rcx, (%rdi)
> > +       movq    %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
> > +# ifndef USE_AS_STPCPY
> > +L(ret_8_15):
> > +# endif
> > +       ret
> > +
> > +
> > +       .p2align 4,, 8
> > +L(less_1x_vec):
> > +       /* Reuse the flags from `cmp $VEC_SIZE, %rdx`.  The idea is that
> > +          many buffer sizes are conventionally aligned.  */
> > +       je      L(copy_1x)
> > +
> > +       tzcntl  %ecx, %ecx
> > +       cmpl    $16, %edx
> > +       jae     L(copy_16_31)
> > +
> > +       COND_VZEROUPPER
> > +       cmpl    $8, %edx
> > +       jae     L(copy_8_15)
> > +# ifdef USE_AS_WCSCPY
> > +       testl   %ecx, %ecx
> > +       jz      L(zfill_less_8_set_ret)
> > +
> > +       movl    (%rsi, %rdx), %esi
> > +       vmovd   %xmm0, (%rdi)
> > +       movl    %esi, (%rdi, %rdx)
> > +
> > +#  ifdef USE_AS_STPCPY
> > +       cmpl    %ecx, %edx
> > +L(ret_8_15):
> > +       setc    %al
> > +       addq    %rdx, %rdi
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +#  endif
> > +       ret
> > +L(zfill_less_8_set_ret):
> > +       xorl    %ecx, %ecx
> > +#  ifdef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +#  endif
> > +L(zfill_less_8):
> > +       movl    %ecx, (%rdi)
> > +       movl    %ecx, (%rdi, %rdx)
> > +       ret
> > +
> > +# else
> > +       cmpl    $3, %edx
> > +       jb      L(copy_0_3)
> > +       /* Overfill to avoid branches.  */
> > +       movl    -3(%rsi, %rdx), %esi
> > +       vmovd   %xmm0, (%rdi)
> > +       movl    %esi, -3(%rdi, %rdx)
> > +       cmpl    %ecx, %edx
> > +       jbe     L(ret_4_7)
> > +       subq    %rcx, %rdx
> > +       addq    %rcx, %rdi
> > +#  ifdef USE_AS_STPCPY
> > +       movq    %rdi, %rax
> > +#  endif
> > +       xorl    %ecx, %ecx
> > +       .p2align 4,, 8
> > +L(zfill_less_8):
> > +       cmpl    $3, %edx
> > +       jb      L(zfill_less_3)
> > +       movl    %ecx, (%rdi)
> > +       movl    %ecx, -3(%rdi, %rdx)
> > +#  ifdef USE_AS_STPCPY
> > +       ret
> > +#  endif
> > +
> > +L(ret_4_7):
> > +#  ifdef USE_AS_STPCPY
> > +L(ret_8_15):
> > +       movl    %edx, %eax
> > +       adcq    %rdi, %rax
> > +#  endif
> > +       ret
> > +
> > +       .p2align 4,, 4
> > +L(zfill_less_3):
> > +       testl   %edx, %edx
> > +       jz      L(zfill_1)
> > +       movw    %cx, (%rdi)
> > +L(zfill_1):
> > +       movb    %cl, (%rdi, %rdx)
> > +       ret
> > +
> > +       .p2align 4,, 8
> > +L(copy_0_3):
> > +       vmovd   %xmm0, %r8d
> > +       testl   %edx, %edx
> > +       jz      L(copy_1)
> > +       movw    %r8w, (%rdi)
> > +       cmpl    %ecx, %edx
> > +       ja      L(zfill_from_1)
> > +       movzbl  (%rsi, %rdx), %r8d
> > +#  ifdef USE_AS_STPCPY
> > +       movl    %edx, %eax
> > +       adcq    %rdi, %rax
> > +       movb    %r8b, (%rdi, %rdx)
> > +       ret
> > +#  endif
> > +
> > +L(copy_1):
> > +#  ifdef USE_AS_STPCPY
> > +       movl    %edx, %eax
> > +       cmpl    %ecx, %edx
> > +       adcq    %rdi, %rax
> > +#  endif
> > +#  ifdef USE_AS_WCSCPY
> > +       vmovd   %xmm0, (%rdi)
> > +#  else
> > +       movb    %r8b, (%rdi, %rdx)
> > +#  endif
> > +       ret
> > +# endif
> > +
> > +       .p2align 4,, 2
> > +L(zero_len):
> > +       movq    %rdi, %rax
> > +       ret
> > +# ifndef USE_AS_WCSCPY
> > +       .p2align 4,, 8
> > +L(zfill_from_1):
> > +#  ifdef USE_AS_STPCPY
> > +       leaq    (%rdi, %rcx), %rax
> > +#  endif
> > +       movw    $0, -1(%rdi, %rdx)
> > +       ret
> > +# endif
> > +
> > +       .p2align 4,, 4
> > +       .p2align 6,, 8
> > +L(page_cross):
> > +       movq    %rsi, %rax
> > +       andq    $(VEC_SIZE * -1), %rax
> > +
> > +       VPCMPEQ (%rax), %VZERO, %VMM(6)
> > +
> > +       vpmovmskb %VMM(6), %ecx
> > +       shrxl   %esi, %ecx, %ecx
> > +
> > +       subl    %esi, %eax
> > +       andl    $(VEC_SIZE - 1), %eax
> > +       cmpq    %rax, %rdx
> > +       jb      L(page_cross_small)
> > +       /* Optimizing more aggressively for space as this is very cold
> > +          code. This saves 2x cache lines.  */
> > +
> > +       /* If rcx is non-zero then continue.  */
> > +       shl     $CHAR_SIZE, %ecx
> > +       jz      L(page_cross_continue)
> > +       bsf     %ecx, %ecx
> > +
> > +       subq    %rcx, %rdx
> > +# ifdef USE_AS_STPCPY
> > +       leaq    -CHAR_SIZE(%rdi, %rcx), %rax
> > +# else
> > +       movq    %rdi, %rax
> > +# endif
> > +
> > +       rep     movsb
> > +# ifdef USE_AS_WCSCPY
> > +       movl    $0, (%rdi)
> > +# else
> > +       movb    $0, (%rdi)
> > +# endif
> > +       jmp     L(zfill_from_page_cross)
> > +
> > +L(page_cross_small):
> > +       tzcntl  %ecx, %ecx
> > +       xorl    %eax, %eax
> > +       cmpl    %ecx, %edx
> > +       jbe     L(page_cross_copy_only)
> > +
> > +       /* Do a zfill of the tail before copying.  */
> > +       movq    %rdi, %r9
> > +       movl    %ecx, %r8d
> > +
> > +       subl    %ecx, %edx
> > +       leaq    CHAR_SIZE(%rdi, %rcx), %rdi
> > +       movl    %edx, %ecx
> > +       rep     stosb
> > +       movq    %r9, %rdi
> > +       movl    %r8d, %edx
> > +L(page_cross_copy_only):
> > +       leal    CHAR_SIZE(%rdx), %ecx
> > +# ifdef USE_AS_STPCPY
> > +#  ifdef USE_AS_WCSCPY
> > +       setc    %al
> > +       addq    %rdi, %rdx
> > +       leaq    (%rdx, %rax, CHAR_SIZE), %rax
> > +#  else
> > +       movl    %edx, %eax
> > +       adcq    %rdi, %rax
> > +#  endif
> > +# else
> > +       movq    %rdi, %rax
> > +# endif
> > +       rep     movsb
> > +       ret
> > +
> > +
> > +L(best_effort_strncpy):
> > +       movq    %rdx, %rcx
> > +       xorl    %eax, %eax
> > +       movq    %rdi, %r8
> > +       /* The length is >= 2^63.  We fully expect rep stos to segfault.
> > +          If that somehow succeeds, just strcpy to finish.
> > +        */
> > +# ifdef USE_AS_WCSCPY
> > +       rep     stosl
> > +# else
> > +       rep     stosb
> > +# endif
> > +       movq    %r8, %rdi
> > +       jmp     OVERFLOW_STRCPY
> > +END(STRNCPY)
> > +#endif
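
(Aside, not part of the patch: the "very long length" strategy described in
the entry comment of strncpy-avx2.S -- zero the destination with rep stos,
which is expected to fault for such lengths, and otherwise let the plain
copy finish -- corresponds to the L(best_effort_strncpy) block above.  In
rough C it amounts to the sketch below; this illustrates that strategy and
is not code from the series.)

  #include <string.h>

  /* Best-effort path for absurd lengths (n >= 2^63).  No valid call can
     pass such a length, so the zero-fill (rep stosb/stosl in the
     assembly) is expected to fault; if it somehow completes, the
     non-length copy (OVERFLOW_STRCPY) finishes the job.  */
  char *
  best_effort_strncpy (char *dst, const char *src, size_t n)
  {
    memset (dst, 0, n);
    return strcpy (dst, src);   /* stpcpy variants return the end.  */
  }
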
> > diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> > index dca1089060..01bead1435 100644
> > --- a/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> > +++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> > @@ -27,7 +27,10 @@
> >  #define VEC_SIZE                       32
> >  #include "x86-vec-macros.h"
> >
> > -#define USE_WITH_AVX           1
> > +#ifndef USE_WITH_AVX2
> > +# define USE_WITH_AVX          1
>
> Is this checked somewhere?
>
> > +#endif
> > +
> >  #define SECTION(p)                     p##.avx
> >
> >  /* 4-byte mov instructions with AVX2.  */
> > diff --git a/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
> > new file mode 100644
> > index 0000000000..a5966701ec
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
> > @@ -0,0 +1,26 @@
> > +/* Common config for AVX2-RTM VECs
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _X86_AVX2_RTM_VECS_H
> > +#define _X86_AVX2_RTM_VECS_H                   1
> > +
> > +#define USE_WITH_AVX2          1
> > +#include "x86-avx-rtm-vecs.h"
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/x86-avx2-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
> > new file mode 100644
> > index 0000000000..16d7ae5147
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
> > @@ -0,0 +1,27 @@
> > +/* Common config for AVX2 VECs
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _X86_AVX2_VECS_H
> > +#define _X86_AVX2_VECS_H                       1
> > +
> > +#define USE_WITH_AVX2          1
>
> Is this checked somewhere?
>
> > +
> > +#include "x86-avx-vecs.h"
> > +
> > +#endif
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 4/4] x86: Add optimized functions for the wide-character strcpy family
  2022-11-04 16:47     ` H.J. Lu
@ 2022-11-04 20:22       ` Noah Goldstein
  0 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 20:22 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Nov 4, 2022 at 9:47 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Nov 4, 2022 at 1:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Implemented:
> >     wcscat-avx2{+rtm}
> >     wcscpy-avx2{+rtm}
> >     wcpcpy-avx2{+rtm}
> >     wcsncpy-avx2{+rtm}
> >     wcpncpy-avx2{+rtm}
> >     wcsncat-avx2{+rtm}
> >     wcscat-evex
> >     wcscpy-evex
> >     wcpcpy-evex
> >     wcsncpy-evex
> >     wcpncpy-evex
> >     wcsncat-evex
> >
> > Performance Changes:
> >     Times are from N = 10 runs of the benchmark suite and are reported
> >     as the geometric mean of all ratios of New Implementation / Best
> >     Old Implementation.  Best Old Implementation was taken as the
> >     existing implementation at the highest ISA level.
> >
> >     wcscat-avx2     -> 0.975
> >     wcscpy-avx2     -> 0.591
> >     wcpcpy-avx2     -> 0.698
> >     wcsncpy-avx2    -> 0.730
> >     wcpncpy-avx2    -> 0.711
> >     wcsncat-avx2    -> 0.954
> >     wcscat-evex     -> 0.991
> >     wcscpy-evex     -> 0.587
> >     wcpcpy-evex     -> 0.695
> >     wcsncpy-evex    -> 0.719
> >     wcpncpy-evex    -> 0.694
> >     wcsncat-evex    -> 0.979
> >
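(Aside on the aggregation above, not part of the commit message: each
figure is the geometric mean of the per-benchmark new/best-old ratios, so
values below 1.0 mean the new code is faster on average.  A minimal sketch
of that computation -- illustrative only, not the script actually used --
is:)

  #include <math.h>
  #include <stddef.h>

  /* Geometric mean of per-benchmark timing ratios (new / best-old).  */
  double
  geomean (const double *ratios, size_t n)
  {
    double log_sum = 0.0;
    for (size_t i = 0; i < n; i++)
      log_sum += log (ratios[i]);
    return exp (log_sum / n);
  }
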
> > Code Size Changes:
> >     This change (compared with the last two commits without it)
> >     increases the size of libc.so by ~19kb.  For reference, this
> >     entire patchset increases libc.so by ~2.5kb (so without the
> >     wide-character functions libc.so would decrease by ~16.5kb).
> >
> > Full check passes on x86-64 and build succeeds for all ISA levels w/
> > and w/o multiarch.
>
> Please separate AVX2 and EVEX to reduce the patch size.
Done in V3.  Since the commit messages changed, the new versions are not
threaded in reply to this chain, but there are two new patches in the set
now.
>
> > ---
> >  sysdeps/x86_64/Makefile                     |   5 +
> >  sysdeps/x86_64/multiarch/Makefile           |  26 +++-
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c  | 135 +++++++++++++++++++-
> >  sysdeps/x86_64/multiarch/ifunc-wcs.h        |  60 +++++++++
> >  sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S  |   3 +
> >  sysdeps/x86_64/multiarch/wcpcpy-avx2.S      |   8 ++
> >  sysdeps/x86_64/multiarch/wcpcpy-evex.S      |   8 ++
> >  sysdeps/x86_64/multiarch/wcpcpy-generic.c   |  27 ++++
> >  sysdeps/x86_64/multiarch/wcpcpy.c           |  37 ++++++
> >  sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S |   3 +
> >  sysdeps/x86_64/multiarch/wcpncpy-avx2.S     |   8 ++
> >  sysdeps/x86_64/multiarch/wcpncpy-evex.S     |   8 ++
> >  sysdeps/x86_64/multiarch/wcpncpy-generic.c  |  27 ++++
> >  sysdeps/x86_64/multiarch/wcpncpy.c          |  37 ++++++
> >  sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S  |   3 +
> >  sysdeps/x86_64/multiarch/wcscat-avx2.S      |  10 ++
> >  sysdeps/x86_64/multiarch/wcscat-evex.S      |   9 ++
> >  sysdeps/x86_64/multiarch/wcscat-generic.c   |  27 ++++
> >  sysdeps/x86_64/multiarch/wcscat.c           |  37 ++++++
> >  sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S  |   3 +
> >  sysdeps/x86_64/multiarch/wcscpy-avx2.S      |   7 +
> >  sysdeps/x86_64/multiarch/wcscpy-evex.S      |   7 +
> >  sysdeps/x86_64/multiarch/wcscpy-generic.c   |   3 +-
> >  sysdeps/x86_64/multiarch/wcscpy.c           |  21 +++
> >  sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S |   3 +
> >  sysdeps/x86_64/multiarch/wcsncat-avx2.S     |   9 ++
> >  sysdeps/x86_64/multiarch/wcsncat-evex.S     |   9 ++
> >  sysdeps/x86_64/multiarch/wcsncat-generic.c  |  27 ++++
> >  sysdeps/x86_64/multiarch/wcsncat.c          |  34 +++++
> >  sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S |   3 +
> >  sysdeps/x86_64/multiarch/wcsncpy-avx2.S     |   7 +
> >  sysdeps/x86_64/multiarch/wcsncpy-evex.S     |   7 +
> >  sysdeps/x86_64/multiarch/wcsncpy-generic.c  |  27 ++++
> >  sysdeps/x86_64/multiarch/wcsncpy.c          |  37 ++++++
> >  sysdeps/x86_64/wcpcpy-generic.c             |  31 +++++
> >  sysdeps/x86_64/wcpcpy.S                     |  41 ++++++
> >  sysdeps/x86_64/wcpncpy-generic.c            |  31 +++++
> >  sysdeps/x86_64/wcpncpy.S                    |  41 ++++++
> >  sysdeps/x86_64/wcscat-generic.c             |  31 +++++
> >  sysdeps/x86_64/wcscat.S                     |  41 ++++++
> >  sysdeps/x86_64/wcscpy.S                     |   2 +
> >  sysdeps/x86_64/wcsncat-generic.c            |  31 +++++
> >  sysdeps/x86_64/wcsncat.S                    |  39 ++++++
> >  sysdeps/x86_64/wcsncpy-generic.c            |  31 +++++
> >  sysdeps/x86_64/wcsncpy.S                    |  41 ++++++
> >  45 files changed, 1036 insertions(+), 6 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcs.h
> >  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-evex.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-generic.c
> >  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy.c
> >  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-evex.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-generic.c
> >  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy.c
> >  create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcscat-evex.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcscat-generic.c
> >  create mode 100644 sysdeps/x86_64/multiarch/wcscat.c
> >  create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcscpy-evex.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsncat-evex.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsncat-generic.c
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsncat.c
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-evex.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-generic.c
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy.c
> >  create mode 100644 sysdeps/x86_64/wcpcpy-generic.c
> >  create mode 100644 sysdeps/x86_64/wcpcpy.S
> >  create mode 100644 sysdeps/x86_64/wcpncpy-generic.c
> >  create mode 100644 sysdeps/x86_64/wcpncpy.S
> >  create mode 100644 sysdeps/x86_64/wcscat-generic.c
> >  create mode 100644 sysdeps/x86_64/wcscat.S
> >  create mode 100644 sysdeps/x86_64/wcsncat-generic.c
> >  create mode 100644 sysdeps/x86_64/wcsncat.S
> >  create mode 100644 sysdeps/x86_64/wcsncpy-generic.c
> >  create mode 100644 sysdeps/x86_64/wcsncpy.S
> >
> > diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
> > index 3627c5659f..688eb2d7c4 100644
> > --- a/sysdeps/x86_64/Makefile
> > +++ b/sysdeps/x86_64/Makefile
> > @@ -188,8 +188,13 @@ endif
> >  ifeq ($(subdir),wcsmbs)
> >
> >  sysdep_routines += \
> > +  wcpcpy-generic \
> > +  wcpncpy-generic \
> > +  wcscat-generic \
> >    wcscpy-generic \
> > +  wcsncat-generic \
> >    wcsncmp-generic \
> > +  wcsncpy-generic \
> >    wcsnlen-generic \
> >  # sysdep_routines
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 066bfa48d9..f848fc0e28 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -131,6 +131,18 @@ endif
> >
> >  ifeq ($(subdir),wcsmbs)
> >  sysdep_routines += \
> > +  wcpcpy-avx2 \
> > +  wcpcpy-avx2-rtm \
> > +  wcpcpy-evex \
> > +  wcpcpy-generic \
> > +  wcpncpy-avx2 \
> > +  wcpncpy-avx2-rtm \
> > +  wcpncpy-evex \
> > +  wcpncpy-generic \
> > +  wcscat-avx2 \
> > +  wcscat-avx2-rtm \
> > +  wcscat-evex \
> > +  wcscat-generic \
> >    wcschr-avx2 \
> >    wcschr-avx2-rtm \
> >    wcschr-evex \
> > @@ -140,6 +152,10 @@ sysdep_routines += \
> >    wcscmp-avx2-rtm \
> >    wcscmp-evex \
> >    wcscmp-sse2 \
> > +  wcscpy-avx2 \
> > +  wcscpy-avx2-rtm \
> > +  wcscpy-evex \
> > +  wcscpy-generic \
> >    wcscpy-ssse3 \
> >    wcslen-avx2 \
> >    wcslen-avx2-rtm \
> > @@ -147,9 +163,17 @@ sysdep_routines += \
> >    wcslen-evex512 \
> >    wcslen-sse2 \
> >    wcslen-sse4_1 \
> > +  wcsncat-avx2 \
> > +  wcsncat-avx2-rtm \
> > +  wcsncat-evex \
> > +  wcsncat-generic \
> >    wcsncmp-avx2 \
> >    wcsncmp-avx2-rtm \
> >    wcsncmp-evex \
> > +  wcsncpy-avx2 \
> > +  wcsncpy-avx2-rtm \
> > +  wcsncpy-evex \
> > +  wcsncpy-generic \
> >    wcsnlen-avx2 \
> >    wcsnlen-avx2-rtm \
> >    wcsnlen-evex \
> > @@ -163,8 +187,8 @@ sysdep_routines += \
> >    wmemchr-avx2 \
> >    wmemchr-avx2-rtm \
> >    wmemchr-evex \
> > -  wmemchr-evex512 \
> >    wmemchr-evex-rtm \
> > +  wmemchr-evex512 \
> >    wmemchr-sse2 \
> >    wmemcmp-avx2-movbe \
> >    wmemcmp-avx2-movbe-rtm \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 7cebee7ec7..71e8953e91 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -901,16 +901,145 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> >    /* Support sysdeps/x86_64/multiarch/wcscpy.c.  */
> >    IFUNC_IMPL (i, name, wcscpy,
> > -             /* ISA V4 wrapper for SSSE3 implementation because
> > -                the SSSE3 implementation is also used at ISA
> > -                level 3/4.  */
> >               X86_IFUNC_IMPL_ADD_V4 (array, i, wcscpy,
> > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wcscpy_evex)
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
> > +                                    (CPU_FEATURE_USABLE (AVX2)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wcscpy_avx2)
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
> > +                                    (CPU_FEATURE_USABLE (AVX2)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)
> > +                                     && CPU_FEATURE_USABLE (RTM)),
> > +                                    __wcscpy_avx2_rtm)
> > +             X86_IFUNC_IMPL_ADD_V2 (array, i, wcscpy,
> >                                      CPU_FEATURE_USABLE (SSSE3),
> >                                      __wcscpy_ssse3)
> >               X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
> >                                      1,
> >                                      __wcscpy_generic))
> >
> > +  /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
> > +  IFUNC_IMPL (i, name, wcsncpy,
> > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
> > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wcsncpy_evex)
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
> > +                                    (CPU_FEATURE_USABLE (AVX2)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wcsncpy_avx2)
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
> > +                                    (CPU_FEATURE_USABLE (AVX2)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)
> > +                                     && CPU_FEATURE_USABLE (RTM)),
> > +                                    __wcsncpy_avx2_rtm)
> > +             X86_IFUNC_IMPL_ADD_V1 (array, i, wcpncpy,
> > +                                    1,
> > +                                    __wcsncpy_generic))
> > +
> > +  /* Support sysdeps/x86_64/multiarch/wcpcpy.c.  */
> > +  IFUNC_IMPL (i, name, wcpcpy,
> > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcpcpy,
> > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wcpcpy_evex)
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
> > +                                    (CPU_FEATURE_USABLE (AVX2)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wcpcpy_avx2)
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
> > +                                    (CPU_FEATURE_USABLE (AVX2)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)
> > +                                     && CPU_FEATURE_USABLE (RTM)),
> > +                                    __wcpcpy_avx2_rtm)
> > +             X86_IFUNC_IMPL_ADD_V1 (array, i, wcpcpy,
> > +                                    1,
> > +                                    __wcpcpy_generic))
> > +
> > +  /* Support sysdeps/x86_64/multiarch/wcpncpy.c.  */
> > +  IFUNC_IMPL (i, name, wcpncpy,
> > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcpncpy,
> > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wcpncpy_evex)
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
> > +                                    (CPU_FEATURE_USABLE (AVX2)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wcpncpy_avx2)
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
> > +                                    (CPU_FEATURE_USABLE (AVX2)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)
> > +                                     && CPU_FEATURE_USABLE (RTM)),
> > +                                    __wcpncpy_avx2_rtm)
> > +             X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncpy,
> > +                                    1,
> > +                                    __wcpncpy_generic))
> > +
> > +  /* Support sysdeps/x86_64/multiarch/wcscat.c.  */
> > +  IFUNC_IMPL (i, name, wcscat,
> > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcscat,
> > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wcscat_evex)
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
> > +                                    (CPU_FEATURE_USABLE (AVX2)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wcscat_avx2)
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
> > +                                    (CPU_FEATURE_USABLE (AVX2)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)
> > +                                     && CPU_FEATURE_USABLE (RTM)),
> > +                                    __wcscat_avx2_rtm)
> > +             X86_IFUNC_IMPL_ADD_V1 (array, i, wcscat,
> > +                                    1,
> > +                                    __wcscat_generic))
> > +
> > +  /* Support sysdeps/x86_64/multiarch/wcsncat.c.  */
> > +  IFUNC_IMPL (i, name, wcsncat,
> > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncat,
> > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wcsncat_evex)
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
> > +                                    (CPU_FEATURE_USABLE (AVX2)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wcsncat_avx2)
> > +             X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
> > +                                    (CPU_FEATURE_USABLE (AVX2)
> > +                                     && CPU_FEATURE_USABLE (BMI1)
> > +                                     && CPU_FEATURE_USABLE (BMI2)
> > +                                     && CPU_FEATURE_USABLE (RTM)),
> > +                                    __wcsncat_avx2_rtm)
> > +             X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncat,
> > +                                    1,
> > +                                    __wcsncat_generic))
> > +
> >    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
> >    IFUNC_IMPL (i, name, wcslen,
> >               X86_IFUNC_IMPL_ADD_V4 (array, i, wcslen,
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
> > new file mode 100644
> > index 0000000000..cda633d8fb
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
> > @@ -0,0 +1,60 @@
> > +/* Common definition for ifunc selections optimized wide-character
> > +   string copy functions.
> > +
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <init-arch.h>
> > +
> > +#ifndef GENERIC
> > +# define GENERIC generic
> > +#endif
> > +
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > +
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > +
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> > +
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > +  const struct cpu_features *cpu_features = __get_cpu_features ();
> > +
> > +  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > +      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1)
> > +      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > +      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> > +                                     AVX_Fast_Unaligned_Load, ))
> > +    {
> > +      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +         && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > +       return OPTIMIZE (evex);
> > +
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > +       return OPTIMIZE (avx2_rtm);
> > +
> > +      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> > +                                      Prefer_No_VZEROUPPER, !))
> > +       return OPTIMIZE (avx2);
> > +
> > +    }
> > +
> > +  return OPTIMIZE (GENERIC);
> > +}
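
A note on the macro layer this header leans on (stated as an assumption
about the usual <init-arch.h> definitions, not something this patch adds):
the including file supplies SYMBOL_NAME, and REDIRECT_NAME / OPTIMIZE then
expand to the per-function symbol names, roughly as in this sketch:

    /* Sketch of how a consumer instantiates ifunc-wcs.h; the expansions
       in the comments assume the usual <init-arch.h> macro layer.  */
    #define SYMBOL_NAME wcpcpy
    #include <init-arch.h>   /* REDIRECT_NAME   -> __redirect_wcpcpy
                                OPTIMIZE (name) -> __wcpcpy_<name>  */
    #include "ifunc-wcs.h"   /* declares __wcpcpy_evex, __wcpcpy_avx2,
                                __wcpcpy_avx2_rtm, __wcpcpy_generic and
                                defines the IFUNC_SELECTOR shown above  */
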
> > diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
> > new file mode 100644
> > index 0000000000..756280a3ab
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
> > @@ -0,0 +1,3 @@
> > +#define WCPCPY __wcpcpy_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "wcpcpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
> > new file mode 100644
> > index 0000000000..0fffd912d3
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
> > @@ -0,0 +1,8 @@
> > +#ifndef WCPCPY
> > +# define WCPCPY        __wcpcpy_avx2
> > +#endif
> > +
> > +#define USE_AS_STPCPY
> > +#define USE_AS_WCSCPY
> > +#define STRCPY WCPCPY
> > +#include "strcpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcpcpy-evex.S b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
> > new file mode 100644
> > index 0000000000..ac6429cc07
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
> > @@ -0,0 +1,8 @@
> > +#ifndef WCPCPY
> > +# define WCPCPY        __wcpcpy_evex
> > +#endif
> > +
> > +#define USE_AS_STPCPY
> > +#define USE_AS_WCSCPY
> > +#define STRCPY WCPCPY
> > +#include "strcpy-evex.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
> > new file mode 100644
> > index 0000000000..0ba29b081f
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
> > @@ -0,0 +1,27 @@
> > +/* wcpcpy.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* This generic implementation serves as the ifunc fallback when the
> > +   AVX2/EVEX implementations cannot be used.  */
> > +#include <isa-level.h>
> > +#if ISA_SHOULD_BUILD (2)
> > +
> > +# define WCPCPY __wcpcpy_generic
> > +# include <wcsmbs/wcpcpy.c>
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/wcpcpy.c b/sysdeps/x86_64/multiarch/wcpcpy.c
> > new file mode 100644
> > index 0000000000..8f96ddbc99
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcpcpy.c
> > @@ -0,0 +1,37 @@
> > +/* Multiple versions of wcpcpy.
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* Define multiple versions only for the definition in libc.  */
> > +#if IS_IN (libc)
> > +# define __wcpcpy __redirect_wcpcpy
> > +# include <wchar.h>
> > +# undef __wcpcpy
> > +
> > +# define SYMBOL_NAME wcpcpy
> > +# include <init-arch.h>
> > +
> > +# include "ifunc-wcs.h"
> > +
> > +libc_ifunc_redirected (__redirect_wcpcpy, __wcpcpy, IFUNC_SELECTOR ());
> > +weak_alias (__wcpcpy, wcpcpy)
> > +# ifdef SHARED
> > +__hidden_ver1 (__wcpcpy, __GI___wcpcpy, __redirect_wcpcpy)
> > +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpcpy);
> > +# endif
> > +#endif
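
Roughly speaking, and glossing over the __redirect_* type checking, the
libc_ifunc_redirected boilerplate above boils down to a GNU indirect
function.  A hand-written equivalent would look something like the sketch
below (wcpcpy_resolver is an invented name; the real macro generates its
own resolver):

    #include <wchar.h>

    #define SYMBOL_NAME wcpcpy
    #include <init-arch.h>
    #include "ifunc-wcs.h"   /* declares the variants and defines
                                IFUNC_SELECTOR  */

    /* Resolver: run by the dynamic linker when the symbol is relocated.  */
    static void *
    wcpcpy_resolver (void)
    {
      return IFUNC_SELECTOR ();
    }

    /* The exported symbol is bound to whatever the resolver returned.  */
    wchar_t *__wcpcpy (wchar_t *, const wchar_t *)
         __attribute__ ((ifunc ("wcpcpy_resolver")));
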
> > diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
> > new file mode 100644
> > index 0000000000..80600d6b01
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
> > @@ -0,0 +1,3 @@
> > +#define WCPNCPY        __wcpncpy_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "wcpncpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
> > new file mode 100644
> > index 0000000000..b7e594f7b7
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
> > @@ -0,0 +1,8 @@
> > +#ifndef WCPNCPY
> > +# define WCPNCPY       __wcpncpy_avx2
> > +#endif
> > +
> > +#define USE_AS_WCSCPY
> > +#define USE_AS_STPCPY
> > +#define STRNCPY        WCPNCPY
> > +#include "strncpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcpncpy-evex.S b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
> > new file mode 100644
> > index 0000000000..62ddb694fe
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
> > @@ -0,0 +1,8 @@
> > +#ifndef WCPNCPY
> > +# define WCPNCPY       __wcpncpy_evex
> > +#endif
> > +
> > +#define USE_AS_WCSCPY
> > +#define USE_AS_STPCPY
> > +#define STRNCPY        WCPNCPY
> > +#include "strncpy-evex.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
> > new file mode 100644
> > index 0000000000..4aab4ecdd2
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
> > @@ -0,0 +1,27 @@
> > +/* wcpncpy.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* This generic implementation serves as the ifunc fallback when the
> > +   AVX2/EVEX implementations cannot be used.  */
> > +#include <isa-level.h>
> > +#if ISA_SHOULD_BUILD (2)
> > +
> > +# define WCPNCPY __wcpncpy_generic
> > +# include <wcsmbs/wcpncpy.c>
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/wcpncpy.c b/sysdeps/x86_64/multiarch/wcpncpy.c
> > new file mode 100644
> > index 0000000000..ed8f307e07
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcpncpy.c
> > @@ -0,0 +1,37 @@
> > +/* Multiple versions of wcpncpy.
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* Define multiple versions only for the definition in libc.  */
> > +#if IS_IN (libc)
> > +# define __wcpncpy __redirect_wcpncpy
> > +# include <wchar.h>
> > +# undef __wcpncpy
> > +
> > +# define SYMBOL_NAME wcpncpy
> > +# include <init-arch.h>
> > +
> > +# include "ifunc-wcs.h"
> > +
> > +libc_ifunc_redirected (__redirect_wcpncpy, __wcpncpy, IFUNC_SELECTOR ());
> > +weak_alias (__wcpncpy, wcpncpy)
> > +# ifdef SHARED
> > +__hidden_ver1 (__wcpncpy, __GI___wcpncpy, __redirect_wcpncpy)
> > +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpncpy);
> > +# endif
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
> > new file mode 100644
> > index 0000000000..e99449a2dc
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
> > @@ -0,0 +1,3 @@
> > +#define WCSCAT __wcscat_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "wcscat-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2.S b/sysdeps/x86_64/multiarch/wcscat-avx2.S
> > new file mode 100644
> > index 0000000000..a20f23c09d
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcscat-avx2.S
> > @@ -0,0 +1,10 @@
> > +#ifndef WCSCAT
> > +# define WCSCAT        __wcscat_avx2
> > +#endif
> > +
> > +#define USE_AS_WCSCPY
> > +#define USE_AS_STRCAT
> > +
> > +#define STRCPY WCSCAT
> > +
> > +#include "strcpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcscat-evex.S b/sysdeps/x86_64/multiarch/wcscat-evex.S
> > new file mode 100644
> > index 0000000000..1d017e4899
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcscat-evex.S
> > @@ -0,0 +1,9 @@
> > +#ifndef WCSCAT
> > +# define WCSCAT        __wcscat_evex
> > +#endif
> > +
> > +#define USE_AS_WCSCPY
> > +#define USE_AS_STRCAT
> > +
> > +#define STRCPY WCSCAT
> > +#include "strcpy-evex.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
> > new file mode 100644
> > index 0000000000..6476f85bbb
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
> > @@ -0,0 +1,27 @@
> > +/* wcscat.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* This generic implementation serves as the ifunc fallback when the
> > +   AVX2/EVEX implementations cannot be used.  */
> > +#include <isa-level.h>
> > +#if ISA_SHOULD_BUILD (2)
> > +
> > +# define WCSCAT __wcscat_generic
> > +# include <wcsmbs/wcscat.c>
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/wcscat.c b/sysdeps/x86_64/multiarch/wcscat.c
> > new file mode 100644
> > index 0000000000..3277c44561
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcscat.c
> > @@ -0,0 +1,37 @@
> > +/* Multiple versions of wcscat.
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* Define multiple versions only for the definition in libc.  */
> > +#if IS_IN (libc)
> > +# define __wcscat __redirect_wcscat
> > +# include <wchar.h>
> > +# undef __wcscat
> > +
> > +# define SYMBOL_NAME wcscat
> > +# include <init-arch.h>
> > +
> > +# include "ifunc-wcs.h"
> > +
> > +libc_ifunc_redirected (__redirect_wcscat, __wcscat, IFUNC_SELECTOR ());
> > +weak_alias (__wcscat, wcscat)
> > +# ifdef SHARED
> > +__hidden_ver1 (__wcscat, __GI___wcscat, __redirect_wcscat)
> > +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcscat);
> > +# endif
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
> > new file mode 100644
> > index 0000000000..2f800c8d3e
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
> > @@ -0,0 +1,3 @@
> > +#define WCSCPY __wcscpy_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "wcscpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2.S b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
> > new file mode 100644
> > index 0000000000..6bc509da07
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
> > @@ -0,0 +1,7 @@
> > +#ifndef WCSCPY
> > +# define WCSCPY        __wcscpy_avx2
> > +#endif
> > +
> > +#define USE_AS_WCSCPY
> > +#define STRCPY WCSCPY
> > +#include "strcpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcscpy-evex.S b/sysdeps/x86_64/multiarch/wcscpy-evex.S
> > new file mode 100644
> > index 0000000000..1069a8e224
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcscpy-evex.S
> > @@ -0,0 +1,7 @@
> > +#ifndef WCSCPY
> > +# define WCSCPY        __wcscpy_evex
> > +#endif
> > +
> > +#define USE_AS_WCSCPY
> > +#define STRCPY WCSCPY
> > +#include "strcpy-evex.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
> > index 93d314aaad..600d606c45 100644
> > --- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
> > +++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
> > @@ -18,8 +18,7 @@
> >
> >
> >  #include <isa-level.h>
> > -
> > -#if ISA_SHOULD_BUILD (1)
> > +#if ISA_SHOULD_BUILD (2)
> >
> >  # define WCSCPY  __wcscpy_generic
> >  # include <wcsmbs/wcscpy.c>
> > diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
> > index 92c917b6b4..7f6387817b 100644
> > --- a/sysdeps/x86_64/multiarch/wcscpy.c
> > +++ b/sysdeps/x86_64/multiarch/wcscpy.c
> > @@ -26,6 +26,11 @@
> >  # define SYMBOL_NAME wcscpy
> >  # include <init-arch.h>
> >
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > +
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > +
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> >
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
> > @@ -35,6 +40,22 @@ IFUNC_SELECTOR (void)
> >  {
> >    const struct cpu_features* cpu_features = __get_cpu_features ();
> >
> > +  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > +      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1)
> > +      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > +      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load, ))
> > +    {
> > +      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +         && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > +       return OPTIMIZE (evex);
> > +
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > +       return OPTIMIZE (avx2_rtm);
> > +
> > +      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !))
> > +       return OPTIMIZE (avx2);
> > +    }
> > +
> >    if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> >      return OPTIMIZE (ssse3);
> >
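
To make the resulting preference order for wcscpy explicit, the selector
above boils down to the distilled sketch below (pick_wcscpy, the enum and
the boolean parameters are invented for illustration; the real code keeps
the CPU-feature macros shown in the hunk):

    #include <stdbool.h>

    enum wcscpy_impl { EVEX, AVX2_RTM, AVX2, SSSE3, GENERIC };

    static enum wcscpy_impl
    pick_wcscpy (bool avx2_bmi12_fastua, bool avx512vl_bw, bool rtm,
                 bool prefer_no_vzeroupper, bool ssse3)
    {
      if (avx2_bmi12_fastua)  /* AVX2 && BMI1 && BMI2 && AVX_Fast_Unaligned_Load  */
        {
          if (avx512vl_bw)    /* AVX512VL && AVX512BW  */
            return EVEX;
          if (rtm)
            return AVX2_RTM;
          if (!prefer_no_vzeroupper)
            return AVX2;
          /* AVX2-capable but Prefer_No_VZEROUPPER is set: fall through
             to the legacy choices below.  */
        }
      return ssse3 ? SSSE3 : GENERIC;
    }
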
> > diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
> > new file mode 100644
> > index 0000000000..609d6e69c0
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
> > @@ -0,0 +1,3 @@
> > +#define WCSNCAT        __wcsncat_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "wcsncat-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2.S b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
> > new file mode 100644
> > index 0000000000..a72105b7e9
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
> > @@ -0,0 +1,9 @@
> > +#ifndef WCSNCAT
> > +# define WCSNCAT       __wcsncat_avx2
> > +#endif
> > +
> > +#define USE_AS_WCSCPY
> > +#define USE_AS_STRCAT
> > +
> > +#define STRNCAT        WCSNCAT
> > +#include "strncat-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S
> > new file mode 100644
> > index 0000000000..392215950a
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S
> > @@ -0,0 +1,9 @@
> > +#ifndef WCSNCAT
> > +# define WCSNCAT       __wcsncat_evex
> > +#endif
> > +
> > +#define USE_AS_WCSCPY
> > +#define USE_AS_STRCAT
> > +
> > +#define STRNCAT        WCSNCAT
> > +#include "strncat-evex.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
> > new file mode 100644
> > index 0000000000..9ced02b35e
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
> > @@ -0,0 +1,27 @@
> > +/* wcsncat.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* This generic implementation serves as the ifunc fallback when the
> > +   AVX2/EVEX implementations cannot be used.  */
> > +#include <isa-level.h>
> > +#if ISA_SHOULD_BUILD (2)
> > +
> > +# define WCSNCAT __wcsncat_generic
> > +# include <wcsmbs/wcsncat.c>
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/wcsncat.c b/sysdeps/x86_64/multiarch/wcsncat.c
> > new file mode 100644
> > index 0000000000..49c46aef08
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsncat.c
> > @@ -0,0 +1,34 @@
> > +/* Multiple versions of wcsncat.
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* Define multiple versions only for the definition in libc.  */
> > +#if IS_IN (libc)
> > +# define wcsncat __redirect_wcsncat
> > +# include <wchar.h>
> > +# undef wcsncat
> > +
> > +# define SYMBOL_NAME wcsncat
> > +# include "ifunc-wcs.h"
> > +
> > +libc_ifunc_redirected (__redirect_wcsncat, wcsncat, IFUNC_SELECTOR ());
> > +# ifdef SHARED
> > +__hidden_ver1 (wcsncat, __GI_wcsncat, __redirect_wcsncat)
> > +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncat);
> > +# endif
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
> > new file mode 100644
> > index 0000000000..cab5a6b820
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
> > @@ -0,0 +1,3 @@
> > +#define WCSNCPY        __wcsncpy_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "wcsncpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
> > new file mode 100644
> > index 0000000000..3a1a8a372c
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
> > @@ -0,0 +1,7 @@
> > +#ifndef WCSNCPY
> > +# define WCSNCPY       __wcsncpy_avx2
> > +#endif
> > +
> > +#define USE_AS_WCSCPY
> > +#define STRNCPY        WCSNCPY
> > +#include "strncpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsncpy-evex.S b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
> > new file mode 100644
> > index 0000000000..2debb8fd6b
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
> > @@ -0,0 +1,7 @@
> > +#ifndef WCSNCPY
> > +# define WCSNCPY       __wcsncpy_evex
> > +#endif
> > +
> > +#define USE_AS_WCSCPY
> > +#define STRNCPY        WCSNCPY
> > +#include "strncpy-evex.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
> > new file mode 100644
> > index 0000000000..693521713b
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
> > @@ -0,0 +1,27 @@
> > +/* wcsncpy.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* This generic implementation serves as the ifunc fallback when the
> > +   AVX2/EVEX implementations cannot be used.  */
> > +#include <isa-level.h>
> > +#if ISA_SHOULD_BUILD (2)
> > +
> > +# define WCSNCPY __wcsncpy_generic
> > +# include <wcsmbs/wcsncpy.c>
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/wcsncpy.c b/sysdeps/x86_64/multiarch/wcsncpy.c
> > new file mode 100644
> > index 0000000000..5b89dd4d27
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsncpy.c
> > @@ -0,0 +1,37 @@
> > +/* Multiple versions of wcsncpy.
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* Define multiple versions only for the definition in libc.  */
> > +#if IS_IN (libc)
> > +# define __wcsncpy __redirect_wcsncpy
> > +# include <wchar.h>
> > +# undef __wcsncpy
> > +
> > +# define SYMBOL_NAME wcsncpy
> > +# include <init-arch.h>
> > +
> > +# include "ifunc-wcs.h"
> > +
> > +libc_ifunc_redirected (__redirect_wcsncpy, __wcsncpy, IFUNC_SELECTOR ());
> > +weak_alias (__wcsncpy, wcsncpy)
> > +# ifdef SHARED
> > +__hidden_ver1 (__wcsncpy, __GI___wcsncpy, __redirect_wcsncpy)
> > +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncpy);
> > +# endif
> > +#endif
> > diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
> > new file mode 100644
> > index 0000000000..d52525f288
> > --- /dev/null
> > +++ b/sysdeps/x86_64/wcpcpy-generic.c
> > @@ -0,0 +1,31 @@
> > +/* wcpcpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* wcpcpy non-multiarch build is split into two files,
> > +   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
> > +   for ISA level <= 3 and just uses multiarch/wcpcpy-generic.c.
> > +   This must be split into two files because we cannot include C
> > +   code from assembly or vice versa.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if MINIMUM_X86_ISA_LEVEL <= 3
> > +
> > +# include <wcsmbs/wcpcpy.c>
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
> > new file mode 100644
> > index 0000000000..ec32dc070a
> > --- /dev/null
> > +++ b/sysdeps/x86_64/wcpcpy.S
> > @@ -0,0 +1,41 @@
> > +/* wcpcpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* wcpcpy non-multiarch build is split into two files,
> > +   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
> > +   for ISA level <= 3 and just uses multiarch/wcpcpy-generic.c.
> > +   This must be split into two files because we cannot include C
> > +   code from assembly or vice versa.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if MINIMUM_X86_ISA_LEVEL >= 4
> > +
> > +# define WCPCPY        __wcpcpy
> > +
> > +# define DEFAULT_IMPL_V4       "multiarch/wcpcpy-evex.S"
> > +# define DEFAULT_IMPL_V3       "multiarch/wcpcpy-avx2.S"
> > +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> > +   should never be used from here.  */
> > +# define DEFAULT_IMPL_V1       "ERROR -- Invalid ISA IMPL"
> > +
> > +# include "isa-default-impl.h"
> > +
> > +weak_alias (__wcpcpy, wcpcpy)
> > +libc_hidden_def (__wcpcpy)
> > +#endif
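
For context on the DEFAULT_IMPL_V* macros: isa-default-impl.h is expected
to include whichever implementation matches the build's minimum ISA level,
roughly as in the simplified sketch below (the real header also verifies
that the selected macro is actually defined):

    #if MINIMUM_X86_ISA_LEVEL == 4
    # include DEFAULT_IMPL_V4   /* "multiarch/wcpcpy-evex.S"  */
    #elif MINIMUM_X86_ISA_LEVEL == 3
    # include DEFAULT_IMPL_V3   /* "multiarch/wcpcpy-avx2.S"  */
    #else
    # include DEFAULT_IMPL_V1   /* never reached here, since this file is
                                   only compiled when the level is >= 4  */
    #endif
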
> > diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
> > new file mode 100644
> > index 0000000000..871219a445
> > --- /dev/null
> > +++ b/sysdeps/x86_64/wcpncpy-generic.c
> > @@ -0,0 +1,31 @@
> > +/* wcpncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* wcpncpy non-multiarch build is split into two files,
> > +   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
> > +   for ISA level <= 3 and just uses multiarch/wcpncpy-generic.c.
> > +   This must be split into two files because we cannot include C
> > +   code from assembly or vice versa.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if MINIMUM_X86_ISA_LEVEL <= 3
> > +
> > +# include <wcsmbs/wcpncpy.c>
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
> > new file mode 100644
> > index 0000000000..68e6ff1836
> > --- /dev/null
> > +++ b/sysdeps/x86_64/wcpncpy.S
> > @@ -0,0 +1,41 @@
> > +/* wcpncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* wcpncpy non-multiarch build is split into two files,
> > +   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
> > +   for ISA level <= 3 and just uses multiarch/wcpncpy-generic.c.
> > +   This must be split into two files because we cannot include C
> > +   code from assembly or vice versa.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if MINIMUM_X86_ISA_LEVEL >= 4
> > +
> > +# define WCPNCPY       __wcpncpy
> > +
> > +# define DEFAULT_IMPL_V4       "multiarch/wcpncpy-evex.S"
> > +# define DEFAULT_IMPL_V3       "multiarch/wcpncpy-avx2.S"
> > +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> > +   should never be used from here.  */
> > +# define DEFAULT_IMPL_V1       "ERROR -- Invalid ISA IMPL"
> > +
> > +# include "isa-default-impl.h"
> > +
> > +weak_alias (__wcpncpy, wcpncpy)
> > +libc_hidden_def (__wcpncpy)
> > +#endif
> > diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
> > new file mode 100644
> > index 0000000000..85f981a81f
> > --- /dev/null
> > +++ b/sysdeps/x86_64/wcscat-generic.c
> > @@ -0,0 +1,31 @@
> > +/* wcscat dispatch for RTLD and non-multiarch .c ISA level 1 build.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* wcscat non-multiarch build is split into two files,
> > +   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
> > +   for ISA level <= 3 and just uses multiarch/wcscat-generic.c.
> > +   This must be split into two files because we cannot include C
> > +   code from assembly or vice versa.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if MINIMUM_X86_ISA_LEVEL <= 3
> > +
> > +# include <wcsmbs/wcscat.c>
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
> > new file mode 100644
> > index 0000000000..007de3c40c
> > --- /dev/null
> > +++ b/sysdeps/x86_64/wcscat.S
> > @@ -0,0 +1,41 @@
> > +/* wcscat dispatch for RTLD and non-multiarch .c ISA level 1 build.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* wcscat non-multiarch build is split into two files,
> > +   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
> > +   for ISA level <= 3 and just uses multiarch/wcscat-generic.c.
> > +   This must be split into two files because we cannot include C
> > +   code from assembly or vice versa.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if MINIMUM_X86_ISA_LEVEL >= 4
> > +
> > +# define WCSCAT        __wcscat
> > +
> > +# define DEFAULT_IMPL_V4       "multiarch/wcscat-evex.S"
> > +# define DEFAULT_IMPL_V3       "multiarch/wcscat-avx2.S"
> > +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> > +   should never be used from here.  */
> > +# define DEFAULT_IMPL_V1       "ERROR -- Invalid ISA IMPL"
> > +
> > +# include "isa-default-impl.h"
> > +
> > +weak_alias (__wcscat, wcscat)
> > +libc_hidden_def (__wcscat)
> > +#endif
> > diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
> > index 11d0bb4bab..ab9288ed74 100644
> > --- a/sysdeps/x86_64/wcscpy.S
> > +++ b/sysdeps/x86_64/wcscpy.S
> > @@ -28,6 +28,8 @@
> >
> >  # define WCSCPY        __wcscpy
> >
> > +# define DEFAULT_IMPL_V4       "multiarch/wcscpy-evex.S"
> > +# define DEFAULT_IMPL_V3       "multiarch/wcscpy-avx2.S"
> >  # define DEFAULT_IMPL_V2       "multiarch/wcscpy-ssse3.S"
> >  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> >     should never be used from here.  */
> > diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
> > new file mode 100644
> > index 0000000000..2cc0f7b11a
> > --- /dev/null
> > +++ b/sysdeps/x86_64/wcsncat-generic.c
> > @@ -0,0 +1,31 @@
> > +/* wcsncat dispatch for RTLD and non-multiarch .c ISA level 1 build.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* wcsncat non-multiarch build is split into two files,
> > +   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
> > +   for ISA level <= 3 and just uses multiarch/wcsncat-generic.c.
> > +   This must be split into two files because we cannot include C
> > +   code from assembly or vice versa.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if MINIMUM_X86_ISA_LEVEL <= 3
> > +
> > +# include <wcsmbs/wcsncat.c>
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
> > new file mode 100644
> > index 0000000000..3f4c7948db
> > --- /dev/null
> > +++ b/sysdeps/x86_64/wcsncat.S
> > @@ -0,0 +1,39 @@
> > +/* wcsncat dispatch for RTLD and non-multiarch .c ISA level 1 build.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* wcsncat non-multiarch build is split into two files,
> > +   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
> > +   for ISA level <= 3 and just uses multiarch/wcsncat-generic.c.
> > +   This must be split into two files because we cannot include C
> > +   code from assembly or vice versa.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if MINIMUM_X86_ISA_LEVEL >= 4
> > +
> > +# define WCSNCAT       wcsncat
> > +
> > +# define DEFAULT_IMPL_V4       "multiarch/wcsncat-evex.S"
> > +# define DEFAULT_IMPL_V3       "multiarch/wcsncat-avx2.S"
> > +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> > +   should never be used from here.  */
> > +# define DEFAULT_IMPL_V1       "ERROR -- Invalid ISA IMPL"
> > +
> > +# include "isa-default-impl.h"
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
> > new file mode 100644
> > index 0000000000..49d06b8ae8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/wcsncpy-generic.c
> > @@ -0,0 +1,31 @@
> > +/* wcsncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* wcsncpy non-multiarch build is split into two files,
> > +   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
> > +   for ISA level <= 3 and just uses multiarch/wcsncpy-generic.c.
> > +   This must be split into two files because we cannot include C
> > +   code from assembly or vice versa.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if MINIMUM_X86_ISA_LEVEL <= 3
> > +
> > +# include <wcsmbs/wcsncpy.c>
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
> > new file mode 100644
> > index 0000000000..e1428fd4c1
> > --- /dev/null
> > +++ b/sysdeps/x86_64/wcsncpy.S
> > @@ -0,0 +1,41 @@
> > +/* wcsncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* wcsncpy non-multiarch build is split into two files,
> > +   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
> > +   for ISA level <= 3 and just uses multiarch/wcsncpy-generic.c.
> > +   This must be split into two files because we cannot include C
> > +   code from assembly or vice versa.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if MINIMUM_X86_ISA_LEVEL >= 4
> > +
> > +# define WCSNCPY       __wcsncpy
> > +
> > +# define DEFAULT_IMPL_V4       "multiarch/wcsncpy-evex.S"
> > +# define DEFAULT_IMPL_V3       "multiarch/wcsncpy-avx2.S"
> > +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> > +   should never be used from here.  */
> > +# define DEFAULT_IMPL_V1       "ERROR -- Invalid ISA IMPL"
> > +
> > +# include "isa-default-impl.h"
> > +
> > +weak_alias (__wcsncpy, wcsncpy)
> > +libc_hidden_def (__wcsncpy)
> > +#endif
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 1/5] benchtests: Make str{n}{cat|cpy} benchmarks output json
  2022-11-04 20:13 ` [PATCH v3 1/5] " Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-11-04 20:13   ` [PATCH v3 5/5] x86: Add avx2 " Noah Goldstein
@ 2022-11-04 21:01   ` H.J. Lu
  2022-11-04 21:24     ` Noah Goldstein
  4 siblings, 1 reply; 42+ messages in thread
From: H.J. Lu @ 2022-11-04 21:01 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Nov 4, 2022 at 1:13 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Json output is easier to parse and most other benchmarks already do
> the same.
> ---
>  benchtests/bench-strcat.c  | 106 ++++++++++++++++++++++++++-------
>  benchtests/bench-strcpy.c  | 118 ++++++++++++++++++++++++-------------
>  benchtests/bench-strncat.c |  94 +++++++++++++++++++++--------
>  benchtests/bench-strncpy.c |  94 +++++++++++++++++++++--------
>  4 files changed, 297 insertions(+), 115 deletions(-)
>
> diff --git a/benchtests/bench-strcat.c b/benchtests/bench-strcat.c
> index 749318e37e..890b34b4c1 100644
> --- a/benchtests/bench-strcat.c
> +++ b/benchtests/bench-strcat.c
> @@ -35,6 +35,7 @@
>  # define SMALL_CHAR 1273
>  #endif /* WIDE */
>
> +#include "json-lib.h"
>
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
>
> @@ -49,7 +50,7 @@ IMPL (STRCAT, 1)
>  IMPL (generic_strcat, 0)
>
>  static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src)
>  {
>    size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS_LARGE;
>    timing_t start, stop, cur;
> @@ -64,7 +65,8 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
>
>    if (STRCMP (dst + k, src) != 0)
>      {
> -      error (0, 0, "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
> +      error (0, 0,
> +            "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
>              impl->name, dst, src);
>        ret = 1;
>        return;
> @@ -80,11 +82,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
> +        size_t len2, int max_char)
>  {
>    size_t i;
>    CHAR *s1, *s2;
> @@ -107,53 +110,112 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
>    for (i = 0; i < len2; i++)
>      s2[i] = 32 + 23 * i % (max_char - 32);
>
> -  printf ("Length %4zd/%4zd, alignment %2zd/%2zd:", len1, len2, align1, align2);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "len1", len1);
> +  json_attr_uint (json_ctx, "len2", len2);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
>      {
>        s2[len2] = '\0';
> -      do_one_test (impl, s2, s1);
> +      do_one_test (json_ctx, impl, s2, s1);
>      }
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i;
>
>    test_init ();
>
> -  printf ("%28s", "");
> +  test_init ();
> +
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
>    for (i = 0; i < 16; ++i)
>      {
> -      do_test (0, 0, i, i, SMALL_CHAR);
> -      do_test (0, 0, i, i, BIG_CHAR);
> -      do_test (0, i, i, i, SMALL_CHAR);
> -      do_test (i, 0, i, i, BIG_CHAR);
> +      do_test (&json_ctx, 0, 0, i, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, i, BIG_CHAR);
> +      do_test (&json_ctx, 0, i, i, i, SMALL_CHAR);
> +      do_test (&json_ctx, i, 0, i, i, BIG_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 0, 8 << i, 8 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
> -      do_test (0, 0, 8 << i, 2 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 8 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 8 << i, 2 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (i, 2 * i, 8 << i, 1, SMALL_CHAR);
> -      do_test (2 * i, i, 8 << i, 1, BIG_CHAR);
> -      do_test (i, i, 8 << i, 10, SMALL_CHAR);
> -      do_test (i, i, 8 << i, 10, BIG_CHAR);
> +      do_test (&json_ctx, i, 2 * i, 8 << i, 1, SMALL_CHAR);
> +      do_test (&json_ctx, 2 * i, i, 8 << i, 1, BIG_CHAR);
> +      do_test (&json_ctx, i, i, 8 << i, 10, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, 8 << i, 10, BIG_CHAR);
> +    }
> +
> +  for (i = 32; i < 256; i += 32)
> +    {
> +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
>      }
>
> +  for (; i < 512; i += 64)
> +    {
> +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
> +    }
> +
> +  for (; i < 1024; i += 128)
> +    {
> +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
> +    }
> +
> +  for (; i < 2048; i += 256)
> +    {
> +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
>
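
For reference, with this conversion a bench-strcat run emits JSON shaped
roughly like the sample below; the field names follow the json_* calls
above, while the concrete values (and the "hp_timing" timing type) are
only illustrative, and "ifuncs" lists impl->name for each implementation
exercised by FOR_EACH_IMPL:

    {
     "timing_type": "hp_timing",
     "functions": {
      "strcat": {
       "bench-variant": "",
       "ifuncs": ["...", "..."],
       "results": [
        {"align1": 0, "align2": 0, "len1": 0, "len2": 0, "max_char": 127,
         "timings": [12.4, 11.9]},
        ...
       ]
      }
     }
    }
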
> diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
> index 29deb8a46a..af8673e137 100644
> --- a/benchtests/bench-strcpy.c
> +++ b/benchtests/bench-strcpy.c
> @@ -26,16 +26,18 @@
>  # define SMALL_CHAR 127
>  #endif
>
> +#include "json-lib.h"
> +
>  #ifndef STRCPY_RESULT
>  # define STRCPY_RESULT(dst, len) dst
>  # define TEST_MAIN
>  # ifndef WIDE
> -#  define TEST_NAME "strcpy"
> +#   define TEST_NAME "strcpy"
>  # else
> -#  define TEST_NAME "wcscpy"
> -#  define generic_strcpy generic_wcscpy
> +#   define TEST_NAME "wcscpy"
> +#   define generic_strcpy generic_wcscpy
>  # endif
> -#include "bench-string.h"
> +# include "bench-string.h"
>
>  CHAR *
>  generic_strcpy (CHAR *dst, const CHAR *src)
> @@ -51,8 +53,8 @@ IMPL (generic_strcpy, 0)
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
>
>  static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
> -            size_t len __attribute__((unused)))
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
> +            size_t len __attribute__ ((unused)))
>  {
>    size_t i, iters = INNER_LOOP_ITERS;
>    timing_t start, stop, cur;
> @@ -77,23 +79,24 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
>    TIMING_NOW (start);
>    for (i = 0; i < iters; ++i)
>      {
> -         CALL (impl, dst, src);
> +      CALL (impl, dst, src);
>      }
>    TIMING_NOW (stop);
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t len, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
> +        int max_char)
>  {
>    size_t i;
>    CHAR *s1, *s2;
> -/* For wcscpy: align1 and align2 here mean alignment not in bytes,
> -   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> -   len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
> +  /* For wcscpy: align1 and align2 here mean alignment not in bytes,
> +     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> +     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
>    align1 &= 7;
>    if ((align1 + len) * sizeof (CHAR) >= page_size)
>      return;
> @@ -109,65 +112,96 @@ do_test (size_t align1, size_t align2, size_t len, int max_char)
>      s1[i] = 32 + 23 * i % (max_char - 32);
>    s1[len] = 0;
>
> -  printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len,
> -         align1 * sizeof (CHAR), align2 * sizeof (CHAR));
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "len", len);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, s2, s1, len);
> +    do_one_test (json_ctx, impl, s2, s1, len);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  test_main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i;
>
>    test_init ();
>
> -  printf ("%23s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
>    for (i = 0; i < 16; ++i)
>      {
> -      do_test (0, 0, i, SMALL_CHAR);
> -      do_test (0, 0, i, BIG_CHAR);
> -      do_test (0, i, i, SMALL_CHAR);
> -      do_test (i, 0, i, BIG_CHAR);
> +      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, BIG_CHAR);
> +      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
> +      do_test (&json_ctx, i, 0, i, BIG_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 0, 8 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, SMALL_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (i, 2 * i, 8 << i, SMALL_CHAR);
> -      do_test (2 * i, i, 8 << i, BIG_CHAR);
> -      do_test (i, i, 8 << i, SMALL_CHAR);
> -      do_test (i, i, 8 << i, BIG_CHAR);
> +      do_test (&json_ctx, i, 2 * i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 2 * i, i, 8 << i, BIG_CHAR);
> +      do_test (&json_ctx, i, i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, 8 << i, BIG_CHAR);
>      }
>
> -  for (i = 16; i <= 512; i+=4)
> +  for (i = 16; i <= 512; i += 4)
>      {
> -      do_test (0, 4, i, SMALL_CHAR);
> -      do_test (4, 0, i, BIG_CHAR);
> -      do_test (4, 4, i, SMALL_CHAR);
> -      do_test (2, 2, i, BIG_CHAR);
> -      do_test (2, 6, i, SMALL_CHAR);
> -      do_test (6, 2, i, BIG_CHAR);
> -      do_test (1, 7, i, SMALL_CHAR);
> -      do_test (7, 1, i, BIG_CHAR);
> -      do_test (3, 4, i, SMALL_CHAR);
> -      do_test (4, 3, i, BIG_CHAR);
> -      do_test (5, 7, i, SMALL_CHAR);
> -      do_test (7, 5, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 4, i, SMALL_CHAR);
> +      do_test (&json_ctx, 4, 0, i, BIG_CHAR);
> +      do_test (&json_ctx, 4, 4, i, SMALL_CHAR);
> +      do_test (&json_ctx, 2, 2, i, BIG_CHAR);
> +      do_test (&json_ctx, 2, 6, i, SMALL_CHAR);
> +      do_test (&json_ctx, 6, 2, i, BIG_CHAR);
> +      do_test (&json_ctx, 1, 7, i, SMALL_CHAR);
> +      do_test (&json_ctx, 7, 1, i, BIG_CHAR);
> +      do_test (&json_ctx, 3, 4, i, SMALL_CHAR);
> +      do_test (&json_ctx, 4, 3, i, BIG_CHAR);
> +      do_test (&json_ctx, 5, 7, i, SMALL_CHAR);
> +      do_test (&json_ctx, 7, 5, i, SMALL_CHAR);
>      }
>
> +  for (i = 1; i < 2048; i += i)
> +    {
> +      do_test (&json_ctx, 1, 0, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, i, SMALL_CHAR);
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
>
> diff --git a/benchtests/bench-strncat.c b/benchtests/bench-strncat.c
> index b148c55279..5ccc09a4f8 100644
> --- a/benchtests/bench-strncat.c
> +++ b/benchtests/bench-strncat.c
> @@ -33,6 +33,8 @@
>  # define SMALL_CHAR 1273
>  #endif /* WIDE */
>
> +#include "json-lib.h"
> +
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
>
>  CHAR *
> @@ -49,7 +51,8 @@ IMPL (STRNCAT, 2)
>  IMPL (generic_strncat, 0)
>
>  static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
> +            size_t n)
>  {
>    size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS8;
>    timing_t start, stop, cur;
> @@ -65,8 +68,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
>    size_t len = STRLEN (src);
>    if (MEMCMP (dst + k, src, len + 1 > n ? n : len + 1) != 0)
>      {
> -      error (0, 0, "Incorrect concatenation in function %s",
> -            impl->name);
> +      error (0, 0, "Incorrect concatenation in function %s", impl->name);
>        ret = 1;
>        return;
>      }
> @@ -88,12 +90,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t len1, size_t len2,
> -        size_t n, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
> +        size_t len2, size_t n, int max_char)
>  {
>    size_t i;
>    CHAR *s1, *s2;
> @@ -118,53 +120,93 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
>    for (i = 0; i < len2; i++)
>      s2[i] = 32 + 23 * i % (max_char - 32);
>
> -  printf ("Length %4zd/%4zd, alignment %2zd/%2zd, N %4zd:",
> -         len1, len2, align1, align2, n);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "len1", len1);
> +  json_attr_uint (json_ctx, "len2", len2);
> +  json_attr_uint (json_ctx, "n", n);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
>      {
>        s2[len2] = '\0';
> -      do_one_test (impl, s2, s1, n);
> +      do_one_test (json_ctx, impl, s2, s1, n);
>      }
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  int
>  main (void)
>  {
> +  json_ctx_t json_ctx;
>    size_t i, n;
>
>    test_init ();
>
> -  printf ("%28s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
> -  for (n = 2; n <= 2048; n*=4)
> +  for (n = 2; n <= 2048; n *= 4)
>      {
> -      do_test (0, 2, 2, 2, n, SMALL_CHAR);
> -      do_test (0, 0, 4, 4, n, SMALL_CHAR);
> -      do_test (4, 0, 4, 4, n, BIG_CHAR);
> -      do_test (0, 0, 8, 8, n, SMALL_CHAR);
> -      do_test (0, 8, 8, 8, n, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 2, 2, 2, n, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 4, 4, n, SMALL_CHAR);
> +      do_test (&json_ctx, 4, 0, 4, 4, n, BIG_CHAR);
> +      do_test (&json_ctx, 0, 0, 8, 8, n, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 8, 8, 8, n, SMALL_CHAR);
>
>        for (i = 1; i < 8; ++i)
>         {
> -         do_test (0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
> -         do_test (8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
> -         do_test (0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
> -         do_test (8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
>         }
>
>        for (i = 1; i < 8; ++i)
>         {
> -         do_test (i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
> -         do_test (2 * i, i, 8 << i, 1, n, BIG_CHAR);
> -         do_test (i, i, 8 << i, 10, n, SMALL_CHAR);
> +         do_test (&json_ctx, i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
> +         do_test (&json_ctx, 2 * i, i, 8 << i, 1, n, BIG_CHAR);
> +         do_test (&json_ctx, i, i, 8 << i, 10, n, SMALL_CHAR);
>         }
>      }
>
> +  for (i = 128; i < 2048; i += i)
> +    {
> +      for (n = i - 64; n <= i + 64; n += 32)
> +       {
> +         do_test (&json_ctx, 1, 0, i, i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 0, i, i, i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, i, i, n, SMALL_CHAR);
> +         do_test (&json_ctx, i, i, i, i, n, SMALL_CHAR);
> +         do_test (&json_ctx, 1, 0, i, n, i, SMALL_CHAR);
> +         do_test (&json_ctx, 0, i, i, n, i, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, i, n, i, SMALL_CHAR);
> +         do_test (&json_ctx, i, i, i, n, i, SMALL_CHAR);
> +       }
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
> diff --git a/benchtests/bench-strncpy.c b/benchtests/bench-strncpy.c
> index 8207d99f4d..f621cbfe09 100644
> --- a/benchtests/bench-strncpy.c
> +++ b/benchtests/bench-strncpy.c
> @@ -24,6 +24,8 @@
>  # define SMALL_CHAR 127
>  #endif /* !WIDE */
>
> +#include "json-lib.h"
> +
>  #ifndef STRNCPY_RESULT
>  # define STRNCPY_RESULT(dst, len, n) dst
>  # define TEST_MAIN
> @@ -52,7 +54,8 @@ IMPL (generic_strncpy, 0)
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
>
>  static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
> +            size_t len, size_t n)
>  {
>    size_t i, iters = INNER_LOOP_ITERS_LARGE * (4 / CHARBYTES);
>    timing_t start, stop, cur;
> @@ -77,7 +80,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
>        size_t i;
>
>        for (i = len; i < n; ++i)
> -       if (dst [i] != '\0')
> +       if (dst[i] != '\0')
>           {
>             error (0, 0, "Wrong result in function %s", impl->name);
>             ret = 1;
> @@ -94,17 +97,18 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
>
>    TIMING_DIFF (cur, start, stop);
>
> -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +  json_element_double (json_ctx, (double) cur / (double) iters);
>  }
>
>  static void
> -do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
> +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
> +        size_t n, int max_char)
>  {
>    size_t i;
>    CHAR *s1, *s2;
>
> -/* For wcsncpy: align1 and align2 here mean alignment not in bytes,
> -   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
> +  /* For wcsncpy: align1 and align2 here mean alignment not in bytes,
> +     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
>    align1 &= 7;
>    if ((align1 + len) * sizeof (CHAR) >= page_size)
>      return;
> @@ -123,46 +127,86 @@ do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
>         ++i)
>      s1[i] = 32 + 32 * i % (max_char - 32);
>
> -  printf ("Length %4zd, n %4zd, alignment %2zd/%2zd:", len, n, align1, align2);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "align1", align1);
> +  json_attr_uint (json_ctx, "align2", align2);
> +  json_attr_uint (json_ctx, "len", len);
> +  json_attr_uint (json_ctx, "n", n);
> +  json_attr_uint (json_ctx, "max_char", max_char);
> +
> +  json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (impl, s2, s1, len, n);
> +    do_one_test (json_ctx, impl, s2, s1, len, n);
>
> -  putchar ('\n');
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
>  }
>
>  static int
>  test_main (void)
>  {
> -  size_t i;
> +  json_ctx_t json_ctx;
> +  size_t i, j;
>
>    test_init ();
>
> -  printf ("%28s", "");
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -    printf ("\t%s", impl->name);
> -  putchar ('\n');
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (i, i, 16, 16, SMALL_CHAR);
> -      do_test (i, i, 16, 16, BIG_CHAR);
> -      do_test (i, 2 * i, 16, 16, SMALL_CHAR);
> -      do_test (2 * i, i, 16, 16, BIG_CHAR);
> -      do_test (8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
> -      do_test (2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
> -      do_test (2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
> +      do_test (&json_ctx, i, i, 16, 16, SMALL_CHAR);
> +      do_test (&json_ctx, i, i, 16, 16, BIG_CHAR);
> +      do_test (&json_ctx, i, 2 * i, 16, 16, SMALL_CHAR);
> +      do_test (&json_ctx, 2 * i, i, 16, 16, BIG_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
> +      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
>      }
>
>    for (i = 1; i < 8; ++i)
>      {
> -      do_test (0, 0, 4 << i, 8 << i, SMALL_CHAR);
> -      do_test (0, 0, 16 << i, 8 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
> -      do_test (8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 4 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 0, 0, 16 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
> +      do_test (&json_ctx, 8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
>      }
>
> +  for (i = 128; i < 2048; i += i)
> +    {
> +      for (j = i - 64; j <= i + 64; j += 32)
> +       {
> +         do_test (&json_ctx, 1, 0, i, j, SMALL_CHAR);
> +         do_test (&json_ctx, 0, i, i, j, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, i, j, SMALL_CHAR);
> +         do_test (&json_ctx, i, i, i, j, SMALL_CHAR);
> +         do_test (&json_ctx, 1, 0, j, i, SMALL_CHAR);
> +         do_test (&json_ctx, 0, i, j, i, SMALL_CHAR);
> +         do_test (&json_ctx, 0, 0, j, i, SMALL_CHAR);
> +         do_test (&json_ctx, i, i, j, i, SMALL_CHAR);
> +       }
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
>    return ret;
>  }
>
> --
> 2.34.1
>
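As a side note for anyone scripting against the new output: after this
change each of these benchmarks should emit a JSON document roughly
shaped like the sketch below (abridged; the timing values and the
ifunc list are made up here for illustration and depend on the machine
and build):

    {
      "timing_type": "hp_timing",
      "functions": {
        "strcpy": {
          "bench-variant": "",
          "ifuncs": ["__strcpy_evex", "generic_strcpy"],
          "results": [
            { "align1": 0, "align2": 0, "len": 16, "max_char": 127,
              "timings": [4.12, 9.87] },
            ...
          ]
        }
      }
    }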

LGTM.

This patch is standalone.  Please check it in first.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 1/5] benchtests: Make str{n}{cat|cpy} benchmarks output json
  2022-11-04 21:01   ` [PATCH v3 1/5] benchtests: Make str{n}{cat|cpy} benchmarks output json H.J. Lu
@ 2022-11-04 21:24     ` Noah Goldstein
  0 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 21:24 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Nov 4, 2022 at 2:02 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Nov 4, 2022 at 1:13 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Json output is easier to parse and most other benchmarks already do
> > the same.
> > ---
> >  benchtests/bench-strcat.c  | 106 ++++++++++++++++++++++++++-------
> >  benchtests/bench-strcpy.c  | 118 ++++++++++++++++++++++++-------------
> >  benchtests/bench-strncat.c |  94 +++++++++++++++++++++--------
> >  benchtests/bench-strncpy.c |  94 +++++++++++++++++++++--------
> >  4 files changed, 297 insertions(+), 115 deletions(-)
> >
> > diff --git a/benchtests/bench-strcat.c b/benchtests/bench-strcat.c
> > index 749318e37e..890b34b4c1 100644
> > --- a/benchtests/bench-strcat.c
> > +++ b/benchtests/bench-strcat.c
> > @@ -35,6 +35,7 @@
> >  # define SMALL_CHAR 1273
> >  #endif /* WIDE */
> >
> > +#include "json-lib.h"
> >
> >  typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
> >
> > @@ -49,7 +50,7 @@ IMPL (STRCAT, 1)
> >  IMPL (generic_strcat, 0)
> >
> >  static void
> > -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
> > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src)
> >  {
> >    size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS_LARGE;
> >    timing_t start, stop, cur;
> > @@ -64,7 +65,8 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
> >
> >    if (STRCMP (dst + k, src) != 0)
> >      {
> > -      error (0, 0, "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
> > +      error (0, 0,
> > +            "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
> >              impl->name, dst, src);
> >        ret = 1;
> >        return;
> > @@ -80,11 +82,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
> >
> >    TIMING_DIFF (cur, start, stop);
> >
> > -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> > +  json_element_double (json_ctx, (double) cur / (double) iters);
> >  }
> >
> >  static void
> > -do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
> > +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
> > +        size_t len2, int max_char)
> >  {
> >    size_t i;
> >    CHAR *s1, *s2;
> > @@ -107,53 +110,112 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2, int max_char)
> >    for (i = 0; i < len2; i++)
> >      s2[i] = 32 + 23 * i % (max_char - 32);
> >
> > -  printf ("Length %4zd/%4zd, alignment %2zd/%2zd:", len1, len2, align1, align2);
> > +  json_element_object_begin (json_ctx);
> > +  json_attr_uint (json_ctx, "align1", align1);
> > +  json_attr_uint (json_ctx, "align2", align2);
> > +  json_attr_uint (json_ctx, "len1", len1);
> > +  json_attr_uint (json_ctx, "len2", len2);
> > +  json_attr_uint (json_ctx, "max_char", max_char);
> > +
> > +  json_array_begin (json_ctx, "timings");
> >
> >    FOR_EACH_IMPL (impl, 0)
> >      {
> >        s2[len2] = '\0';
> > -      do_one_test (impl, s2, s1);
> > +      do_one_test (json_ctx, impl, s2, s1);
> >      }
> >
> > -  putchar ('\n');
> > +  json_array_end (json_ctx);
> > +  json_element_object_end (json_ctx);
> >  }
> >
> >  int
> >  test_main (void)
> >  {
> > +  json_ctx_t json_ctx;
> >    size_t i;
> >
> >    test_init ();
> >
> > -  printf ("%28s", "");
> > +  test_init ();
> > +
> > +  json_init (&json_ctx, 0, stdout);
> > +
> > +  json_document_begin (&json_ctx);
> > +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> > +
> > +  json_attr_object_begin (&json_ctx, "functions");
> > +  json_attr_object_begin (&json_ctx, TEST_NAME);
> > +  json_attr_string (&json_ctx, "bench-variant", "");
> > +
> > +  json_array_begin (&json_ctx, "ifuncs");
> >    FOR_EACH_IMPL (impl, 0)
> > -    printf ("\t%s", impl->name);
> > -  putchar ('\n');
> > +    json_element_string (&json_ctx, impl->name);
> > +  json_array_end (&json_ctx);
> > +
> > +  json_array_begin (&json_ctx, "results");
> >
> >    for (i = 0; i < 16; ++i)
> >      {
> > -      do_test (0, 0, i, i, SMALL_CHAR);
> > -      do_test (0, 0, i, i, BIG_CHAR);
> > -      do_test (0, i, i, i, SMALL_CHAR);
> > -      do_test (i, 0, i, i, BIG_CHAR);
> > +      do_test (&json_ctx, 0, 0, i, i, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 0, i, i, BIG_CHAR);
> > +      do_test (&json_ctx, 0, i, i, i, SMALL_CHAR);
> > +      do_test (&json_ctx, i, 0, i, i, BIG_CHAR);
> >      }
> >
> >    for (i = 1; i < 8; ++i)
> >      {
> > -      do_test (0, 0, 8 << i, 8 << i, SMALL_CHAR);
> > -      do_test (8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
> > -      do_test (0, 0, 8 << i, 2 << i, SMALL_CHAR);
> > -      do_test (8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 0, 8 << i, 8 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 0, 8 << i, 2 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, SMALL_CHAR);
> >      }
> >
> >    for (i = 1; i < 8; ++i)
> >      {
> > -      do_test (i, 2 * i, 8 << i, 1, SMALL_CHAR);
> > -      do_test (2 * i, i, 8 << i, 1, BIG_CHAR);
> > -      do_test (i, i, 8 << i, 10, SMALL_CHAR);
> > -      do_test (i, i, 8 << i, 10, BIG_CHAR);
> > +      do_test (&json_ctx, i, 2 * i, 8 << i, 1, SMALL_CHAR);
> > +      do_test (&json_ctx, 2 * i, i, 8 << i, 1, BIG_CHAR);
> > +      do_test (&json_ctx, i, i, 8 << i, 10, SMALL_CHAR);
> > +      do_test (&json_ctx, i, i, 8 << i, 10, BIG_CHAR);
> > +    }
> > +
> > +  for (i = 32; i < 256; i += 32)
> > +    {
> > +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> > +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
> >      }
> >
> > +  for (; i < 512; i += 64)
> > +    {
> > +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> > +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
> > +    }
> > +
> > +  for (; i < 1024; i += 128)
> > +    {
> > +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> > +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
> > +    }
> > +
> > +  for (; i < 2048; i += 256)
> > +    {
> > +      do_test (&json_ctx, 1, 0, i, 31, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, i, i, 31, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 0, i, 31, SMALL_CHAR);
> > +      do_test (&json_ctx, i, i, i, 31, SMALL_CHAR);
> > +    }
> > +
> > +  json_array_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_document_end (&json_ctx);
> > +
> >    return ret;
> >  }
> >
> > diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
> > index 29deb8a46a..af8673e137 100644
> > --- a/benchtests/bench-strcpy.c
> > +++ b/benchtests/bench-strcpy.c
> > @@ -26,16 +26,18 @@
> >  # define SMALL_CHAR 127
> >  #endif
> >
> > +#include "json-lib.h"
> > +
> >  #ifndef STRCPY_RESULT
> >  # define STRCPY_RESULT(dst, len) dst
> >  # define TEST_MAIN
> >  # ifndef WIDE
> > -#  define TEST_NAME "strcpy"
> > +#   define TEST_NAME "strcpy"
> >  # else
> > -#  define TEST_NAME "wcscpy"
> > -#  define generic_strcpy generic_wcscpy
> > +#   define TEST_NAME "wcscpy"
> > +#   define generic_strcpy generic_wcscpy
> >  # endif
> > -#include "bench-string.h"
> > +# include "bench-string.h"
> >
> >  CHAR *
> >  generic_strcpy (CHAR *dst, const CHAR *src)
> > @@ -51,8 +53,8 @@ IMPL (generic_strcpy, 0)
> >  typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
> >
> >  static void
> > -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
> > -            size_t len __attribute__((unused)))
> > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
> > +            size_t len __attribute__ ((unused)))
> >  {
> >    size_t i, iters = INNER_LOOP_ITERS;
> >    timing_t start, stop, cur;
> > @@ -77,23 +79,24 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
> >    TIMING_NOW (start);
> >    for (i = 0; i < iters; ++i)
> >      {
> > -         CALL (impl, dst, src);
> > +      CALL (impl, dst, src);
> >      }
> >    TIMING_NOW (stop);
> >
> >    TIMING_DIFF (cur, start, stop);
> >
> > -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> > +  json_element_double (json_ctx, (double) cur / (double) iters);
> >  }
> >
> >  static void
> > -do_test (size_t align1, size_t align2, size_t len, int max_char)
> > +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
> > +        int max_char)
> >  {
> >    size_t i;
> >    CHAR *s1, *s2;
> > -/* For wcscpy: align1 and align2 here mean alignment not in bytes,
> > -   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> > -   len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
> > +  /* For wcscpy: align1 and align2 here mean alignment not in bytes,
> > +     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
> > +     len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
> >    align1 &= 7;
> >    if ((align1 + len) * sizeof (CHAR) >= page_size)
> >      return;
> > @@ -109,65 +112,96 @@ do_test (size_t align1, size_t align2, size_t len, int max_char)
> >      s1[i] = 32 + 23 * i % (max_char - 32);
> >    s1[len] = 0;
> >
> > -  printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len,
> > -         align1 * sizeof (CHAR), align2 * sizeof (CHAR));
> > +  json_element_object_begin (json_ctx);
> > +  json_attr_uint (json_ctx, "align1", align1);
> > +  json_attr_uint (json_ctx, "align2", align2);
> > +  json_attr_uint (json_ctx, "len", len);
> > +  json_attr_uint (json_ctx, "max_char", max_char);
> > +
> > +  json_array_begin (json_ctx, "timings");
> >
> >    FOR_EACH_IMPL (impl, 0)
> > -    do_one_test (impl, s2, s1, len);
> > +    do_one_test (json_ctx, impl, s2, s1, len);
> >
> > -  putchar ('\n');
> > +  json_array_end (json_ctx);
> > +  json_element_object_end (json_ctx);
> >  }
> >
> >  int
> >  test_main (void)
> >  {
> > +  json_ctx_t json_ctx;
> >    size_t i;
> >
> >    test_init ();
> >
> > -  printf ("%23s", "");
> > +  json_init (&json_ctx, 0, stdout);
> > +
> > +  json_document_begin (&json_ctx);
> > +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> > +
> > +  json_attr_object_begin (&json_ctx, "functions");
> > +  json_attr_object_begin (&json_ctx, TEST_NAME);
> > +  json_attr_string (&json_ctx, "bench-variant", "");
> > +
> > +  json_array_begin (&json_ctx, "ifuncs");
> >    FOR_EACH_IMPL (impl, 0)
> > -    printf ("\t%s", impl->name);
> > -  putchar ('\n');
> > +    json_element_string (&json_ctx, impl->name);
> > +  json_array_end (&json_ctx);
> > +
> > +  json_array_begin (&json_ctx, "results");
> >
> >    for (i = 0; i < 16; ++i)
> >      {
> > -      do_test (0, 0, i, SMALL_CHAR);
> > -      do_test (0, 0, i, BIG_CHAR);
> > -      do_test (0, i, i, SMALL_CHAR);
> > -      do_test (i, 0, i, BIG_CHAR);
> > +      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 0, i, BIG_CHAR);
> > +      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
> > +      do_test (&json_ctx, i, 0, i, BIG_CHAR);
> >      }
> >
> >    for (i = 1; i < 8; ++i)
> >      {
> > -      do_test (0, 0, 8 << i, SMALL_CHAR);
> > -      do_test (8 - i, 2 * i, 8 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 0, 8 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, 8 - i, 2 * i, 8 << i, SMALL_CHAR);
> >      }
> >
> >    for (i = 1; i < 8; ++i)
> >      {
> > -      do_test (i, 2 * i, 8 << i, SMALL_CHAR);
> > -      do_test (2 * i, i, 8 << i, BIG_CHAR);
> > -      do_test (i, i, 8 << i, SMALL_CHAR);
> > -      do_test (i, i, 8 << i, BIG_CHAR);
> > +      do_test (&json_ctx, i, 2 * i, 8 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, 2 * i, i, 8 << i, BIG_CHAR);
> > +      do_test (&json_ctx, i, i, 8 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, i, i, 8 << i, BIG_CHAR);
> >      }
> >
> > -  for (i = 16; i <= 512; i+=4)
> > +  for (i = 16; i <= 512; i += 4)
> >      {
> > -      do_test (0, 4, i, SMALL_CHAR);
> > -      do_test (4, 0, i, BIG_CHAR);
> > -      do_test (4, 4, i, SMALL_CHAR);
> > -      do_test (2, 2, i, BIG_CHAR);
> > -      do_test (2, 6, i, SMALL_CHAR);
> > -      do_test (6, 2, i, BIG_CHAR);
> > -      do_test (1, 7, i, SMALL_CHAR);
> > -      do_test (7, 1, i, BIG_CHAR);
> > -      do_test (3, 4, i, SMALL_CHAR);
> > -      do_test (4, 3, i, BIG_CHAR);
> > -      do_test (5, 7, i, SMALL_CHAR);
> > -      do_test (7, 5, i, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 4, i, SMALL_CHAR);
> > +      do_test (&json_ctx, 4, 0, i, BIG_CHAR);
> > +      do_test (&json_ctx, 4, 4, i, SMALL_CHAR);
> > +      do_test (&json_ctx, 2, 2, i, BIG_CHAR);
> > +      do_test (&json_ctx, 2, 6, i, SMALL_CHAR);
> > +      do_test (&json_ctx, 6, 2, i, BIG_CHAR);
> > +      do_test (&json_ctx, 1, 7, i, SMALL_CHAR);
> > +      do_test (&json_ctx, 7, 1, i, BIG_CHAR);
> > +      do_test (&json_ctx, 3, 4, i, SMALL_CHAR);
> > +      do_test (&json_ctx, 4, 3, i, BIG_CHAR);
> > +      do_test (&json_ctx, 5, 7, i, SMALL_CHAR);
> > +      do_test (&json_ctx, 7, 5, i, SMALL_CHAR);
> >      }
> >
> > +  for (i = 1; i < 2048; i += i)
> > +    {
> > +      do_test (&json_ctx, 1, 0, i, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, i, i, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 0, i, SMALL_CHAR);
> > +      do_test (&json_ctx, i, i, i, SMALL_CHAR);
> > +    }
> > +
> > +  json_array_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_document_end (&json_ctx);
> > +
> >    return ret;
> >  }
> >
> > diff --git a/benchtests/bench-strncat.c b/benchtests/bench-strncat.c
> > index b148c55279..5ccc09a4f8 100644
> > --- a/benchtests/bench-strncat.c
> > +++ b/benchtests/bench-strncat.c
> > @@ -33,6 +33,8 @@
> >  # define SMALL_CHAR 1273
> >  #endif /* WIDE */
> >
> > +#include "json-lib.h"
> > +
> >  typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
> >
> >  CHAR *
> > @@ -49,7 +51,8 @@ IMPL (STRNCAT, 2)
> >  IMPL (generic_strncat, 0)
> >
> >  static void
> > -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
> > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
> > +            size_t n)
> >  {
> >    size_t k = STRLEN (dst), i, iters = INNER_LOOP_ITERS8;
> >    timing_t start, stop, cur;
> > @@ -65,8 +68,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
> >    size_t len = STRLEN (src);
> >    if (MEMCMP (dst + k, src, len + 1 > n ? n : len + 1) != 0)
> >      {
> > -      error (0, 0, "Incorrect concatenation in function %s",
> > -            impl->name);
> > +      error (0, 0, "Incorrect concatenation in function %s", impl->name);
> >        ret = 1;
> >        return;
> >      }
> > @@ -88,12 +90,12 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
> >
> >    TIMING_DIFF (cur, start, stop);
> >
> > -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> > +  json_element_double (json_ctx, (double) cur / (double) iters);
> >  }
> >
> >  static void
> > -do_test (size_t align1, size_t align2, size_t len1, size_t len2,
> > -        size_t n, int max_char)
> > +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len1,
> > +        size_t len2, size_t n, int max_char)
> >  {
> >    size_t i;
> >    CHAR *s1, *s2;
> > @@ -118,53 +120,93 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
> >    for (i = 0; i < len2; i++)
> >      s2[i] = 32 + 23 * i % (max_char - 32);
> >
> > -  printf ("Length %4zd/%4zd, alignment %2zd/%2zd, N %4zd:",
> > -         len1, len2, align1, align2, n);
> > +  json_element_object_begin (json_ctx);
> > +  json_attr_uint (json_ctx, "align1", align1);
> > +  json_attr_uint (json_ctx, "align2", align2);
> > +  json_attr_uint (json_ctx, "len1", len1);
> > +  json_attr_uint (json_ctx, "len2", len2);
> > +  json_attr_uint (json_ctx, "n", n);
> > +  json_attr_uint (json_ctx, "max_char", max_char);
> > +
> > +  json_array_begin (json_ctx, "timings");
> >
> >    FOR_EACH_IMPL (impl, 0)
> >      {
> >        s2[len2] = '\0';
> > -      do_one_test (impl, s2, s1, n);
> > +      do_one_test (json_ctx, impl, s2, s1, n);
> >      }
> >
> > -  putchar ('\n');
> > +  json_array_end (json_ctx);
> > +  json_element_object_end (json_ctx);
> >  }
> >
> >  int
> >  main (void)
> >  {
> > +  json_ctx_t json_ctx;
> >    size_t i, n;
> >
> >    test_init ();
> >
> > -  printf ("%28s", "");
> > +  json_init (&json_ctx, 0, stdout);
> > +
> > +  json_document_begin (&json_ctx);
> > +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> > +
> > +  json_attr_object_begin (&json_ctx, "functions");
> > +  json_attr_object_begin (&json_ctx, TEST_NAME);
> > +  json_attr_string (&json_ctx, "bench-variant", "");
> > +
> > +  json_array_begin (&json_ctx, "ifuncs");
> >    FOR_EACH_IMPL (impl, 0)
> > -    printf ("\t%s", impl->name);
> > -  putchar ('\n');
> > +    json_element_string (&json_ctx, impl->name);
> > +  json_array_end (&json_ctx);
> > +
> > +  json_array_begin (&json_ctx, "results");
> >
> > -  for (n = 2; n <= 2048; n*=4)
> > +  for (n = 2; n <= 2048; n *= 4)
> >      {
> > -      do_test (0, 2, 2, 2, n, SMALL_CHAR);
> > -      do_test (0, 0, 4, 4, n, SMALL_CHAR);
> > -      do_test (4, 0, 4, 4, n, BIG_CHAR);
> > -      do_test (0, 0, 8, 8, n, SMALL_CHAR);
> > -      do_test (0, 8, 8, 8, n, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 2, 2, 2, n, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 0, 4, 4, n, SMALL_CHAR);
> > +      do_test (&json_ctx, 4, 0, 4, 4, n, BIG_CHAR);
> > +      do_test (&json_ctx, 0, 0, 8, 8, n, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 8, 8, 8, n, SMALL_CHAR);
> >
> >        for (i = 1; i < 8; ++i)
> >         {
> > -         do_test (0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
> > -         do_test (8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
> > -         do_test (0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
> > -         do_test (8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
> > +         do_test (&json_ctx, 0, 0, 8 << i, 8 << i, n, SMALL_CHAR);
> > +         do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 8 << i, n, SMALL_CHAR);
> > +         do_test (&json_ctx, 0, 0, 8 << i, 2 << i, n, SMALL_CHAR);
> > +         do_test (&json_ctx, 8 - i, 2 * i, 8 << i, 2 << i, n, SMALL_CHAR);
> >         }
> >
> >        for (i = 1; i < 8; ++i)
> >         {
> > -         do_test (i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
> > -         do_test (2 * i, i, 8 << i, 1, n, BIG_CHAR);
> > -         do_test (i, i, 8 << i, 10, n, SMALL_CHAR);
> > +         do_test (&json_ctx, i, 2 * i, 8 << i, 1, n, SMALL_CHAR);
> > +         do_test (&json_ctx, 2 * i, i, 8 << i, 1, n, BIG_CHAR);
> > +         do_test (&json_ctx, i, i, 8 << i, 10, n, SMALL_CHAR);
> >         }
> >      }
> >
> > +  for (i = 128; i < 2048; i += i)
> > +    {
> > +      for (n = i - 64; n <= i + 64; n += 32)
> > +       {
> > +         do_test (&json_ctx, 1, 0, i, i, n, SMALL_CHAR);
> > +         do_test (&json_ctx, 0, i, i, i, n, SMALL_CHAR);
> > +         do_test (&json_ctx, 0, 0, i, i, n, SMALL_CHAR);
> > +         do_test (&json_ctx, i, i, i, i, n, SMALL_CHAR);
> > +         do_test (&json_ctx, 1, 0, i, n, i, SMALL_CHAR);
> > +         do_test (&json_ctx, 0, i, i, n, i, SMALL_CHAR);
> > +         do_test (&json_ctx, 0, 0, i, n, i, SMALL_CHAR);
> > +         do_test (&json_ctx, i, i, i, n, i, SMALL_CHAR);
> > +       }
> > +    }
> > +
> > +  json_array_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_document_end (&json_ctx);
> > +
> >    return ret;
> >  }
> > diff --git a/benchtests/bench-strncpy.c b/benchtests/bench-strncpy.c
> > index 8207d99f4d..f621cbfe09 100644
> > --- a/benchtests/bench-strncpy.c
> > +++ b/benchtests/bench-strncpy.c
> > @@ -24,6 +24,8 @@
> >  # define SMALL_CHAR 127
> >  #endif /* !WIDE */
> >
> > +#include "json-lib.h"
> > +
> >  #ifndef STRNCPY_RESULT
> >  # define STRNCPY_RESULT(dst, len, n) dst
> >  # define TEST_MAIN
> > @@ -52,7 +54,8 @@ IMPL (generic_strncpy, 0)
> >  typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
> >
> >  static void
> > -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
> > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *dst, const CHAR *src,
> > +            size_t len, size_t n)
> >  {
> >    size_t i, iters = INNER_LOOP_ITERS_LARGE * (4 / CHARBYTES);
> >    timing_t start, stop, cur;
> > @@ -77,7 +80,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
> >        size_t i;
> >
> >        for (i = len; i < n; ++i)
> > -       if (dst [i] != '\0')
> > +       if (dst[i] != '\0')
> >           {
> >             error (0, 0, "Wrong result in function %s", impl->name);
> >             ret = 1;
> > @@ -94,17 +97,18 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t len, size_t n)
> >
> >    TIMING_DIFF (cur, start, stop);
> >
> > -  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> > +  json_element_double (json_ctx, (double) cur / (double) iters);
> >  }
> >
> >  static void
> > -do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
> > +do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
> > +        size_t n, int max_char)
> >  {
> >    size_t i;
> >    CHAR *s1, *s2;
> >
> > -/* For wcsncpy: align1 and align2 here mean alignment not in bytes,
> > -   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
> > +  /* For wcsncpy: align1 and align2 here mean alignment not in bytes,
> > +     but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)).  */
> >    align1 &= 7;
> >    if ((align1 + len) * sizeof (CHAR) >= page_size)
> >      return;
> > @@ -123,46 +127,86 @@ do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
> >         ++i)
> >      s1[i] = 32 + 32 * i % (max_char - 32);
> >
> > -  printf ("Length %4zd, n %4zd, alignment %2zd/%2zd:", len, n, align1, align2);
> > +  json_element_object_begin (json_ctx);
> > +  json_attr_uint (json_ctx, "align1", align1);
> > +  json_attr_uint (json_ctx, "align2", align2);
> > +  json_attr_uint (json_ctx, "len", len);
> > +  json_attr_uint (json_ctx, "n", n);
> > +  json_attr_uint (json_ctx, "max_char", max_char);
> > +
> > +  json_array_begin (json_ctx, "timings");
> >
> >    FOR_EACH_IMPL (impl, 0)
> > -    do_one_test (impl, s2, s1, len, n);
> > +    do_one_test (json_ctx, impl, s2, s1, len, n);
> >
> > -  putchar ('\n');
> > +  json_array_end (json_ctx);
> > +  json_element_object_end (json_ctx);
> >  }
> >
> >  static int
> >  test_main (void)
> >  {
> > -  size_t i;
> > +  json_ctx_t json_ctx;
> > +  size_t i, j;
> >
> >    test_init ();
> >
> > -  printf ("%28s", "");
> > +  json_init (&json_ctx, 0, stdout);
> > +
> > +  json_document_begin (&json_ctx);
> > +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> > +
> > +  json_attr_object_begin (&json_ctx, "functions");
> > +  json_attr_object_begin (&json_ctx, TEST_NAME);
> > +  json_attr_string (&json_ctx, "bench-variant", "");
> > +
> > +  json_array_begin (&json_ctx, "ifuncs");
> >    FOR_EACH_IMPL (impl, 0)
> > -    printf ("\t%s", impl->name);
> > -  putchar ('\n');
> > +    json_element_string (&json_ctx, impl->name);
> > +  json_array_end (&json_ctx);
> > +
> > +  json_array_begin (&json_ctx, "results");
> >
> >    for (i = 1; i < 8; ++i)
> >      {
> > -      do_test (i, i, 16, 16, SMALL_CHAR);
> > -      do_test (i, i, 16, 16, BIG_CHAR);
> > -      do_test (i, 2 * i, 16, 16, SMALL_CHAR);
> > -      do_test (2 * i, i, 16, 16, BIG_CHAR);
> > -      do_test (8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
> > -      do_test (2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
> > -      do_test (8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
> > -      do_test (2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
> > +      do_test (&json_ctx, i, i, 16, 16, SMALL_CHAR);
> > +      do_test (&json_ctx, i, i, 16, 16, BIG_CHAR);
> > +      do_test (&json_ctx, i, 2 * i, 16, 16, SMALL_CHAR);
> > +      do_test (&json_ctx, 2 * i, i, 16, 16, BIG_CHAR);
> > +      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, 8 - i, 2 * i, 1 << i, 2 << i, BIG_CHAR);
> > +      do_test (&json_ctx, 2 * i, 8 - i, 2 << i, 1 << i, BIG_CHAR);
> >      }
> >
> >    for (i = 1; i < 8; ++i)
> >      {
> > -      do_test (0, 0, 4 << i, 8 << i, SMALL_CHAR);
> > -      do_test (0, 0, 16 << i, 8 << i, SMALL_CHAR);
> > -      do_test (8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
> > -      do_test (8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 0, 4 << i, 8 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, 0, 0, 16 << i, 8 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, 8 - i, 2 * i, 4 << i, 8 << i, SMALL_CHAR);
> > +      do_test (&json_ctx, 8 - i, 2 * i, 16 << i, 8 << i, SMALL_CHAR);
> >      }
> >
> > +  for (i = 128; i < 2048; i += i)
> > +    {
> > +      for (j = i - 64; j <= i + 64; j += 32)
> > +       {
> > +         do_test (&json_ctx, 1, 0, i, j, SMALL_CHAR);
> > +         do_test (&json_ctx, 0, i, i, j, SMALL_CHAR);
> > +         do_test (&json_ctx, 0, 0, i, j, SMALL_CHAR);
> > +         do_test (&json_ctx, i, i, i, j, SMALL_CHAR);
> > +         do_test (&json_ctx, 1, 0, j, i, SMALL_CHAR);
> > +         do_test (&json_ctx, 0, i, j, i, SMALL_CHAR);
> > +         do_test (&json_ctx, 0, 0, j, i, SMALL_CHAR);
> > +         do_test (&json_ctx, i, i, j, i, SMALL_CHAR);
> > +       }
> > +    }
> > +
> > +  json_array_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_document_end (&json_ctx);
> > +
> >    return ret;
> >  }
> >
> > --
> > 2.34.1
> >
>
> LGTM.
>
> This patch is standalone.  Please check it in first.
>
> Thanks.
>
> --
> H.J.

Okay.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-04 20:13   ` [PATCH v3 2/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
@ 2022-11-04 21:46     ` H.J. Lu
  2022-11-04 22:27       ` Noah Goldstein
  0 siblings, 1 reply; 42+ messages in thread
From: H.J. Lu @ 2022-11-04 21:46 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Nov 04, 2022 at 01:13:11PM -0700, Noah Goldstein wrote:
> Optimizations are:
>     1. Use more overlapping stores to avoid branches.
>     2. Reduce how unrolled the aligning copies are (this is more of a
>        code-size save, it's a negative for some sizes in terms of
>        perf).
>     3. Improve the loop a bit (similar to what we do in strlen with
>        2x vpminu + kortest instead of 3x vpminu + kmov + test).
>     4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
>        number that are taken.
> 
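A note on point 1 for readers less familiar with the trick: "overlapping
stores" here just means issuing two unconditional vector stores whose
byte ranges may overlap, instead of branching on the exact length.  A
minimal C sketch of the idea (illustrative only, not code from the
patch; VEC_SIZE and the length precondition are assumptions):

    #include <string.h>
    #include <stddef.h>

    #define VEC_SIZE 32   /* assumed vector width for the sketch */

    /* Copy `len' bytes, with VEC_SIZE < len <= 2 * VEC_SIZE, using two
       unconditional stores; the second may overlap the first, so no
       branch on the exact value of `len' is needed.  */
    static void
    copy_overlapping (char *dst, const char *src, size_t len)
    {
      memcpy (dst, src, VEC_SIZE);
      memcpy (dst + len - VEC_SIZE, src + len - VEC_SIZE, VEC_SIZE);
    }

The EVEX code does the same thing with vector loads and stores once the
position of the null terminator is known.
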
> Performance Changes:
> 
>     Times are from N = 10 runs of the benchmark suite and are
>     reported as the geometric mean of all ratios of
>     New Implementation / Old Implementation.
> 
>     stpcpy-evex      -> 0.922
>     strcat-evex      -> 0.985
>     strcpy-evex      -> 0.880
> 
>     strncpy-evex     -> 0.831
>     stpncpy-evex     -> 0.780
> 
>     strncat-evex     -> 0.958
> 
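For anyone reproducing these numbers: the reduction described above is
just the geometric mean of the per-case new/old ratios, i.e. exp of the
mean of their logs.  A small self-contained C sketch (illustrative
only; the ratio values are made up):

    #include <math.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Geometric mean of new-time / old-time ratios.  */
    static double
    geometric_mean (const double *ratios, size_t n)
    {
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (ratios[i]);
      return exp (log_sum / n);
    }

    int
    main (void)
    {
      double ratios[] = { 0.95, 0.88, 0.91, 1.02 };  /* made-up data */
      printf ("%f\n", geometric_mean (ratios, 4));
      return 0;
    }

A value below 1.0 means the new implementation is faster on average.
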
> Code Size Changes:
>     function         -> Bytes New / Bytes Old -> Ratio
> 
>     strcat-evex      ->  819 / 1874 -> 0.437
>     strcpy-evex      ->  700 / 1074 -> 0.652
>     stpcpy-evex      ->  735 / 1094 -> 0.672
> 
>     strncpy-evex     -> 1397 / 2611 -> 0.535
>     stpncpy-evex     -> 1489 / 2691 -> 0.553
> 
>     strncat-evex     -> 1184 / 2832 -> 0.418
> 
> Notes:
>     1. Because of the significant difference between the
>        implementations, they are split into three files.
> 
>            strcpy-evex.S    -> strcpy, stpcpy, strcat
>            strncpy-evex.S   -> strncpy
>            strncat-evex.S   -> strncat
> 
>        I couldn't find a way to merge them without making the
>        ifdefs incredibly difficult to follow.
> 
>     2. All implementations can be made evex512 by including
>        "x86-evex512-vecs.h" at the top.
> 
>     3. All implementations have an optional define:
>         `USE_EVEX_MASKED_STORE`
>        Setting to one uses evex-masked stores for handling short
>        strings.  This saves code size and branches.  It's disabled
>        for all implementations are the moment as there are some
>        for all implementations at the moment as there are some
>        that may be fixed on future architectures.
> 
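To make point 3 concrete: an EVEX masked store lets a short copy be
done with a single store whose inactive lanes are suppressed, so the
tail handling needs no branch on the exact length.  A rough intrinsics
sketch of the idea (illustrative only -- the patch itself is assembly
and currently keeps USE_EVEX_MASKED_STORE disabled; assumes len < 32
and AVX512BW/AVX512VL support):

    #include <immintrin.h>

    /* Copy `len' bytes (len < 32) with one masked load and one masked
       store; lanes outside the mask are neither loaded nor stored.  */
    static void
    masked_copy_short (char *dst, const char *src, unsigned int len)
    {
      __mmask32 m = (1u << len) - 1;   /* low `len' bits set */
      __m256i v = _mm256_maskz_loadu_epi8 (m, src);
      _mm256_mask_storeu_epi8 (dst, m, v);
    }
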
> Full check passes on x86-64 and build succeeds for all ISA levels w/
> and w/o multiarch.
> ---
>  sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
>  sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
>  sysdeps/x86_64/multiarch/strcat-strlen-evex.S |  110 ++
>  sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
>  sysdeps/x86_64/multiarch/strncat-evex.S       |  525 ++++++-
>  sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
>  .../multiarch/strncpy-or-cat-overflow-def.h   |   65 +
>  7 files changed, 2100 insertions(+), 1173 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> 
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> index 99ea76a372..3693491baa 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> @@ -3,6 +3,5 @@
>  #endif
>  
>  #define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY	STPNCPY
> -#include "strcpy-evex.S"
> +#define STRNCPY	STPNCPY
> +#include "strncpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
> index 0e2df947e9..b4207b7889 100644
> --- a/sysdeps/x86_64/multiarch/strcat-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcat-evex.S
> @@ -1,286 +1,7 @@
> -/* strcat with 256-bit EVEX instructions.
> -   Copyright (C) 2021-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (4)
> -
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_evex
> -# endif
> -
> -# define VMOVU		vmovdqu64
> -# define VMOVA		vmovdqa64
> -
> -/* zero register */
> -# define XMMZERO	xmm16
> -# define YMMZERO	ymm16
> -# define YMM0		ymm17
> -# define YMM1		ymm18
> -
> -# define USE_AS_STRCAT
> -
> -/* Number of bytes in a vector register */
> -# define VEC_SIZE	32
> -
> -	.section .text.evex,"ax",@progbits
> -ENTRY (STRCAT)
> -	mov	%rdi, %r9
> -# ifdef USE_AS_STRNCAT
> -	mov	%rdx, %r8
> -# endif
> -
> -	xor	%eax, %eax
> -	mov	%edi, %ecx
> -	and	$((VEC_SIZE * 4) - 1), %ecx
> -	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
> -	cmp	$(VEC_SIZE * 3), %ecx
> -	ja	L(fourth_vector_boundary)
> -	vpcmpb	$0, (%rdi), %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_first_vector)
> -	mov	%rdi, %rax
> -	and	$-VEC_SIZE, %rax
> -	jmp	L(align_vec_size_start)
> -L(fourth_vector_boundary):
> -	mov	%rdi, %rax
> -	and	$-VEC_SIZE, %rax
> -	vpcmpb	$0, (%rax), %YMMZERO, %k0
> -	mov	$-1, %r10d
> -	sub	%rax, %rcx
> -	shl	%cl, %r10d
> -	kmovd	%k0, %edx
> -	and	%r10d, %edx
> -	jnz	L(exit)
> -
> -L(align_vec_size_start):
> -	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -	add	$(VEC_SIZE * 4), %rax
> -	kmovd	%k4, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -	kmovd	%k4, %edx
> -	add	$(VEC_SIZE * 4), %rax
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -	add	$(VEC_SIZE * 4), %rax
> -	kmovd	%k4, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -	add	$(VEC_SIZE * 5), %rax
> -	kmovd	%k4, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
> -	add	$VEC_SIZE, %rax
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
> -	add	$VEC_SIZE, %rax
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
> -	add	$VEC_SIZE, %rax
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	add	$VEC_SIZE, %rax
> -
> -	.p2align 4
> -L(align_four_vec_loop):
> -	VMOVA	(%rax), %YMM0
> -	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
> -	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
> -	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
> -	vpminub	%YMM0, %YMM1, %YMM0
> -	/* If K0 != 0, there is a null byte.  */
> -	vpcmpb	$0, %YMM0, %YMMZERO, %k0
> -	add	$(VEC_SIZE * 4), %rax
> -	ktestd	%k0, %k0
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
> -	sub	$(VEC_SIZE * 5), %rax
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 4), %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit):
> -	sub	%rdi, %rax
> -L(exit_null_on_first_vector):
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_second_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$VEC_SIZE, %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_third_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 2), %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_fourth_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 3), %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_fifth_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 4), %rax
> -
> -	.p2align 4
> -L(StartStrcpyPart):
> -	lea	(%r9, %rax), %rdi
> -	mov	%rsi, %rcx
> -	mov	%r9, %rax      /* save result */
> -
> -# ifdef USE_AS_STRNCAT
> -	test	%r8, %r8
> -	jz	L(ExitZero)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-evex.S"
> +#ifndef STRCAT
> +# define STRCAT	__strcat_evex
>  #endif
> +
> +#define USE_AS_STRCAT
> +#define STRCPY	STRCAT
> +#include "strcpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> new file mode 100644
> index 0000000000..9530d7b683
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> @@ -0,0 +1,110 @@
> +/* strlen used for beginning of str{n}cat using EVEX 256/512.
> +   Copyright (C) 2011-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +/* NOTE: This file is meant to be included by strcat-evex or
> +   strncat-evex and does not standalone.  Before including %rdi
> +   must be saved in %rax.  */

Since this file isn't standalone, please rename it to .h.

> +
> +
> +/* Simple strlen implementation that ends at
> +   L(strcat_strlen_done).  */
> +	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
> +	movq	%rdi, %r8
> +	andq	$(VEC_SIZE * -1), %r8
> +	VPCMPEQ	(%r8), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +#ifdef USE_AS_WCSCPY
> +	subl	%r8d, %edi
> +	shrl	$2, %edi
> +#endif
> +	shrx	%VRDI, %VRCX, %VRCX
> +#ifdef USE_AS_WCSCPY
> +	movq	%rax, %rdi
> +#endif
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v0)
> +
> +
> +	VPCMPEQ	VEC_SIZE(%r8), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +	leaq	(VEC_SIZE)(%r8), %rdi
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v0)
> +
> +	VPCMPEQ	(VEC_SIZE * 2)(%r8), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v1)
> +
> +	VPCMPEQ	(VEC_SIZE * 3)(%r8), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v2)
> +
> +	VPCMPEQ	(VEC_SIZE * 4)(%r8), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v3)
> +
> +	andq	$-(VEC_SIZE * 4), %rdi
> +	.p2align 4,, 8
> +L(loop_2x_vec):
> +	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(0)
> +	VPMIN	(VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
> +	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(2)
> +	VPMIN	(VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
> +	VPTESTN	%VMM(1), %VMM(1), %k1
> +	VPTESTN	%VMM(3), %VMM(3), %k3
> +	subq	$(VEC_SIZE * -4), %rdi
> +	KORTEST	%k1, %k3
> +	jz	L(loop_2x_vec)
> +
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v0)
> +
> +	KMOV	%k1, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v1)
> +
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v2)
> +
> +	KMOV	%k3, %VRCX
> +L(bsf_and_done_v3):
> +	addq	$VEC_SIZE, %rdi
> +L(bsf_and_done_v2):
> +	bsf	%VRCX, %VRCX
> +	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
> +	jmp	L(strcat_strlen_done)
> +
> +	.p2align 4,, 4
> +L(bsf_and_done_v1):
> +	addq	$VEC_SIZE, %rdi
> +L(bsf_and_done_v0):
> +	bsf	%VRCX, %VRCX
> +#ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +#else
> +	addq	%rcx, %rdi
> +#endif
> +L(strcat_strlen_done):
> diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
> index 82e45ac675..1ba0195ed2 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
> @@ -1,4 +1,4 @@
> -/* strcpy with 256-bit EVEX instructions.
> +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
>     Copyright (C) 2021-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>  
> @@ -17,990 +17,526 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  #include <isa-level.h>
> -
>  #if ISA_SHOULD_BUILD (4)
>  
>  
> -# ifndef USE_AS_STRCAT
> -#  include <sysdep.h>
> +	/* Use evex-masked stores for small sizes. Turned off at the
> +	   moment.  */
> +# define USE_EVEX_MASKED_STORE	0
> +	/* Use movsb in page cross case to save code size.  */
> +# define USE_MOVSB_IN_PAGE_CROSS	1
>  
> -#  ifndef STRCPY
> -#   define STRCPY  __strcpy_evex
> -#  endif
> +# include <sysdep.h>
>  
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
>  # endif
>  
> -# define VMOVU		vmovdqu64
> -# define VMOVA		vmovdqa64
> -
> -/* Number of bytes in a vector register */
> -# ifndef VEC_SIZE
> -#  define VEC_SIZE	32
> +# ifndef STRCPY
> +#  define STRCPY	__strcpy_evex
>  # endif
>  
> -# define XMM2		xmm18
> -# define XMM3		xmm19
>  
> -# define YMM2		ymm18
> -# define YMM3		ymm19
> -# define YMM4		ymm20
> -# define YMM5		ymm21
> -# define YMM6		ymm22
> -# define YMM7		ymm23
> +# ifdef USE_AS_WCSCPY
> +#  define VMOVU_MASK	vmovdqu32
> +#  define VPMIN	vpminud
> +#  define VPTESTN	vptestnmd
> +#  define VPTEST	vptestmd
> +#  define VPCMPEQ	vpcmpeqd
> +#  define CHAR_SIZE	4
>  
> -# ifndef USE_AS_STRCAT
> +#  define REP_MOVS	rep movsd
>  
> -/* zero register */
> -#  define XMMZERO	xmm16
> -#  define YMMZERO	ymm16
> -#  define YMM1		ymm17
> -
> -	.section .text.evex,"ax",@progbits
> -ENTRY (STRCPY)
> -#  ifdef USE_AS_STRNCPY
> -	mov	%RDX_LP, %R8_LP
> -	test	%R8_LP, %R8_LP
> -	jz	L(ExitZero)
> -#  endif
> -	mov	%rsi, %rcx
> -#  ifndef USE_AS_STPCPY
> -	mov	%rdi, %rax      /* save result */
> -#  endif
> +#  define USE_WIDE_CHAR
> +# else
> +#  define VMOVU_MASK	vmovdqu8
> +#  define VPMIN	vpminub
> +#  define VPTESTN	vptestnmb
> +#  define VPTEST	vptestmb
> +#  define VPCMPEQ	vpcmpeqb
> +#  define CHAR_SIZE	1
>  
> -	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
> +#  define REP_MOVS	rep movsb
>  # endif
>  
> -	and	$((VEC_SIZE * 4) - 1), %ecx
> -	cmp	$(VEC_SIZE * 2), %ecx
> -	jbe	L(SourceStringAlignmentLessTwoVecSize)
> -
> -	and	$-VEC_SIZE, %rsi
> -	and	$(VEC_SIZE - 1), %ecx
> -
> -	vpcmpb	$0, (%rsi), %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	shr	%cl, %rdx
> +# include "reg-macros.h"
>  
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -	mov	$VEC_SIZE, %r10
> -	sub	%rcx, %r10
> -	cmp	%r10, %r8
> -#  else
> -	mov	$(VEC_SIZE + 1), %r10
> -	sub	%rcx, %r10
> -	cmp	%r10, %r8
> -#  endif
> -	jbe	L(CopyVecSizeTailCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeTail)
> -
> -	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
> -	kmovd	%k1, %edx
>  
> -# ifdef USE_AS_STRNCPY
> -	add	$VEC_SIZE, %r10
> -	cmp	%r10, %r8
> -	jbe	L(CopyTwoVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyTwoVecSize)
> -
> -	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
> -	VMOVU	%YMM2, (%rdi)
> -
> -/* If source address alignment != destination address alignment */
> -	.p2align 4
> -L(UnalignVecSizeBoth):
> -	sub	%rcx, %rdi
> -# ifdef USE_AS_STRNCPY
> -	add	%rcx, %r8
> -	sbb	%rcx, %rcx
> -	or	%rcx, %r8
> -# endif
> -	mov	$VEC_SIZE, %rcx
> -	VMOVA	(%rsi, %rcx), %YMM2
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$(VEC_SIZE * 3), %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec2)
> +# ifdef USE_AS_STPCPY
> +#  define END_REG	rax
>  # else
> -	jnz	L(CopyVecSize)
> +#  define END_REG	rdi, %rdx, CHAR_SIZE
>  # endif
>  
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
> -	vpcmpb	$0, %YMM3, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec3)
> +# ifdef USE_AS_STRCAT
> +#  define PAGE_ALIGN_REG	edx
> +#  define PAGE_ALIGN_REG_64	rdx
>  # else
> -	jnz	L(CopyVecSize)
> +#  define PAGE_ALIGN_REG	eax
> +#  define PAGE_ALIGN_REG_64	rax
>  # endif
>  
> -	VMOVU	%YMM3, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
> -	vpcmpb	$0, %YMM4, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec4)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
> +# define VZERO	VMM(7)
> +# define VZERO_128	VMM_128(7)
>  
> -	VMOVU	%YMM4, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec2)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
>  
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec2)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
> +# define PAGE_SIZE	4096
> +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
>  
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -	vpcmpb	$0, %YMM3, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec3)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
>  
> -	VMOVU	%YMM3, (%rdi, %rcx)
> -	mov	%rsi, %rdx
> -	lea	VEC_SIZE(%rsi, %rcx), %rsi
> -	and	$-(VEC_SIZE * 4), %rsi
> -	sub	%rsi, %rdx
> -	sub	%rdx, %rdi
> -# ifdef USE_AS_STRNCPY
> -	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
> -# endif
> -L(UnalignedFourVecSizeLoop):
> -	VMOVA	(%rsi), %YMM4
> -	VMOVA	VEC_SIZE(%rsi), %YMM5
> -	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
> -	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
> -	vpminub	%YMM5, %YMM4, %YMM2
> -	vpminub	%YMM7, %YMM6, %YMM3
> -	vpminub	%YMM2, %YMM3, %YMM2
> -	/* If K7 != 0, there is a null byte.  */
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k7
> -	kmovd	%k7, %edx
> -# ifdef USE_AS_STRNCPY
> -	sub	$(VEC_SIZE * 4), %r8
> -	jbe	L(UnalignedLeaveCase2OrCase3)
> +	.section SECTION(.text), "ax", @progbits
> +ENTRY(STRCPY)
> +# ifdef USE_AS_STRCAT
> +	movq	%rdi, %rax
> +#  include "strcat-strlen-evex.S"
>  # endif
> -	test	%edx, %edx
> -	jnz	L(UnalignedFourVecSizeLeave)
> -
> -L(UnalignedFourVecSizeLoop_start):
> -	add	$(VEC_SIZE * 4), %rdi
> -	add	$(VEC_SIZE * 4), %rsi
> -	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
> -	VMOVA	(%rsi), %YMM4
> -	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
> -	VMOVA	VEC_SIZE(%rsi), %YMM5
> -	vpminub	%YMM5, %YMM4, %YMM2
> -	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
> -	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
> -	VMOVU	%YMM7, -VEC_SIZE(%rdi)
> -	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
> -	vpminub	%YMM7, %YMM6, %YMM3
> -	vpminub	%YMM2, %YMM3, %YMM2
> -	/* If K7 != 0, there is a null byte.  */
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k7
> -	kmovd	%k7, %edx
> -# ifdef USE_AS_STRNCPY
> -	sub	$(VEC_SIZE * 4), %r8
> -	jbe	L(UnalignedLeaveCase2OrCase3)
> +
> +	movl	%esi, %PAGE_ALIGN_REG
> +	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> +	ja	L(page_cross)
> +L(page_cross_continue):
> +	VMOVU	(%rsi), %VMM(0)
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +	movq	%rdi, %rax
>  # endif
> -	test	%edx, %edx
> -	jz	L(UnalignedFourVecSizeLoop_start)
>  
> -L(UnalignedFourVecSizeLeave):
> -	vpcmpb	$0, %YMM4, %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeUnaligned_0)
>  
> -	vpcmpb	$0, %YMM5, %YMMZERO, %k2
> -	kmovd	%k2, %ecx
> -	test	%ecx, %ecx
> -	jnz	L(CopyVecSizeUnaligned_16)
> +	/* Two short string implementations. One with a traditional
> +	   branching approach and one with masked instructions (which
> +	   have the potential for dramatically bad perf if dst splits a
> +	   page and is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +	VPTEST	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +#  ifdef USE_AS_WCSCPY
> +	subl	$((1 << CHAR_PER_VEC)- 1), %VRCX
> +#  else
> +	inc	%VRCX
> +#  endif
> +	jz	L(more_1x_vec)
> +	KMOV	%VRCX, %k1
> +	KXOR	%k0, %k1, %k1
>  
> -	vpcmpb	$0, %YMM6, %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeUnaligned_32)
> -
> -	vpcmpb	$0, %YMM7, %YMMZERO, %k4
> -	kmovd	%k4, %ecx
> -	bsf	%ecx, %edx
> -	VMOVU	%YMM4, (%rdi)
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
> -# endif
> -	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
> -	add	$(VEC_SIZE - 1), %r8
> -	sub	%rdx, %r8
> -	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	add	$(VEC_SIZE * 3), %rsi
> -	add	$(VEC_SIZE * 3), %rdi
> -	jmp	L(CopyVecSizeExit)
> -# endif
> +	VMOVU_MASK %VMM(0), (%rdi){%k1}
>  
> -/* If source address alignment == destination address alignment */
> +#  ifdef USE_AS_STPCPY
> +	bsf	%VRCX, %VRCX
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
> +#  endif
> +	ret
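
As a side note, the inc/kxor pair here computes the store mask as
follows (a minimal C sketch assuming byte chars and VEC_SIZE == 32;
store_mask_model is not a name from the patch):

#include <stdint.h>

/* NONZERO has bit i set iff src[i] != 0 (the VPTEST result).  Returns
   a mask covering every char up to and including the first NUL, or 0
   when the vector contains no NUL and the long path must be taken.  */
static uint32_t
store_mask_model (uint32_t nonzero)
{
  uint32_t inc = nonzero + 1;	/* the `inc`: wraps to 0 iff no NUL */
  if (inc == 0)
    return 0;			/* all chars non-zero */
  return inc ^ nonzero;		/* the `kxor`: ones through the NUL */
}

Later zero chars in the same vector do not disturb the result because
the carry from the increment stops at the first clear bit.
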
>  
> -L(SourceStringAlignmentLessTwoVecSize):
> -	VMOVU	(%rsi), %YMM3
> -	VMOVU	VEC_SIZE(%rsi), %YMM2
> -	vpcmpb	$0, %YMM3, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> +# else
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jz	L(more_1x_vec)
>  
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -	cmp	$VEC_SIZE, %r8
> +	xorl	%edx, %edx
> +	bsf	%VRCX, %VRDX
> +#  ifdef USE_AS_STPCPY
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#  endif
> +
> +	/* Use mask bits in rcx to detect which copy we need. If the low
> +	   mask is zero then there must be a bit set in the upper half.
> +	   I.e. if rcx != 0 and ecx == 0, then the match must be in the
> +	   upper 32 bits, so we use L(copy_32_63).  */
> +#  if VEC_SIZE == 64
> +#   ifdef USE_AS_WCSCPY
> +	testb	%cl, %cl
> +#   else
> +	testl	%ecx, %ecx
> +#   endif
> +	jz	L(copy_32_63)
> +#  endif
> +
> +#  ifdef USE_AS_WCSCPY
> +	testb	$0xf, %cl
>  #  else
> -	cmp	$(VEC_SIZE + 1), %r8
> +	testw	%cx, %cx
>  #  endif
> -	jbe	L(CopyVecSizeTail1Case2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeTail1)
> +	jz	L(copy_16_31)
>  
> -	VMOVU	%YMM3, (%rdi)
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k0
> -	kmovd	%k0, %edx
>  
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -	cmp	$(VEC_SIZE * 2), %r8
> +#  ifdef USE_AS_WCSCPY
> +	testb	$0x3, %cl
>  #  else
> -	cmp	$((VEC_SIZE * 2) + 1), %r8
> +	testb	%cl, %cl
>  #  endif
> -	jbe	L(CopyTwoVecSize1Case2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyTwoVecSize1)
> -
> -	and	$-VEC_SIZE, %rsi
> -	and	$(VEC_SIZE - 1), %ecx
> -	jmp	L(UnalignVecSizeBoth)
> +	jz	L(copy_8_15)
>  
> -/*------End of main part with loops---------------------*/
>  
> -/* Case1 */
> +#  ifdef USE_AS_WCSCPY
> +	vmovd	%VMM_128(0), (%rdi)
> +	/* No need to copy, we know it's zero.  */
> +	movl	$0, (%END_REG)
>  
> -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> -	.p2align 4
> -L(CopyVecSize):
> -	add	%rcx, %rdi
> -# endif
> -L(CopyVecSizeTail):
> -	add	%rcx, %rsi
> -L(CopyVecSizeTail1):
> -	bsf	%edx, %edx
> -L(CopyVecSizeExit):
> -	cmp	$32, %edx
> -	jae	L(Exit32_63)
> -	cmp	$16, %edx
> -	jae	L(Exit16_31)
> -	cmp	$8, %edx
> -	jae	L(Exit8_15)
> -	cmp	$4, %edx
> -	jae	L(Exit4_7)
> -	cmp	$3, %edx
> -	je	L(Exit3)
> -	cmp	$1, %edx
> -	ja	L(Exit2)
> -	je	L(Exit1)
> -	movb	$0, (%rdi)
> -# ifdef USE_AS_STPCPY
> -	lea	(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$1, %r8
> -	lea	1(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> -# endif
>  	ret
> +#  else
>  
> -	.p2align 4
> -L(CopyTwoVecSize1):
> -	add	$VEC_SIZE, %rsi
> -	add	$VEC_SIZE, %rdi
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$VEC_SIZE, %r8
> -# endif
> -	jmp	L(CopyVecSizeTail1)
> -
> -	.p2align 4
> -L(CopyTwoVecSize):
> -	bsf	%edx, %edx
> -	add	%rcx, %rsi
> -	add	$VEC_SIZE, %edx
> -	sub	%ecx, %edx
> -	jmp	L(CopyVecSizeExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnaligned_0):
> -	bsf	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -	VMOVU	%YMM4, (%rdi)
> -	add	$((VEC_SIZE * 4) - 1), %r8
> -	sub	%rdx, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	jmp	L(CopyVecSizeExit)
> -# endif
> +	testb	$0x7, %cl
> +	jz	L(copy_4_7)
>  
> -	.p2align 4
> -L(CopyVecSizeUnaligned_16):
> -	bsf	%ecx, %edx
> -	VMOVU	%YMM4, (%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	VEC_SIZE(%rdi, %rdx), %rax
> -# endif
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -	add	$((VEC_SIZE * 3) - 1), %r8
> -	sub	%rdx, %r8
> -	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	add	$VEC_SIZE, %rsi
> -	add	$VEC_SIZE, %rdi
> -	jmp	L(CopyVecSizeExit)
> -# endif
>  
> -	.p2align 4
> -L(CopyVecSizeUnaligned_32):
> -	bsf	%edx, %edx
> -	VMOVU	%YMM4, (%rdi)
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
> -# endif
> -	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
> -	add	$((VEC_SIZE * 2) - 1), %r8
> -	sub	%rdx, %r8
> -	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	add	$(VEC_SIZE * 2), %rsi
> -	add	$(VEC_SIZE * 2), %rdi
> -	jmp	L(CopyVecSizeExit)
> -# endif
> +	test	%edx, %edx
> +	jz	L(set_null_term)
>  
> -# ifdef USE_AS_STRNCPY
> -#  ifndef USE_AS_STRCAT
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec6):
> -	VMOVU	%YMM6, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec5):
> -	VMOVU	%YMM5, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec4):
> -	VMOVU	%YMM4, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec3):
> -	VMOVU	%YMM3, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> +	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
> +	 */
> +	vmovd	%VMM_128(0), %esi
> +	movw	%si, (%rdi)
> +
> +	.p2align 4,, 1
> +L(set_null_term):
> +	/* No need to copy, we know it's zero.  */
> +	movb	$0, (%END_REG)
> +	ret
>  #  endif
>  
> -/* Case2 */
> -
> -	.p2align 4
> -L(CopyVecSizeCase2):
> -	add	$VEC_SIZE, %r8
> -	add	%rcx, %rdi
> -	add	%rcx, %rsi
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyTwoVecSizeCase2):
> -	add	%rcx, %rsi
> -	bsf	%edx, %edx
> -	add	$VEC_SIZE, %edx
> -	sub	%ecx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -L(CopyVecSizeTailCase2):
> -	add	%rcx, %rsi
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -L(CopyVecSizeTail1Case2):
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -/* Case2 or Case3,  Case3 */
> -
> -	.p2align 4
> -L(CopyVecSizeCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyVecSizeCase2)
> -L(CopyVecSizeCase3):
> -	add	$VEC_SIZE, %r8
> -	add	%rcx, %rdi
> -	add	%rcx, %rsi
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyTwoVecSizeCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyTwoVecSizeCase2)
> -	add	%rcx, %rsi
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyVecSizeTailCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyVecSizeTailCase2)
> -	add	%rcx, %rsi
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyTwoVecSize1Case2OrCase3):
> -	add	$VEC_SIZE, %rdi
> -	add	$VEC_SIZE, %rsi
> -	sub	$VEC_SIZE, %r8
> -L(CopyVecSizeTail1Case2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyVecSizeTail1Case2)
> -	jmp	L(StrncpyExit)
> +#  if VEC_SIZE == 64
> +	.p2align 4,, 6
> +L(copy_32_63):
> +	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +	VMOVU	%VMM_256(0), (%rdi)
> +	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
> +	ret
> +#  endif
> +
> +
> +	.p2align 4,, 6
> +L(copy_16_31):
> +	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
> +	   and will save code size.  */
> +	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +	VMOVU	%VMM_128(0), (%rdi)
> +	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
> +	ret
> +
> +	.p2align 4,, 8
> +L(copy_8_15):
> +#  ifdef USE_AS_WCSCPY
> +	movl	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> +#  else
> +	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
> +#  endif
> +	vmovq	%VMM_128(0), (%rdi)
> +	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
> +	ret
>  # endif
>  
> -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
>  
> -	.p2align 4
> -L(Exit1):
> -	movzwl	(%rsi), %edx
> -	mov	%dx, (%rdi)
> -# ifdef USE_AS_STPCPY
> -	lea	1(%rdi), %rax
> +# ifndef USE_AS_WCSCPY
> +	.p2align 4,, 12
> +L(copy_4_7):
> +	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> +	vmovd	%VMM_128(0), (%rdi)
> +	movl	%ecx, -(4 - CHAR_SIZE)(%END_REG)
> +	ret
>  # endif
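
To make the branchy dispatch above easier to follow, this is the copy
pattern it implements, written out in C (a sketch assuming byte chars;
copy_small is a made-up name, and the real code already holds the head
of the string in a vector register):

#include <stddef.h>
#include <string.h>

/* Copy LEN chars plus the trailing NUL (LEN + 1 <= 64) using one head
   store and one possibly-overlapping tail store per size class.  */
static void
copy_small (char *dst, const char *src, size_t len)
{
  size_t total = len + 1;	/* include the null terminator */
  if (total >= 32)
    {
      memcpy (dst, src, 32);
      memcpy (dst + total - 32, src + total - 32, 32);
    }
  else if (total >= 16)
    {
      memcpy (dst, src, 16);
      memcpy (dst + total - 16, src + total - 16, 16);
    }
  else if (total >= 8)
    {
      memcpy (dst, src, 8);
      memcpy (dst + total - 8, src + total - 8, 8);
    }
  else if (total >= 4)
    {
      memcpy (dst, src, 4);
      memcpy (dst + total - 4, src + total - 4, 4);
    }
  else if (total >= 2)
    {
      memcpy (dst, src, 2);
      memcpy (dst + total - 2, src + total - 2, 2);
    }
  else
    dst[0] = '\0';
}

The overlap between the head and the tail store is what lets each size
class cover a whole power-of-two range without a byte-exact length.
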
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$2, %r8
> -	lea	2(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +
> +
> +	.p2align 4,, 8
> +L(more_1x_vec):
> +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> +	VMOVU	%VMM(0), (%rdi)
>  # endif
> -	ret
> +	subq	%rsi, %rdi
> +	andq	$-(VEC_SIZE), %rsi
> +	addq	%rsi, %rdi
> +	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
>  
> -	.p2align 4
> -L(Exit2):
> -	movzwl	(%rsi), %ecx
> -	mov	%cx, (%rdi)
> -	movb	$0, 2(%rdi)
> +	/* Ideally we store after moves to minimize the impact of
> +	   potential false dependencies.  */
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +	VMOVU	%VMM(0), (%rax)
> +# endif
> +
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x1)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
> +	VMOVU	%VMM(1), VEC_SIZE(%rdi)
> +
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x2)
> +
> +	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
> +
> +	VPTESTN	%VMM(3), %VMM(3), %k0
> +	KMOV	%k0, %VRDX
> +	test	%VRDX, %VRDX
> +	jnz	L(ret_vec_x3)
> +
> +	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	VPTESTN	%VMM(4), %VMM(4), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x4)
> +
> +	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
> +
> +
> +	/* Align for 4x loop.  */
> +	subq	%rsi, %rdi
> +
> +	/* + VEC_SIZE * 5 because we never added the original VEC_SIZE
> +	   we covered before aligning.  */
> +	subq	$-(VEC_SIZE * 5), %rsi
> +	andq	$-(VEC_SIZE * 4), %rsi
> +
> +
> +	/* Load first half of the loop before entry.  */
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +	KORTEST	%k2, %k4
> +	jnz	L(loop_4x_done)
> +
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +
> +	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +
> +	subq	$(VEC_SIZE * -4), %rsi
> +
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +	KORTEST	%k2, %k4
> +	jz	L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	/* Restore rdi (dst).  */
> +	addq	%rsi, %rdi
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x0_end)
> +	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +
> +	KMOV	%k2, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x1)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x2)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +	/* Place L(ret_vec_x4) here to save code size.  We get a
> +	   meaningful benefit doing this for stpcpy.  */
> +	KMOV	%k4, %VRDX
> +L(ret_vec_x3):
> +	bsf	%VRDX, %VRDX
> +	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -	lea	2(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$3, %r8
> -	lea	3(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	(VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
>  # endif
> +L(return_end):
>  	ret
>  
> -	.p2align 4
> -L(Exit3):
> -	mov	(%rsi), %edx
> -	mov	%edx, (%rdi)
> +	.p2align 4,, 6
> +L(ret_vec_x0_end):
> +	bsf	%VRCX, %VRCX
>  # ifdef USE_AS_STPCPY
> -	lea	3(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$4, %r8
> -	lea	4(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
> +	inc	%VRCX
> +	VMOVU	(-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  	ret
>  
> -	.p2align 4
> -L(Exit4_7):
> -	mov	(%rsi), %ecx
> -	mov	%ecx, (%rdi)
> -	mov	-3(%rsi, %rdx), %ecx
> -	mov	%ecx, -3(%rdi, %rdx)
> +	.p2align 4,, 8
> +L(ret_vec_x1):
> +	bsf	%VRCX, %VRCX
> +	VMOVU	(VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	%rdx, %r8
> -	sub	$1, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>  	ret
>  
> -	.p2align 4
> -L(Exit8_15):
> -	mov	(%rsi), %rcx
> -	mov	-7(%rsi, %rdx), %r9
> -	mov	%rcx, (%rdi)
> -	mov	%r9, -7(%rdi, %rdx)
> +	.p2align 4,, 4
> +L(ret_vec_x2):
> +	bsf	%VRCX, %VRCX
> +	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	%rdx, %r8
> -	sub	$1, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>  	ret
>  
> -	.p2align 4
> -L(Exit16_31):
> -	VMOVU	(%rsi), %XMM2
> -	VMOVU	-15(%rsi, %rdx), %XMM3
> -	VMOVU	%XMM2, (%rdi)
> -	VMOVU	%XMM3, -15(%rdi, %rdx)
> +	/* ret_vec_x3 reuses return code after the loop.  */
> +	.p2align 4,, 6
> +L(ret_vec_x4):
> +	bsf	%VRCX, %VRCX
> +	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub %rdx, %r8
> -	sub $1, %r8
> -	lea 1(%rdi, %rdx), %rdi
> -	jnz L(StrncpyFillTailWithZero)
> +	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>  	ret
>  
> -	.p2align 4
> -L(Exit32_63):
> -	VMOVU	(%rsi), %YMM2
> -	VMOVU	-31(%rsi, %rdx), %YMM3
> -	VMOVU	%YMM2, (%rdi)
> -	VMOVU	%YMM3, -31(%rdi, %rdx)
> -# ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> +
> +	.p2align 4,, 4
> +L(page_cross):
> +# ifndef USE_AS_STRCAT
> +	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
>  # endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	%rdx, %r8
> -	sub	$1, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	movq	%rsi, %rcx
> +	andq	$(VEC_SIZE * -1), %rcx
> +
> +	VPCMPEQ	(%rcx), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +# ifdef USE_AS_WCSCPY
> +	andl	$(VEC_SIZE - 1), %PAGE_ALIGN_REG
> +	shrl	$2, %PAGE_ALIGN_REG
>  # endif
> -	ret
> +	shrx	%VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
>  
> -# ifdef USE_AS_STRNCPY
> +# if USE_MOVSB_IN_PAGE_CROSS
> +	/* Optimizing more aggressively for space as this is very cold
> +	   code. This saves 2x cache lines.  */
>  
> -	.p2align 4
> -L(StrncpyExit1):
> -	movzbl	(%rsi), %edx
> -	mov	%dl, (%rdi)
> -#  ifdef USE_AS_STPCPY
> -	lea	1(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, 1(%rdi)
> +	/* This adds one to the later bsf result, which gives the
> +	   correct copy bound (the count includes the null terminator).
> +	   NB: this can never zero-out a non-zero RCX because, to be in
> +	   the page cross case, rsi cannot be aligned and we have
> +	   already right-shifted rcx by the misalignment.  */
> +	shl	%VRCX
> +	jz	L(page_cross_continue)
> +#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +	movq	%rdi, %rax
>  #  endif
> -	ret
> +	bsf	%VRCX, %VRCX
> +	REP_MOVS
>  
> -	.p2align 4
> -L(StrncpyExit2):
> -	movzwl	(%rsi), %edx
> -	mov	%dx, (%rdi)
>  #  ifdef USE_AS_STPCPY
> -	lea	2(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, 2(%rdi)
> +	leaq	-CHAR_SIZE(%rdi), %rax
>  #  endif
>  	ret
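
The shl/bsf/rep-movsb trick above can be summarized in C roughly as
follows (byte chars assumed; page_cross_copy_model is a made-up name
and the NULL return stands in for falling through to
L(page_cross_continue)):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* ZMASK has bit i set iff src[i] == 0 and has already been shifted
   right by the page misalignment, so its top bit cannot be set and the
   shift below never discards a match.  Returns the address of the
   copied NUL, or NULL when no NUL lies before the page boundary.  */
static char *
page_cross_copy_model (char *dst, const char *src, uint64_t zmask)
{
  zmask <<= 1;			/* bit (i + 1) now marks the NUL at i */
  if (zmask == 0)
    return NULL;		/* no NUL before the page end */
  size_t count = (size_t) __builtin_ctzll (zmask);	/* i + 1 bytes */
  memcpy (dst, src, count);	/* the assembly uses `rep movsb` */
  return dst + count - 1;	/* stpcpy-style end pointer */
}

Folding the "+ 1 for the terminator" into the mask shift is what keeps
this cold path down to a handful of instructions.
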
>  
> -	.p2align 4
> -L(StrncpyExit3_4):
> -	movzwl	(%rsi), %ecx
> -	movzwl	-2(%rsi, %r8), %edx
> -	mov	%cx, (%rdi)
> -	mov	%dx, -2(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	ret
>  
> -	.p2align 4
> -L(StrncpyExit5_8):
> -	mov	(%rsi), %ecx
> -	mov	-4(%rsi, %r8), %edx
> -	mov	%ecx, (%rdi)
> -	mov	%edx, -4(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	ret
> +# else
> +	/* Check if we found zero-char before end of page.  */
> +	test	%VRCX, %VRCX
> +	jz	L(page_cross_continue)
>  
> -	.p2align 4
> -L(StrncpyExit9_16):
> -	mov	(%rsi), %rcx
> -	mov	-8(%rsi, %r8), %rdx
> -	mov	%rcx, (%rdi)
> -	mov	%rdx, -8(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	ret
> +	/* Traditional copy case, essentially same as used in non-page-
> +	   cross case but since we can't reuse VMM(0) we need twice as
> +	   many loads from rsi.  */
>  
> -	.p2align 4
> -L(StrncpyExit17_32):
> -	VMOVU	(%rsi), %XMM2
> -	VMOVU	-16(%rsi, %r8), %XMM3
> -	VMOVU	%XMM2, (%rdi)
> -	VMOVU	%XMM3, -16(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> +#  ifndef USE_AS_STRCAT
> +	xorl	%edx, %edx
>  #  endif
> -	ret
> -
> -	.p2align 4
> -L(StrncpyExit33_64):
> -	/*  0/32, 31/16 */
> -	VMOVU	(%rsi), %YMM2
> -	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
> -	VMOVU	%YMM2, (%rdi)
> -	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
> +	/* Dependency on rdi must already have been satisfied.  */
> +	bsf	%VRCX, %VRDX
>  #  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#  elif !defined USE_AS_STRCAT
> +	movq	%rdi, %rax
>  #  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	ret
>  
> -	.p2align 4
> -L(StrncpyExit65):
> -	/* 0/32, 32/32, 64/1 */
> -	VMOVU	(%rsi), %YMM2
> -	VMOVU	32(%rsi), %YMM3
> -	mov	64(%rsi), %cl
> -	VMOVU	%YMM2, (%rdi)
> -	VMOVU	%YMM3, 32(%rdi)
> -	mov	%cl, 64(%rdi)
> -#  ifdef USE_AS_STPCPY
> -	lea	65(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, 65(%rdi)
> +#  if VEC_SIZE == 64
> +#   ifdef USE_AS_WCSCPY
> +	testb	%cl, %cl
> +#   else
> +	test	%ecx, %ecx
> +#   endif
> +	jz	L(page_cross_copy_32_63)
>  #  endif
> -	ret
> -
> -#  ifndef USE_AS_STRCAT
>  
> -	.p2align 4
> -L(Fill1):
> -	mov	%dl, (%rdi)
> -	ret
> +#  ifdef USE_AS_WCSCPY
> +	testb	$0xf, %cl
> +#  else
> +	testw	%cx, %cx
> +#  endif
> +	jz	L(page_cross_copy_16_31)
>  
> -	.p2align 4
> -L(Fill2):
> -	mov	%dx, (%rdi)
> -	ret
> +#  ifdef USE_AS_WCSCPY
> +	testb	$0x3, %cl
> +#  else
> +	testb	%cl, %cl
> +#  endif
> +	jz	L(page_cross_copy_8_15)
>  
> -	.p2align 4
> -L(Fill3_4):
> -	mov	%dx, (%rdi)
> -	mov     %dx, -2(%rdi, %r8)
> +#  ifdef USE_AS_WCSCPY
> +	movl	(%rsi), %esi
> +	movl	%esi, (%rdi)
> +	movl	$0, (%END_REG)
>  	ret
> +#  else
>  
> -	.p2align 4
> -L(Fill5_8):
> -	mov	%edx, (%rdi)
> -	mov     %edx, -4(%rdi, %r8)
> -	ret
> +	testb	$0x7, %cl
> +	jz	L(page_cross_copy_4_7)
>  
> -	.p2align 4
> -L(Fill9_16):
> -	mov	%rdx, (%rdi)
> -	mov	%rdx, -8(%rdi, %r8)
> +	test	%edx, %edx
> +	jz	L(page_cross_set_null_term)
> +	movzwl	(%rsi), %ecx
> +	movw	%cx, (%rdi)
> +L(page_cross_set_null_term):
> +	movb	$0, (%END_REG)
>  	ret
>  
> -	.p2align 4
> -L(Fill17_32):
> -	VMOVU	%XMMZERO, (%rdi)
> -	VMOVU	%XMMZERO, -16(%rdi, %r8)
> -	ret
>  
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec2):
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -
> -	.p2align 4
> -L(CopyVecSizeVecExit):
> -	bsf	%edx, %edx
> -	add	$(VEC_SIZE - 1), %r8
> -	add	%rcx, %rdi
> -#   ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -#   endif
> -	sub	%rdx, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -
> -	.p2align 4
> -L(StrncpyFillTailWithZero):
> -	xor	%edx, %edx
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(StrncpyFillExit)
> -
> -	VMOVU	%YMMZERO, (%rdi)
> -	add	$VEC_SIZE, %rdi
> -
> -	mov	%rdi, %rsi
> -	and	$(VEC_SIZE - 1), %esi
> -	sub	%rsi, %rdi
> -	add	%rsi, %r8
> -	sub	$(VEC_SIZE * 4), %r8
> -	jb	L(StrncpyFillLessFourVecSize)
> -
> -L(StrncpyFillLoopVmovdqa):
> -	VMOVA	%YMMZERO, (%rdi)
> -	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
> -	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
> -	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
> -	add	$(VEC_SIZE * 4), %rdi
> -	sub	$(VEC_SIZE * 4), %r8
> -	jae	L(StrncpyFillLoopVmovdqa)
> -
> -L(StrncpyFillLessFourVecSize):
> -	add	$(VEC_SIZE * 2), %r8
> -	jl	L(StrncpyFillLessTwoVecSize)
> -	VMOVA	%YMMZERO, (%rdi)
> -	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
> -	add	$(VEC_SIZE * 2), %rdi
> -	sub	$VEC_SIZE, %r8
> -	jl	L(StrncpyFillExit)
> -	VMOVA	%YMMZERO, (%rdi)
> -	add	$VEC_SIZE, %rdi
> -	jmp	L(Fill)
> -
> -	.p2align 4
> -L(StrncpyFillLessTwoVecSize):
> -	add	$VEC_SIZE, %r8
> -	jl	L(StrncpyFillExit)
> -	VMOVA	%YMMZERO, (%rdi)
> -	add	$VEC_SIZE, %rdi
> -	jmp	L(Fill)
> -
> -	.p2align 4
> -L(StrncpyFillExit):
> -	add	$VEC_SIZE, %r8
> -L(Fill):
> -	cmp	$17, %r8d
> -	jae	L(Fill17_32)
> -	cmp	$9, %r8d
> -	jae	L(Fill9_16)
> -	cmp	$5, %r8d
> -	jae	L(Fill5_8)
> -	cmp	$3, %r8d
> -	jae	L(Fill3_4)
> -	cmp	$1, %r8d
> -	ja	L(Fill2)
> -	je	L(Fill1)
> +	.p2align 4,, 4
> +L(page_cross_copy_4_7):
> +	movl	(%rsi), %ecx
> +	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
> +	movl	%ecx, (%rdi)
> +	movl	%esi, -(4 - CHAR_SIZE)(%END_REG)
>  	ret
> -
> -/* end of ifndef USE_AS_STRCAT */
>  #  endif
>  
> -	.p2align 4
> -L(UnalignedLeaveCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(UnalignedFourVecSizeLeaveCase2)
> -L(UnalignedFourVecSizeLeaveCase3):
> -	lea	(VEC_SIZE * 4)(%r8), %rcx
> -	and	$-VEC_SIZE, %rcx
> -	add	$(VEC_SIZE * 3), %r8
> -	jl	L(CopyVecSizeCase3)
> -	VMOVU	%YMM4, (%rdi)
> -	sub	$VEC_SIZE, %r8
> -	jb	L(CopyVecSizeCase3)
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -	sub	$VEC_SIZE, %r8
> -	jb	L(CopyVecSizeCase3)
> -	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
> -	sub	$VEC_SIZE, %r8
> -	jb	L(CopyVecSizeCase3)
> -	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
> -#  ifdef USE_AS_STPCPY
> -	lea	(VEC_SIZE * 4)(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (VEC_SIZE * 4)(%rdi)
> -#  endif
> +#  if VEC_SIZE == 64
> +	.p2align 4,, 4
> +L(page_cross_copy_32_63):
> +	VMOVU	(%rsi), %VMM_256(0)
> +	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +	VMOVU	%VMM_256(0), (%rdi)
> +	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
>  	ret
> -
> -	.p2align 4
> -L(UnalignedFourVecSizeLeaveCase2):
> -	xor	%ecx, %ecx
> -	vpcmpb	$0, %YMM4, %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	add	$(VEC_SIZE * 3), %r8
> -	jle	L(CopyVecSizeCase2OrCase3)
> -	test	%edx, %edx
> -#  ifndef USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec4)
> -#  else
> -	jnz	L(CopyVecSize)
> -#  endif
> -	vpcmpb	$0, %YMM5, %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	VMOVU	%YMM4, (%rdi)
> -	add	$VEC_SIZE, %rcx
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -	test	%edx, %edx
> -#  ifndef USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec5)
> -#  else
> -	jnz	L(CopyVecSize)
>  #  endif
>  
> -	vpcmpb	$0, %YMM6, %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -	add	$VEC_SIZE, %rcx
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -	test	%edx, %edx
> -#  ifndef USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec6)
> -#  else
> -	jnz	L(CopyVecSize)
> -#  endif
> -
> -	vpcmpb	$0, %YMM7, %YMMZERO, %k4
> -	kmovd	%k4, %edx
> -	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
> -	lea	VEC_SIZE(%rdi, %rcx), %rdi
> -	lea	VEC_SIZE(%rsi, %rcx), %rsi
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -L(StrncpyExit):
> -	cmp	$65, %r8d
> -	je	L(StrncpyExit65)
> -	cmp	$33, %r8d
> -	jae	L(StrncpyExit33_64)
> -	cmp	$17, %r8d
> -	jae	L(StrncpyExit17_32)
> -	cmp	$9, %r8d
> -	jae	L(StrncpyExit9_16)
> -	cmp	$5, %r8d
> -	jae	L(StrncpyExit5_8)
> -	cmp	$3, %r8d
> -	jae	L(StrncpyExit3_4)
> -	cmp	$1, %r8d
> -	ja	L(StrncpyExit2)
> -	je	L(StrncpyExit1)
> -#  ifdef USE_AS_STPCPY
> -	mov	%rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi)
> -#  endif
> +	.p2align 4,, 4
> +L(page_cross_copy_16_31):
> +	vmovdqu	(%rsi), %xmm0
> +	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +	vmovdqu	%xmm0, (%rdi)
> +	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
>  	ret
>  
> -	.p2align 4
> -L(ExitZero):
> -#  ifndef USE_AS_STRCAT
> -	mov	%rdi, %rax
> -#  endif
> +	.p2align 4,, 4
> +L(page_cross_copy_8_15):
> +	movq	(%rsi), %rcx
> +	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> +	movq	%rcx, (%rdi)
> +	movq	%rsi, -(8 - CHAR_SIZE)(%END_REG)
>  	ret
> -
> -# endif
> -
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# else
> -END (STRCAT)
>  # endif
> +END(STRCPY)
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
> index 203a19bf21..d648ba5cfe 100644
> --- a/sysdeps/x86_64/multiarch/strncat-evex.S
> +++ b/sysdeps/x86_64/multiarch/strncat-evex.S
> @@ -1,7 +1,520 @@
> -#ifndef STRNCAT
> -# define STRNCAT	__strncat_evex
> -#endif
> +/* {wcs|str}ncat  with 256/512-bit EVEX.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +	/* Use evex-masked stores for small sizes. Turned off at the
> +	   moment.  */
> +# define USE_EVEX_MASKED_STORE	0
> +
> +# include <sysdep.h>
> +
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +# ifndef STRNCAT
> +#  define STRNCAT	__strncat_evex
> +# endif
> +
> +
> +# ifdef USE_AS_WCSCPY
> +#  define movNULL	movl
> +#  define VMOVU_MASK	vmovdqu32
> +#  define VPMIN	vpminud
> +#  define VPTESTN	vptestnmd
> +#  define VPTEST	vptestmd
> +#  define VPCMPEQ	vpcmpeqd
> +#  define CHAR_SIZE	4
> +
> +#  define REP_MOVS	rep movsd
> +
> +#  define VMASK_REG	VR10
> +#  define FIND_FIRST_ONE(src, dst)	movl $CHAR_PER_VEC, %dst; bsf %src, %dst
> +
> +#  define USE_WIDE_CHAR
> +# else
> +#  define movNULL	movb
> +#  define VMOVU_MASK	vmovdqu8
> +#  define VPMIN	vpminub
> +#  define VPTESTN	vptestnmb
> +#  define VPTEST	vptestmb
> +#  define VPCMPEQ	vpcmpeqb
> +#  define CHAR_SIZE	1
> +
> +#  define REP_MOVS	rep movsb
> +
> +#  define VMASK_REG	VRCX
> +#  define FIND_FIRST_ONE(src, dst)	tzcnt %src, %dst
> +
> +# endif
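
For reference, FIND_FIRST_ONE in both flavours behaves like the C
below (a sketch; find_first_one_model is not a name from the patch).
The byte variant can use tzcnt directly because the mask width equals
CHAR_PER_VEC; the wide-char variant preloads CHAR_PER_VEC and relies
on bsf leaving its destination unchanged for a zero source:

#include <stdint.h>

static unsigned int
find_first_one_model (uint32_t mask, unsigned int char_per_vec)
{
  unsigned int result = char_per_vec;	/* movl $CHAR_PER_VEC, %dst */
  if (mask != 0)
    result = (unsigned int) __builtin_ctz (mask);	/* bsf */
  return result;
}
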
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# include "reg-macros.h"
> +
> +
> +# define VZERO	VMM(7)
> +# define VZERO_128	VMM_128(7)
> +
> +# define PAGE_SIZE	4096
> +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
> +
> +	.section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCAT)
> +	movq	%rdi, %rax
> +
> +	/* NB: It's safe to filter out zero-length strings WITHOUT
> +	   setting null-term. Destination MUST be a null-terminated
> +	   string so essentially the work is already done.  */
> +# ifdef USE_AS_WCSCPY
> +	leaq	-1(%rdx), %rcx
> +	shrq	$56, %rcx
> +	jnz	L(zero_len)
> +# else
> +	test	%rdx, %rdx
> +	jle	L(zero_len)
> +# endif
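
In case it helps review, the zero/huge length filtering above (together
with L(zero_len) further down) boils down to this C sketch of the
wide-char variant (classify_len is a made-up name):

#include <stdint.h>

/* The `shrq $56` catches both n == 0 (via the wrap of n - 1) and
   lengths beyond the 56-bit user address space; L(zero_len) then tells
   the two cases apart.  */
static int
classify_len (uint64_t n)
{
  if (n == 0)
    return 0;	/* nothing to append; dst is already NUL-terminated */
  if (((n - 1) >> 56) != 0)
    return 1;	/* impossible length; diverted to OVERFLOW_STRCAT */
  return 2;	/* normal path */
}
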
> +
> +# include "strcat-strlen-evex.S"
> +
> +	movl	%esi, %ecx
> +	andl	$(PAGE_SIZE - 1), %ecx
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
> +	ja	L(page_cross)
> +L(page_cross_continue):
> +	VMOVU	(%rsi), %VMM(0)
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +
> +	/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> +	   <= CHAR_PER_VEC with masked instructions (which have
> +	   potential for dramatically bad perf if dst splits a page and
> +	   is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +	KMOV	%k0, %VRCX
> +	FIND_FIRST_ONE (VRCX, VR8)
> +	cmpq	%r8, %rdx
> +	jbe	L(less_1x_vec)
> +
> +	test	%VRCX, %VRCX
> +	jz	L(more_1x_vec)
> +
> +	blsmsk	%VRCX, %VRCX
> +	KMOV	%VRCX, %k1
> +	VMOVU_MASK %VMM(0), (%rdi){%k1}
> +	ret
> +
> +L(less_1x_vec):
> +	mov	$-1, %VRCX
> +	bzhi	%VRDX, %VRCX, %VRCX
> +	KMOV	%VRCX, %k1
> +	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
> +	VMOVU_MASK %VMM(0), (%rdi){%k1}
> +
> +	ret
> +# else
> +	KMOV	%k0, %VMASK_REG
> +	/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
> +	   %VMASK_REG, %VRCX` for wcsncat.  */
> +	FIND_FIRST_ONE (VMASK_REG, VRCX)
> +	cmpq	%rcx, %rdx
> +	jbe	L(less_1x_vec)
> +
> +	/* If there were no zero-CHARs (rcx was zero before
> +	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +	cmpl	$CHAR_PER_VEC, %ecx
> +	je	L(more_1x_vec)
> +
> +	movl	%ecx, %edx
> +
> +L(less_1x_vec):
> +#  if VEC_SIZE == 64
> +	cmpl	$(32 / CHAR_SIZE), %edx
> +	jae	L(copy_32_63)
> +#  endif
> +
> +	cmpl	$(16 / CHAR_SIZE), %edx
> +	jae	L(copy_16_31)
> +
> +
> +	cmpl	$(8 / CHAR_SIZE), %edx
> +	jae	L(copy_8_15)
> +
> +#  ifdef USE_AS_WCSCPY
> +	vmovd	%VMM_128(0), (%rdi)
> +	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  else
> +
> +	cmpl	$4, %edx
> +	jae	L(copy_4_7)
> +
> +	movzbl	(%rsi), %ecx
> +	cmpl	$1, %edx
> +	jbe	L(set_null_term)
> +
> +	movzwl	1(%rsi), %esi
> +	movw	%si, 1(%rdi)
> +
> +	.p2align 4,, 1
> +L(set_null_term):
> +	movb	%cl, (%rdi)
> +	movNULL	$0, (%rdi, %rdx)
> +	ret
> +#  endif
> +
> +#  if VEC_SIZE == 64
> +	.p2align 4,, 6
> +L(copy_32_63):
> +	VMOVU	-(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +	VMOVU	%VMM_256(0), (%rdi)
> +	VMOVU	%VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
> +	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  endif
> +	.p2align 4,, 6
> +L(copy_16_31):
> +	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
> +	   and will save code size.  */
> +	vmovdqu	-(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +	VMOVU	%VMM_128(0), (%rdi)
> +	vmovdqu	%xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
> +	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +
> +	.p2align 4,, 2
> +L(copy_8_15):
> +	movq	-(8)(%rsi, %rdx, CHAR_SIZE), %rcx
> +	vmovq	%VMM_128(0), (%rdi)
> +	movq	%rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
> +	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +
> +#  ifndef USE_AS_WCSCPY
> +	.p2align 4,, 12
> +L(copy_4_7):
> +	movl	-(4)(%rsi, %rdx, CHAR_SIZE), %ecx
> +	vmovd	%VMM_128(0), (%rdi)
> +	movl	%ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
> +	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  endif
> +
> +# endif
> +	.p2align 4,, 4
> +L(zero_len):
> +# ifdef USE_AS_WCSCPY
> +	test	%rdx, %rdx
> +# endif
> +	jne	OVERFLOW_STRCAT
> +	ret
>  
> -#define USE_AS_STRNCAT
> -#define STRCAT	STRNCAT
> -#include "strcat-evex.S"
> +	.p2align 4,, 8
> +L(more_1x_vec):
> +	VMOVU	%VMM(0), (%rdi)
> +
> +	/* We are going to align rsi here so we will need to be able to
> +	   re-adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> +	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
> +
> +	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> +	subq	%rsi, %rdi
> +	andq	$-(VEC_SIZE), %rsi
> +L(loop_last_4x_vec):
> +	addq	%rsi, %rdi
> +	subq	%rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +	shrq	$2, %rdx
> +# endif
> +
> +	/* Will need this regardless.  */
> +	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VMASK_REG
> +
> +	cmpq	$(CHAR_PER_VEC * 2), %rdx
> +	ja	L(more_2x_vec)
> +
> +L(last_2x_vec):
> +	FIND_FIRST_ONE (VMASK_REG, VRCX)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x1_len)
> +
> +	/* If there were no zero-CHARs (rcx was zero before
> +	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +	cmpl	$CHAR_PER_VEC, %ecx
> +	jne	L(ret_vec_x1)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	addl	$-CHAR_PER_VEC, %edx
> +	bzhi	%VRDX, %VRCX, %VR8
> +	jz	L(ret_vec_x2_len)
> +L(ret_vec_x2):
> +	bsf	%VRCX, %VRDX
> +L(ret_vec_x2_len):
> +	VMOVU	(VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	movNULL	$0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> +	VMOVU	%VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +	ret
> +
> +	.p2align 4,, 4
> +L(ret_vec_x1_len):
> +	movl	%edx, %ecx
> +L(ret_vec_x1):
> +	VMOVU	(VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	movNULL	$0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
> +	VMOVU	%VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +	VZEROUPPER_RETURN
> +
> +
> +	.p2align 4,, 8
> +L(last_4x_vec):
> +	addl	$-(CHAR_PER_VEC * 4), %edx
> +	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VMASK_REG
> +	subq	$-(VEC_SIZE * 4), %rsi
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpl	$(CHAR_PER_VEC * 2), %edx
> +	jbe	L(last_2x_vec)
> +	.p2align 4,, 8
> +L(more_2x_vec):
> +# ifdef USE_AS_WCSCPY
> +	xorl	%ecx, %ecx
> +# endif
> +	bsf	%VMASK_REG, %VRCX
> +	jnz	L(ret_vec_x1)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x2)
> +
> +	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
> +	VPTESTN	%VMM(3), %VMM(3), %k0
> +	KMOV	%k0, %VMASK_REG
> +
> +	cmpq	$(CHAR_PER_VEC * 4), %rdx
> +	ja	L(more_4x_vec)
> +
> +	/* Adjust length before going to L(ret_vec_x3_len) or
> +	   L(ret_vec_x3).  */
> +	addl	$(CHAR_PER_VEC * -2), %edx
> +
> +	FIND_FIRST_ONE (VMASK_REG, VRCX)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x3_len)
> +
> +	/* If there were no zero-CHARs (rcx was zero before
> +	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +	cmpl	$CHAR_PER_VEC, %ecx
> +	jne	L(ret_vec_x3)
> +
> +	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	VPTESTN	%VMM(4), %VMM(4), %k0
> +	KMOV	%k0, %VRCX
> +	addl	$-CHAR_PER_VEC, %edx
> +	bzhi	%VRDX, %VRCX, %VR8
> +	jz	L(ret_vec_x4_len)
> +L(ret_vec_x4):
> +	bsf	%VRCX, %VRDX
> +L(ret_vec_x4_len):
> +	VMOVU	(VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	movNULL	$0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
> +	VMOVU	%VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +	ret
> +
> +	.p2align 4,, 4
> +L(ret_vec_x3_len):
> +	movl	%edx, %ecx
> +L(ret_vec_x3):
> +	VMOVU	(VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	movNULL	$0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> +	VMOVU	%VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +	ret
> +
> +	.p2align 4,, 8
> +L(more_4x_vec):
> +# ifdef USE_AS_WCSCPY
> +	xorl	%ecx, %ecx
> +# endif
> +	bsf	%VMASK_REG, %VRCX
> +	jnz	L(ret_vec_x3)
> +
> +	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	VPTESTN	%VMM(4), %VMM(4), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x4)
> +
> +	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
> +
> +	/* Check if we are near the end before aligning.  */
> +	cmpq	$(CHAR_PER_VEC * 8), %rdx
> +	jbe	L(last_4x_vec)
> +
> +
> +	/* Add rsi to rdx (length) before aligning rsi. NB: Since we
> +	   filtered out huge lengths this cannot overflow.  */
> +# ifdef USE_AS_WCSCPY
> +	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
> +# else
> +	addq	%rsi, %rdx
> +# endif
> +
> +	/* Subtract rsi from rdi before aligning (add back will have
> +	   correct rdi for aligned rsi).  */
> +	subq	%rsi, %rdi
> +	subq	$-(VEC_SIZE * 5), %rsi
> +	andq	$(VEC_SIZE * -4), %rsi
> +
> +	/* Load first half of the loop before entry.  */
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +
> +	/* Offset rsi by VEC_SIZE so that we can jump to
> +	   L(loop_last_4x_vec).  */
> +	addq	$-(VEC_SIZE), %rsi
> +	KORTEST	%k2, %k4
> +	jnz	L(loop_4x_done)
> +
> +	/* Store loop end in r9.  */
> +	leaq	-(VEC_SIZE * 5)(%rdx), %r9
> +
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> +
> +	subq	$(VEC_SIZE * -4), %rsi
> +	cmpq	%rsi, %r9
> +	jbe	L(loop_last_4x_vec)
> +
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +	KORTEST	%k2, %k4
> +	jz	L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	/* Restore rdi (dst).  */
> +	addq	%rsi, %rdi
> +
> +	/* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
> +	   test with bsf.  */
> +	bsf	%VRCX, %VRCX
> +	jnz	L(ret_vec_x1)
> +	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
> +
> +	KMOV	%k2, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x2)
> +	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> +
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	bsf	%VRCX, %VRCX
> +	jnz	L(ret_vec_x3)
> +	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> +
> +	KMOV	%k4, %VRCX
> +	bsf	%VRCX, %VRCX
> +	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +	ret
> +
> +
> +	.p2align 4,, 4
> +L(page_cross):
> +	movq	%rsi, %r8
> +	andq	$(VEC_SIZE * -1), %r8
> +	VPCMPEQ	(%r8), %VZERO, %k0
> +
> +# ifdef USE_AS_WCSCPY
> +	KMOV	%k0, %VR9
> +	shrl	$2, %ecx
> +	andl	$(CHAR_PER_VEC - 1), %ecx
> +	shrx	%VRCX, %VR9, %VRCX
> +# else
> +	KMOV	%k0, %VRCX
> +	shrx	%VRSI, %VRCX, %VRCX
> +# endif
> +
> +	subl	%esi, %r8d
> +	andl	$(VEC_SIZE - 1), %r8d
> +# ifdef USE_AS_WCSCPY
> +	shrl	$2, %r8d
> +# endif
> +	cmpq	%r8, %rdx
> +	jbe	L(page_cross_small)
> +	/* Optimizing more for space as this is very cold code. This
> +	   saves 2x cache lines.  */
> +
> +	/* This adds one to the later bsf result, which gives the
> +	   correct copy bound (the count includes the null terminator).
> +	   NB: this can never zero-out a non-zero RCX because, to be in
> +	   the page cross case, rsi cannot be aligned and we have
> +	   already right-shifted rcx by the misalignment.  */
> +	shl	%VRCX
> +	jz	L(page_cross_continue)
> +	bsf	%VRCX, %VRCX
> +	REP_MOVS
> +	ret
> +
> +L(page_cross_small):
> +	tzcnt	%VRCX, %VRCX
> +	jz	L(page_cross_setz)
> +	cmpl	%edx, %ecx
> +	cmova	%edx, %ecx
> +
> +# ifdef USE_AS_WCSCPY
> +	rep	movsd
> +# else
> +	rep	movsb
> +# endif
> +L(page_cross_setz):
> +	movNULL	$0, (%rdi)
> +	ret
> +END(STRNCAT)
> +#endif
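
Summarizing the cold L(page_cross_small) tail above in C (a sketch:
append_small_model and nul_idx are illustration names, dst_end is the
end of the existing destination string found by the inlined strlen):

#include <stddef.h>
#include <string.h>

/* Append at most N chars, stopping early at the source NUL (NUL_IDX is
   the tzcnt result: the index of the first zero char, or a value >= N
   when none was found in the probed vector), then always terminate.  */
static void
append_small_model (char *dst_end, const char *src, size_t n,
		    size_t nul_idx)
{
  size_t count = nul_idx < n ? nul_idx : n;	/* the `cmova` cap */
  memcpy (dst_end, src, count);			/* the `rep movs` */
  dst_end[count] = '\0';			/* movNULL $0, (%rdi) */
}
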
> diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
> index 1b3426d511..49eaf4cbd9 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
> @@ -1,7 +1,990 @@
> -#ifndef STRNCPY
> -# define STRNCPY	__strncpy_evex
> -#endif
> +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +	/* Use evex-masked stores for small sizes. Turned off at the
> +	   moment.  */
> +# define USE_EVEX_MASKED_STORE	0
> +
> +
> +# include <sysdep.h>
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +
> +# ifndef STRNCPY
> +#  define STRNCPY	__strncpy_evex
> +# endif
> +
> +# ifdef USE_AS_WCSCPY
> +#  define VMOVU_MASK	vmovdqu32
> +#  define VPCMPEQ	vpcmpeqd
> +#  define VPMIN	vpminud
> +#  define VPTESTN	vptestnmd
> +#  define VPTEST	vptestmd
> +#  define CHAR_SIZE	4
> +
> +#  define REP_MOVS	rep movsd
> +#  define REP_STOS	rep stosl
> +
> +#  define USE_WIDE_CHAR
> +
> +# else
> +#  define VMOVU_MASK	vmovdqu8
> +#  define VPCMPEQ	vpcmpeqb
> +#  define VPMIN	vpminub
> +#  define VPTESTN	vptestnmb
> +#  define VPTEST	vptestmb
> +#  define CHAR_SIZE	1
> +
> +#  define REP_MOVS	rep movsb
> +#  define REP_STOS	rep stosb
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# define PAGE_SIZE	4096
> +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
> +
> +# include "reg-macros.h"
> +
> +
> +# define VZERO	VMM(7)
> +# define VZERO_256	VMM_256(7)
> +# define VZERO_128	VMM_128(7)
> +
> +# if VEC_SIZE == 64
> +#  define VZERO_HALF	VZERO_256
> +# else
> +#  define VZERO_HALF	VZERO_128
> +# endif
> +
> +	.section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCPY)
> +	/* Filter zero length strings and very long strings.  Zero
> +	   length strings just return; very long strings are handled by
> +	   running rep stos{b|l} to zero-fill the destination (which
> +	   will almost certainly segfault) and, if that succeeds, then
> +	   calling OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
> +# ifdef USE_AS_WCSCPY
> +	decq	%rdx
> +	movq	%rdx, %rax
> +	/* 56 is end of max supported address space.  */
> +	shr	$56, %rax
> +	jnz	L(zero_len)
> +# else
> +	decq	%rdx
> +	/* If the flag needs to become `jb` replace `dec` with `sub`.
> +	 */
> +	jl	L(zero_len)
> +# endif
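
Stepping back, the reference semantics that the rest of this function
implements (copy, then zero-fill the remainder) are simply the
following; strncpy_model is a made-up name for this C sketch:

#include <stddef.h>
#include <string.h>

static char *
strncpy_model (char *dst, const char *src, size_t n)
{
  size_t len = strnlen (src, n);	/* at most n chars of src */
  memcpy (dst, src, len);		/* copy the string part */
  memset (dst + len, 0, n - len);	/* "zfill" the rest */
  return dst;
}

The various L(zfill*) paths below are this memset, specialized by how
much of the buffer is left.
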
> +
> +	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
> +	movl	%esi, %eax
> +	andl	$(PAGE_SIZE - 1), %eax
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> +	ja	L(page_cross)
> +
> +L(page_cross_continue):
> +	VMOVU	(%rsi), %VMM(0)
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +
> +	/* If no STPCPY just save end ahead of time.  */
> +# ifndef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# endif
> +
> +
> +	cmpq	$(CHAR_PER_VEC), %rdx
> +
> +	/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> +	   <= CHAR_PER_VEC with masked instructions (which have
> +	   potential for dramatically bad perf if dst splits a page and
> +	   is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +	/* `jae` because length rdx is now length - 1.  */
> +	jae	L(more_1x_vec)
> +
> +	/* If there were multiple zero-CHAR matches in the first VEC,
> +	   VRCX will be overset, but that's fine since any overset bits
> +	   are at zero positions anyway.  */
> +
> +#  ifdef USE_AS_STPCPY
> +	tzcnt	%VRCX, %VRAX
> +	cmpl	%eax, %edx
> +	cmovb	%edx, %eax
> +#   ifdef USE_AS_WCSCPY
> +	adcl	$0, %eax
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +#   else
> +	adcq	%rdi, %rax
> +#   endif
> +#  endif
> +	dec	%VRCX
> +
> +	/* Zero out all non-zero CHARs after the first zero match.  */
> +	KMOV	%VRCX, %k1
> +
> +	/* Use VZERO as the destination so this can be reused for
> +	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
> +	   will have zeroed out VZERO).  */
> +	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
> +L(zfill_less_vec):
> +	/* Get mask for what we need to set.  */
> +	incl	%edx
> +	mov	$-1, %VRCX
> +	bzhi	%VRDX, %VRCX, %VRCX
> +	KMOV	%VRCX, %k1
> +	VMOVU_MASK %VZERO, (%rdi){%k1}
> +	ret
> +
> +	.p2align 4,, 4
> +L(zero_len):
> +	cmpq	$-1, %rdx
> +	jne	L(best_effort_strncpy)
> +	movq	%rdi, %rax
> +	ret
> +
> +	.p2align 4,, 8
> +L(more_1x_vec):
> +# else
> +	/* `jb` because length rdx is now length - 1.  */
> +	jb	L(less_1x_vec)
> +# endif
> +
> +
> +	/* This may overset but that's fine because we still need to
> +	   zero fill.  */
> +	VMOVU	%VMM(0), (%rdi)
> +
> +
> +	/* Length must be >= CHAR_PER_VEC so match here means we must
> +	   zero-fill.  */
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill)
> +
> +
> +	/* We are going to align rsi here so we will need to be able to
> +	   re-adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> +	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
> +	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> +	subq	%rsi, %rdi
> +	andq	$-(VEC_SIZE), %rsi
> +
> +L(loop_last_4x_vec):
> +	addq	%rsi, %rdi
> +	subq	%rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +	shrq	$2, %rdx
> +# endif
> +
> +	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VRCX
> +
> +	/* -1 because of the `dec %rdx` earlier.  */
> +	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
> +	ja	L(more_2x_vec)
> +
> +L(last_2x_vec):
> +	/* This will need to be computed no matter what. We do it
> +	   ahead of time for CHAR_PER_VEC == 64 because we can't adjust
> +	   the value of `tzcnt` with a shift.  */
> +# if CHAR_PER_VEC == 64
> +	tzcntq	%rcx, %rcx
> +# endif
> +
> +	cmpl	$(CHAR_PER_VEC), %edx
> +	jb	L(ret_vec_x1_len)
> +
> +	/* Separate logic for CHAR_PER_VEC == 64 because we already did
> +	   `tzcnt` on VRCX.  */
> +# if CHAR_PER_VEC == 64
> +	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
> +	cmpb	$CHAR_PER_VEC, %cl
> +	jnz	L(ret_vec_x1_no_bsf)
> +# else
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x1)
> +# endif
> +
> +
> +
> +	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %VZERO, %k0
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	KMOV	%k0, %VRCX
> +
> +# if CHAR_PER_VEC < 64
> +	/* This essentially adds CHAR_PER_VEC to the computed result.  */
> +	shlq	$CHAR_PER_VEC, %rcx
> +# else
> +	tzcntq	%rcx, %rcx
> +	addl	$CHAR_PER_VEC, %ecx
> +# endif
> +
> +	.p2align 4,, 4
> +L(ret_vec_x1_len):
> +	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
> +	   already been done.  */
> +# if CHAR_PER_VEC < 64
> +	tzcntq	%rcx, %rcx
> +# endif
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x1_len_no_zfill)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +L(ret_vec_x1_len_no_zfill_mov):
> +	movl	%ecx, %edx
> +# ifdef USE_AS_STPCPY
> +	/* clear flags.  */
> +	xorl	%ecx, %ecx
> +# endif
> +L(ret_vec_x1_len_no_zfill):
> +	VMOVU	((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +	leal	(VEC_SIZE)(%rdx), %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# endif
> +	ret
> +
> +
> +	.p2align 4,, 10
> +L(ret_vec_x1):
> +	bsf	%VRCX, %VRCX
> +L(ret_vec_x1_no_bsf):
> +	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +	subl	%ecx, %edx
> +	cmpl	$CHAR_PER_VEC, %edx
> +	jb	L(ret_vec_x1_len_no_zfill_mov)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	VMOVU	%VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +	leaq	(VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
> +# endif
> +	ret
> +
> +	.p2align 4,, 8
> +L(last_4x_vec):
> +	/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
> +	   $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
> +	   using `movzbl`.  */
> +# if CHAR_PER_VEC == 64
> +	movzbl	%dl, %edx
> +# else
> +	andl	$(CHAR_PER_VEC * 4 - 1), %edx
> +# endif
> +	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VRCX
> +	subq	$-(VEC_SIZE * 4), %rsi
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpl	$(CHAR_PER_VEC * 2 - 1), %edx
> +	jbe	L(last_2x_vec)
> +	.p2align 4,, 8
> +L(more_2x_vec):
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	test	%VRCX, %VRCX
> +	/* Must fill at least 2x VEC.  */
> +	jnz	L(zfill_vec1)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	/* Must fill at least 1x VEC.  */
> +	jnz	L(zfill_vec2)
> +
> +	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
> +	VPTESTN	%VMM(3), %VMM(3), %k0
> +	KMOV	%k0, %VRCX
> +
> +	/* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
> +	cmpq	$(CHAR_PER_VEC * 4 - 1), %rdx
> +	ja	L(more_4x_vec)
> +
> +	subl	$(CHAR_PER_VEC * 3), %edx
> +	jb	L(ret_vec_x3_len)
> +
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x3)
> +
> +	VPCMPEQ	(VEC_SIZE * 4)(%rsi), %VZERO, %k0
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	KMOV	%k0, %VRCX
> +	tzcnt	%VRCX, %VRCX
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x4_len_no_zfill)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +	movl	%ecx, %edx
> +L(ret_vec_x4_len_no_zfill):
> +	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +	leal	(VEC_SIZE * 4 + 0)(%rdx), %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# endif
> +	ret
> +
> +
> +L(ret_vec_x3_len):
> +	addl	$(CHAR_PER_VEC * 1), %edx
> +	tzcnt	%VRCX, %VRCX
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x3_len_no_zfill)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +L(ret_vec_x3_len_no_zfill_mov):
> +	movl	%ecx, %edx
> +# ifdef USE_AS_STPCPY
> +	/* clear flags.  */
> +	xorl	%ecx, %ecx
> +# endif
> +	.p2align 4,, 4
> +L(ret_vec_x3_len_no_zfill):
> +	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +	leal	(VEC_SIZE * 3 + 0)(%rdx), %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# endif
> +	ret
> +
> +
> +	.p2align 4,, 8
> +L(ret_vec_x3):
> +	bsf	%VRCX, %VRCX
> +	VMOVU	%VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
> +	subl	%ecx, %edx
> +	jl	L(ret_vec_x3_len_no_zfill_mov)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> +# endif
> +	ret
> +
> +	.p2align 4,, 8
> +L(more_4x_vec):
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec3)
> +
> +	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
> +	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
> +	VPTESTN	%VMM(4), %VMM(4), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec4)
>  
> -#define USE_AS_STRNCPY
> -#define STRCPY	STRNCPY
> -#include "strcpy-evex.S"
> +	/* Recheck length before aligning.  */
> +	cmpq	$(CHAR_PER_VEC * 8 - 1), %rdx
> +	jbe	L(last_4x_vec)
> +
> +	/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
> +# ifdef USE_AS_WCSCPY
> +	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
> +# else
> +	addq	%rsi, %rdx
> +# endif
> +	subq	%rsi, %rdi
> +	subq	$-(VEC_SIZE * 5), %rsi
> +	andq	$(VEC_SIZE * -4), %rsi
> +
> +
> +	/* Load first half of the loop before entry.  */
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +
> +
> +	/* Offset rsi by VEC_SIZE so that we can jump to
> +	   L(loop_last_4x_vec).  */
> +	addq	$-(VEC_SIZE), %rsi
> +	KORTEST	%k2, %k4
> +	jnz	L(loop_4x_done)
> +
> +	/* Store loop end in r9.  */
> +	leaq	-(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
> +
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> +
> +	subq	$(VEC_SIZE * -4), %rsi
> +	cmpq	%rsi, %r9
> +	jbe	L(loop_last_4x_vec)
> +
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +	KORTEST	%k2, %k4
> +	jz	L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +	/* Restore rdx (length).  */
> +	subq	%rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +	shrq	$2, %rdx
> +# endif
> +	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +	/* Restore rdi (dst).  */
> +	addq	%rsi, %rdi
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec1)
> +
> +	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> +	KMOV	%k2, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec2)
> +
> +	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec3)
> +
> +	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
> +	KMOV	%k4, %VRCX
> +	/* Fall through to zfill for the null found in the 4th VEC.  */
> +
> +	.p2align 4,, 4
> +L(zfill_vec4):
> +	subq	$(VEC_SIZE * -2), %rdi
> +	addq	$(CHAR_PER_VEC * -2), %rdx
> +L(zfill_vec2):
> +	subq	$(VEC_SIZE * -2), %rdi
> +	addq	$(CHAR_PER_VEC * -1), %rdx
> +L(zfill):
> +	/* VRCX must be non-zero.  */
> +	bsf	%VRCX, %VRCX
> +
> +	/* Adjust length / dst for zfill.  */
> +	subq	%rcx, %rdx
> +# ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +# else
> +	addq	%rcx, %rdi
> +# endif
> +# ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# endif
> +L(zfill_from_page_cross):
> +
> +	/* From here on out it's just memset(rdi, 0, rdx).  */
> +	cmpq	$CHAR_PER_VEC, %rdx
> +	jb	L(zfill_less_vec)
> +
> +L(zfill_more_1x_vec):
> +	VMOVU	%VZERO, (%rdi)
> +	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
> +	ja	L(zfill_more_2x_vec)
> +L(zfill_done0):
> +	ret
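
(Aside: the zfill sequence just above is essentially a small memset done
with a first store and a last store that may overlap.  Roughly, in
intrinsics form, assuming 32-byte vectors and a byte length n with
32 <= n <= 64; this is an illustration, not the patch code:)

    #include <immintrin.h>
    #include <stddef.h>

    /* Zero n bytes for 32 <= n <= 64: the two unaligned stores may
       overlap, so no branch on the exact length is needed.  */
    static void
    zfill_upto_2x_vec (char *dst, size_t n)
    {
      __m256i z = _mm256_setzero_si256 ();
      _mm256_storeu_si256 ((__m256i *) dst, z);
      _mm256_storeu_si256 ((__m256i *) (dst + n - 32), z);
    }
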
> +
> +	/* Coming from vec1/vec2 we must be able to zfill at least 2x
> +	   VEC.  */
> +	.p2align 4,, 8
> +L(zfill_vec3):
> +	subq	$(VEC_SIZE * -2), %rdi
> +	addq	$(CHAR_PER_VEC * -2), %rdx
> +	.p2align 4,, 2
> +L(zfill_vec1):
> +	bsfq	%rcx, %rcx
> +	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
> +	 */
> +	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> +	subq	%rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# endif
> +
> +
> +	VMOVU	%VZERO, (%rdi)
> +	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	cmpq	$(CHAR_PER_VEC * 2), %rdx
> +	jb	L(zfill_done0)
> +L(zfill_more_2x_vec):
> +	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> +	VMOVU	%VZERO, (VEC_SIZE)(%rdi)
> +	subq	$(CHAR_PER_VEC * 4 - 1), %rdx
> +	jbe	L(zfill_done)
> +
> +# ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rdx
> +# else
> +	addq	%rdi, %rdx
> +# endif
> +
> +	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi)
> +	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi)
> +
> +
> +	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> +	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> +
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpq	%rdi, %rdx
> +	jbe	L(zfill_done)
> +
> +	/* Align rdi and zfill loop.  */
> +	andq	$-(VEC_SIZE), %rdi
> +	.p2align 4,, 12
> +L(zfill_loop_4x_vec):
> +	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
> +	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
> +	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
> +	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpq	%rdi, %rdx
> +	ja	L(zfill_loop_4x_vec)
> +L(zfill_done):
> +	ret
> +
> +
> +	/* Less 1x VEC case if we are not using evex masked store.  */
> +# if !USE_EVEX_MASKED_STORE
> +	.p2align 4,, 8
> +L(copy_1x):
> +	/* Special case for copy 1x. It can be handled quickly and many
> +	   buffer sizes have convenient alignment.  */
> +	VMOVU	%VMM(0), (%rdi)
> +	/* If no zeros then we are done.  */
> +	testl	%ecx, %ecx
> +	jz	L(ret_1x_1x)
> +
> +	/* Need to zfill. Note we know that length <= CHAR_PER_VEC so we
> +	   only handle the small case here.  */
> +	bsf	%VRCX, %VRCX
> +L(zfill_less_vec_no_bsf):
> +	/* Adjust length / dst then just zfill less_vec.  */
> +	subq	%rcx, %rdx
> +#  ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +#  else
> +	addq	%rcx, %rdi
> +#  endif
> +#  ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#  endif
> +
> +L(zfill_less_vec):
> +	cmpl	$((VEC_SIZE / 2) / CHAR_SIZE), %edx
> +	jb	L(zfill_less_half)
> +
> +	VMOVU	%VZERO_HALF, (%rdi)
> +	VMOVU	%VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  ifdef USE_AS_STPCPY
> +L(ret_1x_1x):
> +	leaq	CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
> +	ret
> +#  endif
> +
> +
> +#  if VEC_SIZE == 64
> +	.p2align 4,, 4
> +L(copy_32_63):
> +	/* Overfill to avoid branches.  */
> +	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +	VMOVU	%VMM_256(0), (%rdi)
> +	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +
> +	/* We are taking advantage of the fact that to be here we must
> +	   be writing the null-term at (%rdi, %rcx), so we have a byte
> +	   of leeway for overwriting.  */
> +	cmpl	%ecx, %edx
> +	ja	L(zfill_less_vec_no_bsf)
> +#   ifndef USE_AS_STPCPY
> +L(ret_1x_1x):
> +#   else
> +#    ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#    endif
> +#   endif
> +	ret
> +#  endif
> +
> +	.p2align 4,, 4
> +L(copy_16_31):
> +	/* Overfill to avoid branches.  */
> +	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +	VMOVU	%VMM_128(0), (%rdi)
> +	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	cmpl	%ecx, %edx
> +
> +	/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
> +	   we have a larger copy block for 32-63 so this just falls
> +	   through to zfill 16-31. If VEC_SIZE == 32 then we check for
> +	   full zfill of less 1x VEC.  */
> +#  if VEC_SIZE == 64
> +	jbe	L(ret_16_31)
> +	subl	%ecx, %edx
> +#   ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +#   else
> +	addq	%rcx, %rdi
> +#   endif
> +#   ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#   endif
> +L(zfill_less_half):
> +L(zfill_less_32):
> +	cmpl	$(16 / CHAR_SIZE), %edx
> +	jb	L(zfill_less_16)
> +	VMOVU	%VZERO_128, (%rdi)
> +	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +#   ifdef USE_AS_STPCPY
> +	ret
> +#   endif
> +L(ret_16_31):
> +#   ifdef USE_AS_STPCPY
> +#    ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#    endif
> +#   endif
> +	ret
> +#  else
> +	/* VEC_SIZE == 32 begins.  */
> +	ja	L(zfill_less_vec_no_bsf)
> +#   ifndef USE_AS_STPCPY
> +L(ret_1x_1x):
> +#   else
> +#    ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#    endif
> +#   endif
> +	ret
> +#  endif
> +
> +
> +	.p2align 4,, 4
> +L(copy_8_15):
> +	/* Overfill to avoid branches.  */
> +	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> +	vmovq	%VMM_128(0), (%rdi)
> +	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_8_15)
> +	subl	%ecx, %edx
> +#  ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +#  else
> +	addq	%rcx, %rdi
> +#  endif
> +#  ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#  endif
> +	.p2align 4,, 8
> +#  if VEC_SIZE == 32
> +L(zfill_less_half):
> +#  endif
> +L(zfill_less_16):
> +	xorl	%ecx, %ecx
> +	cmpl	$(8 / CHAR_SIZE), %edx
> +	jb	L(zfill_less_8)
> +	movq	%rcx, (%rdi)
> +	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +#  ifndef USE_AS_STPCPY
> +L(ret_8_15):
> +#  endif
> +	ret
> +
> +	.p2align 4,, 8
> +L(less_1x_vec):
> +	je	L(copy_1x)
> +
> +	/* We will need `tzcnt` result for all other copy sizes.  */
> +	tzcnt	%VRCX, %VRCX
> +#  if VEC_SIZE == 64
> +	cmpl	$(32 / CHAR_SIZE), %edx
> +	jae	L(copy_32_63)
> +#  endif
> +
> +	cmpl	$(16 / CHAR_SIZE), %edx
> +	jae	L(copy_16_31)
> +
> +	cmpl	$(8 / CHAR_SIZE), %edx
> +	jae	L(copy_8_15)
> +#  ifdef USE_AS_WCSCPY
> +	testl	%ecx, %ecx
> +	jz	L(zfill_less_8_set_ret)
> +
> +	movl	(%rsi, %rdx, CHAR_SIZE), %esi
> +	vmovd	%VMM_128(0), (%rdi)
> +	movl	%esi, (%rdi, %rdx, CHAR_SIZE)
> +#   ifdef USE_AS_STPCPY
> +	cmpl	%ecx, %edx
> +L(ret_8_15):
> +	adcq	$0, %rdx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#   endif
> +	ret
> +L(zfill_less_8_set_ret):
> +	xorl	%ecx, %ecx
> +#   ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#   endif
> +L(zfill_less_8):
> +	movl	%ecx, (%rdi)
> +	movl	%ecx, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  else
> +	cmpl	$3, %edx
> +	jb	L(copy_0_3)
> +	/* Overfill to avoid branches.  */
> +	movl	-3(%rsi, %rdx), %esi
> +	vmovd	%VMM_128(0), (%rdi)
> +	movl	%esi, -3(%rdi, %rdx)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_4_7)
> +	subq	%rcx, %rdx
> +	addq	%rcx, %rdi
> +#   ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#   endif
> +	xorl	%ecx, %ecx
> +	.p2align 4,, 8
> +L(zfill_less_8):
> +	cmpl	$3, %edx
> +	jb	L(zfill_less_3)
> +	movl	%ecx, (%rdi)
> +	movl	%ecx, -3(%rdi, %rdx)
> +#   ifdef USE_AS_STPCPY
> +	ret
> +#   endif
> +
> +L(ret_4_7):
> +#   ifdef USE_AS_STPCPY
> +L(ret_8_15):
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#   endif
> +	ret
> +
> +	.p2align 4,, 4
> +L(zfill_less_3):
> +	testl	%edx, %edx
> +	jz	L(zfill_1)
> +	movw	%cx, (%rdi)
> +L(zfill_1):
> +	movb	%cl, (%rdi, %rdx)
> +	ret
> +
> +	.p2align 4,, 8
> +L(copy_0_3):
> +	vmovd	%VMM_128(0), %r8d
> +	testl	%edx, %edx
> +	jz	L(copy_1)
> +	movw	%r8w, (%rdi)
> +	cmpl	%ecx, %edx
> +	ja	L(zfill_from_1)
> +	movzbl	(%rsi, %rdx), %r8d
> +#   ifdef USE_AS_STPCPY
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +	movb	%r8b, (%rdi, %rdx)
> +	ret
> +#   endif
> +
> +L(copy_1):
> +#   ifdef USE_AS_STPCPY
> +	movl	%edx, %eax
> +	cmpl	%ecx, %edx
> +	adcq	%rdi, %rax
> +#   endif
> +#   ifdef USE_AS_WCSCPY
> +	vmovd	%VMM_128(0), (%rdi)
> +#   else
> +	movb	%r8b, (%rdi, %rdx)
> +#   endif
> +	ret
> +#  endif
> +
> +
> +#  ifndef USE_AS_WCSCPY
> +	.p2align 4,, 8
> +L(zfill_from_1):
> +#   ifdef USE_AS_STPCPY
> +	leaq	(%rdi, %rcx), %rax
> +#   endif
> +	movw	$0, -1(%rdi, %rdx)
> +	ret
> +#  endif
> +
> +	.p2align 4,, 4
> +L(zero_len):
> +	incq	%rdx
> +	jne	L(best_effort_strncpy)
> +	movq	%rdi, %rax
> +	ret
> +# endif
> +
> +
> +	.p2align 4,, 4
> +	.p2align 6,, 8
> +L(page_cross):
> +	movq	%rsi, %rax
> +	andq	$(VEC_SIZE * -1), %rax
> +	VPCMPEQ	(%rax), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +# ifdef USE_AS_WCSCPY
> +	movl	%esi, %r8d
> +	shrl	$2, %r8d
> +	andl	$(CHAR_PER_VEC - 1), %r8d
> +	shrx	%VR8, %VRCX, %VRCX
> +# else
> +	shrx	%VRSI, %VRCX, %VRCX
> +# endif
> +
> +	/* Compute amount of bytes we checked.  */
> +	subl	%esi, %eax
> +	andl	$(VEC_SIZE - 1), %eax
> +# ifdef USE_AS_WCSCPY
> +	shrl	$2, %eax
> +# endif
> +
> +	/* If rax > rdx then we are finishing the copy at the end of the
> +	   page.  */
> +	cmpq	%rax, %rdx
> +	jb	L(page_cross_small)
> +
> +
> +	/* If rcx is non-zero then continue.  */
> +	test	%VRCX, %VRCX
> +	jz	L(page_cross_continue)
> +
> +	/* We found a zero CHAR so we need to copy then zfill (we know
> +	   we didn't cover all of the length here).  */
> +	bsf	%VRCX, %VRCX
> +L(movsb_and_zfill):
> +	incl	%ecx
> +	subq	%rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +	leaq	-CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
> +# else
> +	movq	%rdi, %rax
> +# endif
> +
> +	REP_MOVS
> +# ifdef USE_AS_WCSCPY
> +	movl	$0, (%rdi)
> +# else
> +	movb	$0, (%rdi)
> +# endif
> +	jmp	L(zfill_from_page_cross)
> +
> +L(page_cross_small):
> +	tzcnt	%VRCX, %VRCX
> +	cmpl	%ecx, %edx
> +	jbe	L(page_cross_copy_only)
> +
> +	/* Do a zfill of the tail before copying.  */
> +	movq	%rdi, %r9
> +	xorl	%eax, %eax
> +
> +	movl	%ecx, %r8d
> +
> +	subl	%ecx, %edx
> +	leaq	CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> +	movl	%edx, %ecx
> +	REP_STOS
> +	movq	%r9, %rdi
> +	movl	%r8d, %edx
> +L(page_cross_copy_only):
> +	leal	1(%rdx), %ecx
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	adcl	$0, %edx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# else
> +	movq	%rdi, %rax
> +# endif
> +	REP_MOVS
> +	ret
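
(The L(page_cross) entry above uses the usual trick of rounding the first
load down to a vector boundary so it cannot cross into the next page, then
shifting the zero mask by the misalignment.  A rough illustration with AVX2
intrinsics rather than the EVEX mask registers the patch uses:)

    #include <immintrin.h>
    #include <stdint.h>

    /* Zero-byte mask for s[0 .. 31 - (s % 32)]: the aligned load cannot
       cross into the next page, and the shift lines bit 0 up with s[0].
       The caller must handle the case where no bit is set (no null seen
       before the next 32-byte boundary).  */
    static uint32_t
    zero_mask_page_safe (const char *s)
    {
      uintptr_t off = (uintptr_t) s & 31;
      const __m256i *aligned = (const __m256i *) ((uintptr_t) s - off);
      __m256i v = _mm256_load_si256 (aligned);
      __m256i z = _mm256_cmpeq_epi8 (v, _mm256_setzero_si256 ());
      uint32_t m = (uint32_t) _mm256_movemask_epi8 (z);
      return m >> off;
    }
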
> +
> +
> +L(best_effort_strncpy):
> +	movq	%rdx, %rcx
> +	xorl	%eax, %eax
> +	movq	%rdi, %r8
> +	/* The length is >= 2^63. We very much expect to segfault at
> +	   rep stos. If that doesn't happen then just strcpy to finish.
> +	 */
> +	REP_STOS
> +	movq	%r8, %rdi
> +	jmp	OVERFLOW_STRCPY
> +END(STRNCPY)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> new file mode 100644
> index 0000000000..d5ff4cbe50
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h

Please add a copyright notice.

> @@ -0,0 +1,65 @@
> +#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
> +#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
> +
> +#if defined USE_MULTIARCH && IS_IN(libc)
> +#  define UNDERSCORES __
> +#  ifdef USE_WITH_SSE2
> +#    define ISA_EXT _sse2
> +#  elif defined USE_WITH_AVX
> +#    ifdef USE_WITH_RTM
> +#      define ISA_EXT _avx_rtm
> +#    else
> +#      define ISA_EXT _avx
> +#    endif
> +#  elif defined USE_WITH_AVX2

Do we have a function with both AVX and AVX2 versions? If not, we should
keep just one.

> +#    ifdef USE_WITH_RTM
> +#      define ISA_EXT _avx2_rtm
> +#    else
> +#      define ISA_EXT _avx2
> +#    endif
> +
> +#  elif defined USE_WITH_EVEX256
> +#    define ISA_EXT _evex
> +#  elif defined USE_WITH_EVEX512
> +#    define ISA_EXT _evex512
> +#  endif
> +#else
> +#  define UNDERSCORES
> +#  define ISA_EXT
> +#endif
> +
> +#ifdef USE_AS_WCSCPY
> +#  define STRCPY_PREFIX wc
> +#  define STRCAT_PREFIX wcs
> +#  ifdef USE_AS_STPCPY
> +#    define STRCPY_POSTFIX pcpy
> +#  else
> +#    define STRCPY_POSTFIX scpy
> +#  endif
> +#else
> +#  define STRCPY_PREFIX st
> +#  define STRCAT_PREFIX str
> +#  ifdef USE_AS_STPCPY
> +#    define STRCPY_POSTFIX pcpy
> +#  else
> +#    define STRCPY_POSTFIX rcpy
> +#  endif
> +#endif
> +#define STRCAT_POSTFIX cat
> +
> +#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
> +  underscores##prefix##postfix##ext
> +
> +#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
> +
> +#ifndef OVERFLOW_STRCPY
> +#  define OVERFLOW_STRCPY                                                     \
> +    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
> +#endif
> +
> +#ifndef OVERFLOW_STRCAT
> +#  define OVERFLOW_STRCAT                                                     \
> +    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
> +#endif
> +
> +#endif
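
(Side note on the OF_NAMER indirection above: the extra expansion level is
the standard preprocessor trick so that the macro arguments are expanded
before ## pastes them.  A standalone C demo of the pattern, using made-up
names rather than the glibc ones:)

    #define PASTE_DIRECT(a, b) a##b
    #define PASTE(a, b) PASTE_DIRECT (a, b)

    #define PREFIX __str
    #define EXT _evex

    /* Declares __str_evex: PREFIX and EXT are expanded first, then
       pasted.  PASTE_DIRECT (PREFIX, EXT) would instead produce the
       single token PREFIXEXT.  */
    extern void PASTE (PREFIX, EXT) (void);
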
> -- 
> 2.34.1
>  

H.J.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-04 21:46     ` H.J. Lu
@ 2022-11-04 22:27       ` Noah Goldstein
  2022-11-04 22:47         ` H.J. Lu
  0 siblings, 1 reply; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 22:27 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Nov 4, 2022 at 2:46 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Nov 04, 2022 at 01:13:11PM -0700, Noah Goldstein wrote:
> > Optimizations are:
> >     1. Use more overlapping stores to avoid branches.
> >     2. Reduce how unrolled the aligning copies are (this is more of a
> >        code-size save, it's a negative for some sizes in terms of
> >        perf).
> >     3. Improve the loop a bit (similar to what we do in strlen with
> >        2x vpminu + kortest instead of 3x vpminu + kmov + test).
> >     4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
> >        number that are taken.
> >
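(Aside: a minimal C sketch of the "overlapping stores" idea in item 1
above; the helper name and the 8..16 byte range are illustrative, not
taken from the patch.)

    #include <stdint.h>
    #include <string.h>

    /* Copy n bytes, 8 <= n <= 16: the second 8-byte move may overlap
       the first, so no branch on the exact length is needed.  */
    static void
    copy_8_16 (char *dst, const char *src, size_t n)
    {
      uint64_t head, tail;
      memcpy (&head, src, sizeof head);		/* first 8 bytes */
      memcpy (&tail, src + n - 8, sizeof tail);	/* last 8 bytes */
      memcpy (dst, &head, sizeof head);
      memcpy (dst + n - 8, &tail, sizeof tail);
    }
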
> > Performance Changes:
> >
> >     Times are from N = 10 runs of the benchmark suite and are
> >     reported as geometric mean of all ratios of
> >     New Implementation / Old Implementation.
> >
> >     stpcpy-evex      -> 0.922
> >     strcat-evex      -> 0.985
> >     strcpy-evex      -> 0.880
> >
> >     strncpy-evex     -> 0.831
> >     stpncpy-evex     -> 0.780
> >
> >     strncat-evex     -> 0.958
> >
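(For reference, the reported numbers are geometric means of the
per-benchmark new/old ratios; a small C sketch of that reduction, with the
array and length as placeholders:)

    #include <math.h>
    #include <stddef.h>

    /* Geometric mean of new/old time ratios: exp (mean (log (ratio))).  */
    static double
    geomean_of_ratios (const double *ratios, size_t n)
    {
      double sum_log = 0.0;
      for (size_t i = 0; i < n; i++)
        sum_log += log (ratios[i]);
      return exp (sum_log / (double) n);
    }
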
> > Code Size Changes:
> >     function         -> Bytes New / Bytes Old -> Ratio
> >
> >     strcat-evex      ->  819 / 1874 -> 0.437
> >     strcpy-evex      ->  700 / 1074 -> 0.652
> >     stpcpy-evex      ->  735 / 1094 -> 0.672
> >
> >     strncpy-evex     -> 1397 / 2611 -> 0.535
> >     stpncpy-evex     -> 1489 / 2691 -> 0.553
> >
> >     strncat-evex     -> 1184 / 2832 -> 0.418
> >
> > Notes:
> >     1. Because of the significant difference between the
> >        implementations they are split into three files.
> >
> >            strcpy-evex.S    -> strcpy, stpcpy, strcat
> >            strncpy-evex.S   -> strncpy
> >            strncat-evex.S   -> strncat
> >
> >        I couldn't find a way to merge them without making the
> >        ifdefs incredibly difficult to follow.
> >
> >     2. All implementations can be made evex512 by including
> >        "x86-evex512-vecs.h" at the top.
> >
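(As an illustration of note 2, an evex512 flavor would just be a thin
wrapper along these lines; the file and symbol names here are hypothetical
and not part of this patch:)

    /* Hypothetical strcpy-evex512.S: select 512-bit vectors first, pick
       a distinct symbol, then reuse the common strcpy-evex body.  */
    #include "x86-evex512-vecs.h"
    #define STRCPY	__strcpy_evex512
    #include "strcpy-evex.S"
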
> >     3. All implementations have an optional define:
> >         `USE_EVEX_MASKED_STORE`
> >        Setting to one uses evex-masked stores for handling short
> >        strings.  This saves code size and branches.  It's disabled
> >        for all implementations at the moment as there are some
> >        serious drawbacks to masked stores in certain cases, but
> >        that may be fixed on future architectures.
> >
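(For note 3, the masked-store short-string path boils down to something
like the following intrinsics sketch, assuming AVX512BW/VL, 32-byte
vectors, 32 readable bytes at src, and a null terminator within the first
vector; an illustration only, not the patch code:)

    #include <immintrin.h>

    /* Store only the string bytes plus the terminating null with a
       single masked store, so nothing past the terminator is written.  */
    static void
    copy_short_masked (char *dst, const char *src)
    {
      __m256i v = _mm256_loadu_si256 ((const __m256i *) src);
      /* Bit i set iff byte i is non-zero.  */
      __mmask32 nonzero = _mm256_test_epi8_mask (v, v);
      /* (x + 1) ^ x keeps the low run of ones plus the first zero bit,
         i.e. the string body and its null terminator.  */
      __mmask32 keep = nonzero ^ (__mmask32) (nonzero + 1);
      _mm256_mask_storeu_epi8 (dst, keep, v);
    }
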
> > Full check passes on x86-64 and build succeeds for all ISA levels w/
> > and w/o multiarch.
> > ---
> >  sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
> >  sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
> >  sysdeps/x86_64/multiarch/strcat-strlen-evex.S |  110 ++
> >  sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
> >  sysdeps/x86_64/multiarch/strncat-evex.S       |  525 ++++++-
> >  sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
> >  .../multiarch/strncpy-or-cat-overflow-def.h   |   65 +
> >  7 files changed, 2100 insertions(+), 1173 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> >
> > diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> > index 99ea76a372..3693491baa 100644
> > --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
> > +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> > @@ -3,6 +3,5 @@
> >  #endif
> >
> >  #define USE_AS_STPCPY
> > -#define USE_AS_STRNCPY
> > -#define STRCPY       STPNCPY
> > -#include "strcpy-evex.S"
> > +#define STRNCPY      STPNCPY
> > +#include "strncpy-evex.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
> > index 0e2df947e9..b4207b7889 100644
> > --- a/sysdeps/x86_64/multiarch/strcat-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strcat-evex.S
> > @@ -1,286 +1,7 @@
> > -/* strcat with 256-bit EVEX instructions.
> > -   Copyright (C) 2021-2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -#include <isa-level.h>
> > -
> > -#if ISA_SHOULD_BUILD (4)
> > -
> > -
> > -# include <sysdep.h>
> > -
> > -# ifndef STRCAT
> > -#  define STRCAT  __strcat_evex
> > -# endif
> > -
> > -# define VMOVU               vmovdqu64
> > -# define VMOVA               vmovdqa64
> > -
> > -/* zero register */
> > -# define XMMZERO     xmm16
> > -# define YMMZERO     ymm16
> > -# define YMM0                ymm17
> > -# define YMM1                ymm18
> > -
> > -# define USE_AS_STRCAT
> > -
> > -/* Number of bytes in a vector register */
> > -# define VEC_SIZE    32
> > -
> > -     .section .text.evex,"ax",@progbits
> > -ENTRY (STRCAT)
> > -     mov     %rdi, %r9
> > -# ifdef USE_AS_STRNCAT
> > -     mov     %rdx, %r8
> > -# endif
> > -
> > -     xor     %eax, %eax
> > -     mov     %edi, %ecx
> > -     and     $((VEC_SIZE * 4) - 1), %ecx
> > -     vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> > -     cmp     $(VEC_SIZE * 3), %ecx
> > -     ja      L(fourth_vector_boundary)
> > -     vpcmpb  $0, (%rdi), %YMMZERO, %k0
> > -     kmovd   %k0, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_first_vector)
> > -     mov     %rdi, %rax
> > -     and     $-VEC_SIZE, %rax
> > -     jmp     L(align_vec_size_start)
> > -L(fourth_vector_boundary):
> > -     mov     %rdi, %rax
> > -     and     $-VEC_SIZE, %rax
> > -     vpcmpb  $0, (%rax), %YMMZERO, %k0
> > -     mov     $-1, %r10d
> > -     sub     %rax, %rcx
> > -     shl     %cl, %r10d
> > -     kmovd   %k0, %edx
> > -     and     %r10d, %edx
> > -     jnz     L(exit)
> > -
> > -L(align_vec_size_start):
> > -     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> > -     kmovd   %k0, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_second_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > -     kmovd   %k1, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_third_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > -     kmovd   %k2, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_fourth_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > -     kmovd   %k3, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_fifth_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > -     add     $(VEC_SIZE * 4), %rax
> > -     kmovd   %k4, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_second_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > -     kmovd   %k1, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_third_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > -     kmovd   %k2, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_fourth_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > -     kmovd   %k3, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_fifth_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > -     kmovd   %k4, %edx
> > -     add     $(VEC_SIZE * 4), %rax
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_second_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > -     kmovd   %k1, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_third_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > -     kmovd   %k2, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_fourth_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > -     kmovd   %k3, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_fifth_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > -     add     $(VEC_SIZE * 4), %rax
> > -     kmovd   %k4, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_second_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > -     kmovd   %k1, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_third_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > -     kmovd   %k2, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_fourth_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > -     kmovd   %k3, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_fifth_vector)
> > -
> > -     test    $((VEC_SIZE * 4) - 1), %rax
> > -     jz      L(align_four_vec_loop)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > -     add     $(VEC_SIZE * 5), %rax
> > -     kmovd   %k4, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit)
> > -
> > -     test    $((VEC_SIZE * 4) - 1), %rax
> > -     jz      L(align_four_vec_loop)
> > -
> > -     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> > -     add     $VEC_SIZE, %rax
> > -     kmovd   %k0, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit)
> > -
> > -     test    $((VEC_SIZE * 4) - 1), %rax
> > -     jz      L(align_four_vec_loop)
> > -
> > -     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> > -     add     $VEC_SIZE, %rax
> > -     kmovd   %k0, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit)
> > -
> > -     test    $((VEC_SIZE * 4) - 1), %rax
> > -     jz      L(align_four_vec_loop)
> > -
> > -     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k1
> > -     add     $VEC_SIZE, %rax
> > -     kmovd   %k1, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit)
> > -
> > -     add     $VEC_SIZE, %rax
> > -
> > -     .p2align 4
> > -L(align_four_vec_loop):
> > -     VMOVA   (%rax), %YMM0
> > -     VMOVA   (VEC_SIZE * 2)(%rax), %YMM1
> > -     vpminub VEC_SIZE(%rax), %YMM0, %YMM0
> > -     vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
> > -     vpminub %YMM0, %YMM1, %YMM0
> > -     /* If K0 != 0, there is a null byte.  */
> > -     vpcmpb  $0, %YMM0, %YMMZERO, %k0
> > -     add     $(VEC_SIZE * 4), %rax
> > -     ktestd  %k0, %k0
> > -     jz      L(align_four_vec_loop)
> > -
> > -     vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
> > -     sub     $(VEC_SIZE * 5), %rax
> > -     kmovd   %k0, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_second_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > -     kmovd   %k1, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_third_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > -     kmovd   %k2, %edx
> > -     test    %edx, %edx
> > -     jnz     L(exit_null_on_fourth_vector)
> > -
> > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > -     kmovd   %k3, %edx
> > -     sub     %rdi, %rax
> > -     bsf     %rdx, %rdx
> > -     add     %rdx, %rax
> > -     add     $(VEC_SIZE * 4), %rax
> > -     jmp     L(StartStrcpyPart)
> > -
> > -     .p2align 4
> > -L(exit):
> > -     sub     %rdi, %rax
> > -L(exit_null_on_first_vector):
> > -     bsf     %rdx, %rdx
> > -     add     %rdx, %rax
> > -     jmp     L(StartStrcpyPart)
> > -
> > -     .p2align 4
> > -L(exit_null_on_second_vector):
> > -     sub     %rdi, %rax
> > -     bsf     %rdx, %rdx
> > -     add     %rdx, %rax
> > -     add     $VEC_SIZE, %rax
> > -     jmp     L(StartStrcpyPart)
> > -
> > -     .p2align 4
> > -L(exit_null_on_third_vector):
> > -     sub     %rdi, %rax
> > -     bsf     %rdx, %rdx
> > -     add     %rdx, %rax
> > -     add     $(VEC_SIZE * 2), %rax
> > -     jmp     L(StartStrcpyPart)
> > -
> > -     .p2align 4
> > -L(exit_null_on_fourth_vector):
> > -     sub     %rdi, %rax
> > -     bsf     %rdx, %rdx
> > -     add     %rdx, %rax
> > -     add     $(VEC_SIZE * 3), %rax
> > -     jmp     L(StartStrcpyPart)
> > -
> > -     .p2align 4
> > -L(exit_null_on_fifth_vector):
> > -     sub     %rdi, %rax
> > -     bsf     %rdx, %rdx
> > -     add     %rdx, %rax
> > -     add     $(VEC_SIZE * 4), %rax
> > -
> > -     .p2align 4
> > -L(StartStrcpyPart):
> > -     lea     (%r9, %rax), %rdi
> > -     mov     %rsi, %rcx
> > -     mov     %r9, %rax      /* save result */
> > -
> > -# ifdef USE_AS_STRNCAT
> > -     test    %r8, %r8
> > -     jz      L(ExitZero)
> > -#  define USE_AS_STRNCPY
> > -# endif
> > -
> > -# include "strcpy-evex.S"
> > +#ifndef STRCAT
> > +# define STRCAT      __strcat_evex
> >  #endif
> > +
> > +#define USE_AS_STRCAT
> > +#define STRCPY       STRCAT
> > +#include "strcpy-evex.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> > new file mode 100644
> > index 0000000000..9530d7b683
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> > @@ -0,0 +1,110 @@
> > +/* strlen used for the beginning of str{n}cat using EVEX 256/512.
> > +   Copyright (C) 2011-2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +/* NOTE: This file is meant to be included by strcat-evex or
> > +   strncat-evex and does not stand alone.  Before including, %rdi
> > +   must be saved in %rax.  */
>
> Since this file isn't standalone, please rename it to .h.

Can it be .h.S so it plays well with IDE modes?
>
> > +
> > +
> > +/* Simple strlen implementation that ends at
> > +   L(strcat_strlen_done).  */
> > +     vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> > +     movq    %rdi, %r8
> > +     andq    $(VEC_SIZE * -1), %r8
> > +     VPCMPEQ (%r8), %VZERO, %k0
> > +     KMOV    %k0, %VRCX
> > +#ifdef USE_AS_WCSCPY
> > +     subl    %r8d, %edi
> > +     shrl    $2, %edi
> > +#endif
> > +     shrx    %VRDI, %VRCX, %VRCX
> > +#ifdef USE_AS_WCSCPY
> > +     movq    %rax, %rdi
> > +#endif
> > +     test    %VRCX, %VRCX
> > +     jnz     L(bsf_and_done_v0)
> > +
> > +
> > +     VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0
> > +     KMOV    %k0, %VRCX
> > +     leaq    (VEC_SIZE)(%r8), %rdi
> > +     test    %VRCX, %VRCX
> > +     jnz     L(bsf_and_done_v0)
> > +
> > +     VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(bsf_and_done_v1)
> > +
> > +     VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(bsf_and_done_v2)
> > +
> > +     VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(bsf_and_done_v3)
> > +
> > +     andq    $-(VEC_SIZE * 4), %rdi
> > +     .p2align 4,, 8
> > +L(loop_2x_vec):
> > +     VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(0)
> > +     VPMIN   (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
> > +     VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(2)
> > +     VPMIN   (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
> > +     VPTESTN %VMM(1), %VMM(1), %k1
> > +     VPTESTN %VMM(3), %VMM(3), %k3
> > +     subq    $(VEC_SIZE * -4), %rdi
> > +     KORTEST %k1, %k3
> > +     jz      L(loop_2x_vec)
> > +
> > +     VPTESTN %VMM(0), %VMM(0), %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(bsf_and_done_v0)
> > +
> > +     KMOV    %k1, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(bsf_and_done_v1)
> > +
> > +     VPTESTN %VMM(2), %VMM(2), %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(bsf_and_done_v2)
> > +
> > +     KMOV    %k3, %VRCX
> > +L(bsf_and_done_v3):
> > +     addq    $VEC_SIZE, %rdi
> > +L(bsf_and_done_v2):
> > +     bsf     %VRCX, %VRCX
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
> > +     jmp     L(strcat_strlen_done)
> > +
> > +     .p2align 4,, 4
> > +L(bsf_and_done_v1):
> > +     addq    $VEC_SIZE, %rdi
> > +L(bsf_and_done_v0):
> > +     bsf     %VRCX, %VRCX
> > +#ifdef USE_AS_WCSCPY
> > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > +#else
> > +     addq    %rcx, %rdi
> > +#endif
> > +L(strcat_strlen_done):
> > diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
> > index 82e45ac675..1ba0195ed2 100644
> > --- a/sysdeps/x86_64/multiarch/strcpy-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
> > @@ -1,4 +1,4 @@
> > -/* strcpy with 256-bit EVEX instructions.
> > +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
> >     Copyright (C) 2021-2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> > @@ -17,990 +17,526 @@
> >     <https://www.gnu.org/licenses/>.  */
> >
> >  #include <isa-level.h>
> > -
> >  #if ISA_SHOULD_BUILD (4)
> >
> >
> > -# ifndef USE_AS_STRCAT
> > -#  include <sysdep.h>
> > +     /* Use evex-masked stores for small sizes. Turned off at the
> > +        moment.  */
> > +# define USE_EVEX_MASKED_STORE       0
> > +     /* Use movsb in page cross case to save code size.  */
> > +# define USE_MOVSB_IN_PAGE_CROSS     1
> >
> > -#  ifndef STRCPY
> > -#   define STRCPY  __strcpy_evex
> > -#  endif
> > +# include <sysdep.h>
> >
> > +# ifndef VEC_SIZE
> > +#  include "x86-evex256-vecs.h"
> >  # endif
> >
> > -# define VMOVU               vmovdqu64
> > -# define VMOVA               vmovdqa64
> > -
> > -/* Number of bytes in a vector register */
> > -# ifndef VEC_SIZE
> > -#  define VEC_SIZE   32
> > +# ifndef STRCPY
> > +#  define STRCPY     __strcpy_evex
> >  # endif
> >
> > -# define XMM2                xmm18
> > -# define XMM3                xmm19
> >
> > -# define YMM2                ymm18
> > -# define YMM3                ymm19
> > -# define YMM4                ymm20
> > -# define YMM5                ymm21
> > -# define YMM6                ymm22
> > -# define YMM7                ymm23
> > +# ifdef USE_AS_WCSCPY
> > +#  define VMOVU_MASK vmovdqu32
> > +#  define VPMIN      vpminud
> > +#  define VPTESTN    vptestnmd
> > +#  define VPTEST     vptestmd
> > +#  define VPCMPEQ    vpcmpeqd
> > +#  define CHAR_SIZE  4
> >
> > -# ifndef USE_AS_STRCAT
> > +#  define REP_MOVS   rep movsd
> >
> > -/* zero register */
> > -#  define XMMZERO    xmm16
> > -#  define YMMZERO    ymm16
> > -#  define YMM1               ymm17
> > -
> > -     .section .text.evex,"ax",@progbits
> > -ENTRY (STRCPY)
> > -#  ifdef USE_AS_STRNCPY
> > -     mov     %RDX_LP, %R8_LP
> > -     test    %R8_LP, %R8_LP
> > -     jz      L(ExitZero)
> > -#  endif
> > -     mov     %rsi, %rcx
> > -#  ifndef USE_AS_STPCPY
> > -     mov     %rdi, %rax      /* save result */
> > -#  endif
> > +#  define USE_WIDE_CHAR
> > +# else
> > +#  define VMOVU_MASK vmovdqu8
> > +#  define VPMIN      vpminub
> > +#  define VPTESTN    vptestnmb
> > +#  define VPTEST     vptestmb
> > +#  define VPCMPEQ    vpcmpeqb
> > +#  define CHAR_SIZE  1
> >
> > -     vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> > +#  define REP_MOVS   rep movsb
> >  # endif
> >
> > -     and     $((VEC_SIZE * 4) - 1), %ecx
> > -     cmp     $(VEC_SIZE * 2), %ecx
> > -     jbe     L(SourceStringAlignmentLessTwoVecSize)
> > -
> > -     and     $-VEC_SIZE, %rsi
> > -     and     $(VEC_SIZE - 1), %ecx
> > -
> > -     vpcmpb  $0, (%rsi), %YMMZERO, %k0
> > -     kmovd   %k0, %edx
> > -     shr     %cl, %rdx
> > +# include "reg-macros.h"
> >
> > -# ifdef USE_AS_STRNCPY
> > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > -     mov     $VEC_SIZE, %r10
> > -     sub     %rcx, %r10
> > -     cmp     %r10, %r8
> > -#  else
> > -     mov     $(VEC_SIZE + 1), %r10
> > -     sub     %rcx, %r10
> > -     cmp     %r10, %r8
> > -#  endif
> > -     jbe     L(CopyVecSizeTailCase2OrCase3)
> > -# endif
> > -     test    %edx, %edx
> > -     jnz     L(CopyVecSizeTail)
> > -
> > -     vpcmpb  $0, VEC_SIZE(%rsi), %YMMZERO, %k1
> > -     kmovd   %k1, %edx
> >
> > -# ifdef USE_AS_STRNCPY
> > -     add     $VEC_SIZE, %r10
> > -     cmp     %r10, %r8
> > -     jbe     L(CopyTwoVecSizeCase2OrCase3)
> > -# endif
> > -     test    %edx, %edx
> > -     jnz     L(CopyTwoVecSize)
> > -
> > -     VMOVU   (%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
> > -     VMOVU   %YMM2, (%rdi)
> > -
> > -/* If source address alignment != destination address alignment */
> > -     .p2align 4
> > -L(UnalignVecSizeBoth):
> > -     sub     %rcx, %rdi
> > -# ifdef USE_AS_STRNCPY
> > -     add     %rcx, %r8
> > -     sbb     %rcx, %rcx
> > -     or      %rcx, %r8
> > -# endif
> > -     mov     $VEC_SIZE, %rcx
> > -     VMOVA   (%rsi, %rcx), %YMM2
> > -     VMOVU   %YMM2, (%rdi, %rcx)
> > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> > -     vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > -     kmovd   %k0, %edx
> > -     add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -     sub     $(VEC_SIZE * 3), %r8
> > -     jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -     test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     jnz     L(CopyVecSizeUnalignedVec2)
> > +# ifdef USE_AS_STPCPY
> > +#  define END_REG    rax
> >  # else
> > -     jnz     L(CopyVecSize)
> > +#  define END_REG    rdi, %rdx, CHAR_SIZE
> >  # endif
> >
> > -     VMOVU   %YMM2, (%rdi, %rcx)
> > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM3
> > -     vpcmpb  $0, %YMM3, %YMMZERO, %k0
> > -     kmovd   %k0, %edx
> > -     add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -     sub     $VEC_SIZE, %r8
> > -     jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -     test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     jnz     L(CopyVecSizeUnalignedVec3)
> > +# ifdef USE_AS_STRCAT
> > +#  define PAGE_ALIGN_REG     edx
> > +#  define PAGE_ALIGN_REG_64  rdx
> >  # else
> > -     jnz     L(CopyVecSize)
> > +#  define PAGE_ALIGN_REG     eax
> > +#  define PAGE_ALIGN_REG_64  rax
> >  # endif
> >
> > -     VMOVU   %YMM3, (%rdi, %rcx)
> > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM4
> > -     vpcmpb  $0, %YMM4, %YMMZERO, %k0
> > -     kmovd   %k0, %edx
> > -     add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -     sub     $VEC_SIZE, %r8
> > -     jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -     test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     jnz     L(CopyVecSizeUnalignedVec4)
> > -# else
> > -     jnz     L(CopyVecSize)
> > -# endif
> > +# define VZERO       VMM(7)
> > +# define VZERO_128   VMM_128(7)
> >
> > -     VMOVU   %YMM4, (%rdi, %rcx)
> > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> > -     vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > -     kmovd   %k0, %edx
> > -     add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -     sub     $VEC_SIZE, %r8
> > -     jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -     test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     jnz     L(CopyVecSizeUnalignedVec2)
> > -# else
> > -     jnz     L(CopyVecSize)
> > -# endif
> >
> > -     VMOVU   %YMM2, (%rdi, %rcx)
> > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> > -     vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > -     kmovd   %k0, %edx
> > -     add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -     sub     $VEC_SIZE, %r8
> > -     jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -     test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     jnz     L(CopyVecSizeUnalignedVec2)
> > -# else
> > -     jnz     L(CopyVecSize)
> > -# endif
> > +# define PAGE_SIZE   4096
> > +# define CHAR_PER_VEC        (VEC_SIZE / CHAR_SIZE)
> >
> > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM3
> > -     VMOVU   %YMM2, (%rdi, %rcx)
> > -     vpcmpb  $0, %YMM3, %YMMZERO, %k0
> > -     kmovd   %k0, %edx
> > -     add     $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > -     sub     $VEC_SIZE, %r8
> > -     jbe     L(CopyVecSizeCase2OrCase3)
> > -# endif
> > -     test    %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     jnz     L(CopyVecSizeUnalignedVec3)
> > -# else
> > -     jnz     L(CopyVecSize)
> > -# endif
> >
> > -     VMOVU   %YMM3, (%rdi, %rcx)
> > -     mov     %rsi, %rdx
> > -     lea     VEC_SIZE(%rsi, %rcx), %rsi
> > -     and     $-(VEC_SIZE * 4), %rsi
> > -     sub     %rsi, %rdx
> > -     sub     %rdx, %rdi
> > -# ifdef USE_AS_STRNCPY
> > -     lea     (VEC_SIZE * 8)(%r8, %rdx), %r8
> > -# endif
> > -L(UnalignedFourVecSizeLoop):
> > -     VMOVA   (%rsi), %YMM4
> > -     VMOVA   VEC_SIZE(%rsi), %YMM5
> > -     VMOVA   (VEC_SIZE * 2)(%rsi), %YMM6
> > -     VMOVA   (VEC_SIZE * 3)(%rsi), %YMM7
> > -     vpminub %YMM5, %YMM4, %YMM2
> > -     vpminub %YMM7, %YMM6, %YMM3
> > -     vpminub %YMM2, %YMM3, %YMM2
> > -     /* If K7 != 0, there is a null byte.  */
> > -     vpcmpb  $0, %YMM2, %YMMZERO, %k7
> > -     kmovd   %k7, %edx
> > -# ifdef USE_AS_STRNCPY
> > -     sub     $(VEC_SIZE * 4), %r8
> > -     jbe     L(UnalignedLeaveCase2OrCase3)
> > +     .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRCPY)
> > +# ifdef USE_AS_STRCAT
> > +     movq    %rdi, %rax
> > +#  include "strcat-strlen-evex.S"
> >  # endif
> > -     test    %edx, %edx
> > -     jnz     L(UnalignedFourVecSizeLeave)
> > -
> > -L(UnalignedFourVecSizeLoop_start):
> > -     add     $(VEC_SIZE * 4), %rdi
> > -     add     $(VEC_SIZE * 4), %rsi
> > -     VMOVU   %YMM4, -(VEC_SIZE * 4)(%rdi)
> > -     VMOVA   (%rsi), %YMM4
> > -     VMOVU   %YMM5, -(VEC_SIZE * 3)(%rdi)
> > -     VMOVA   VEC_SIZE(%rsi), %YMM5
> > -     vpminub %YMM5, %YMM4, %YMM2
> > -     VMOVU   %YMM6, -(VEC_SIZE * 2)(%rdi)
> > -     VMOVA   (VEC_SIZE * 2)(%rsi), %YMM6
> > -     VMOVU   %YMM7, -VEC_SIZE(%rdi)
> > -     VMOVA   (VEC_SIZE * 3)(%rsi), %YMM7
> > -     vpminub %YMM7, %YMM6, %YMM3
> > -     vpminub %YMM2, %YMM3, %YMM2
> > -     /* If K7 != 0, there is a null byte.  */
> > -     vpcmpb  $0, %YMM2, %YMMZERO, %k7
> > -     kmovd   %k7, %edx
> > -# ifdef USE_AS_STRNCPY
> > -     sub     $(VEC_SIZE * 4), %r8
> > -     jbe     L(UnalignedLeaveCase2OrCase3)
> > +
> > +     movl    %esi, %PAGE_ALIGN_REG
> > +     andl    $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> > +     ja      L(page_cross)
> > +L(page_cross_continue):
> > +     VMOVU   (%rsi), %VMM(0)
> > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > +     movq    %rdi, %rax
> >  # endif
> > -     test    %edx, %edx
> > -     jz      L(UnalignedFourVecSizeLoop_start)
> >
> > -L(UnalignedFourVecSizeLeave):
> > -     vpcmpb  $0, %YMM4, %YMMZERO, %k1
> > -     kmovd   %k1, %edx
> > -     test    %edx, %edx
> > -     jnz     L(CopyVecSizeUnaligned_0)
> >
> > -     vpcmpb  $0, %YMM5, %YMMZERO, %k2
> > -     kmovd   %k2, %ecx
> > -     test    %ecx, %ecx
> > -     jnz     L(CopyVecSizeUnaligned_16)
> > +     /* Two short string implementations. One with a traditional
> > +        branching approach and one with masked instructions (which
> > +        have potential for dramatically bad perf if dst splits a
> > +        page and is not in the TLB).  */
> > +# if USE_EVEX_MASKED_STORE
> > +     VPTEST  %VMM(0), %VMM(0), %k0
> > +     KMOV    %k0, %VRCX
> > +#  ifdef USE_AS_WCSCPY
> > +     subl    $((1 << CHAR_PER_VEC)- 1), %VRCX
> > +#  else
> > +     inc     %VRCX
> > +#  endif
> > +     jz      L(more_1x_vec)
> > +     KMOV    %VRCX, %k1
> > +     KXOR    %k0, %k1, %k1
> >
> > -     vpcmpb  $0, %YMM6, %YMMZERO, %k3
> > -     kmovd   %k3, %edx
> > -     test    %edx, %edx
> > -     jnz     L(CopyVecSizeUnaligned_32)
> > -
> > -     vpcmpb  $0, %YMM7, %YMMZERO, %k4
> > -     kmovd   %k4, %ecx
> > -     bsf     %ecx, %edx
> > -     VMOVU   %YMM4, (%rdi)
> > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > -     VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > -     lea     (VEC_SIZE * 3)(%rdi, %rdx), %rax
> > -# endif
> > -     VMOVU   %YMM7, (VEC_SIZE * 3)(%rdi)
> > -     add     $(VEC_SIZE - 1), %r8
> > -     sub     %rdx, %r8
> > -     lea     ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> > -     jmp     L(StrncpyFillTailWithZero)
> > -# else
> > -     add     $(VEC_SIZE * 3), %rsi
> > -     add     $(VEC_SIZE * 3), %rdi
> > -     jmp     L(CopyVecSizeExit)
> > -# endif
> > +     VMOVU_MASK %VMM(0), (%rdi){%k1}
> >
> > -/* If source address alignment == destination address alignment */
> > +#  ifdef USE_AS_STPCPY
> > +     bsf     %VRCX, %VRCX
> > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rax
> > +#  endif
> > +     ret
> >
> > -L(SourceStringAlignmentLessTwoVecSize):
> > -     VMOVU   (%rsi), %YMM3
> > -     VMOVU   VEC_SIZE(%rsi), %YMM2
> > -     vpcmpb  $0, %YMM3, %YMMZERO, %k0
> > -     kmovd   %k0, %edx
> > +# else
> > +     VPTESTN %VMM(0), %VMM(0), %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jz      L(more_1x_vec)
> >
> > -# ifdef USE_AS_STRNCPY
> > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > -     cmp     $VEC_SIZE, %r8
> > +     xorl    %edx, %edx
> > +     bsf     %VRCX, %VRDX
> > +#  ifdef USE_AS_STPCPY
> > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#  endif
> > +
> > +     /* Use mask bits in rcx to detect which copy we need. If the low
> > +        mask is zero then there must be a bit set in the upper half.
> > +        I.e. if rcx != 0 and ecx == 0, then the match must be in the
> > +        upper 32 bits so we use L(copy_32_63).  */
> > +#  if VEC_SIZE == 64
> > +#   ifdef USE_AS_WCSCPY
> > +     testb   %cl, %cl
> > +#   else
> > +     testl   %ecx, %ecx
> > +#   endif
> > +     jz      L(copy_32_63)
> > +#  endif
> > +
> > +#  ifdef USE_AS_WCSCPY
> > +     testb   $0xf, %cl
> >  #  else
> > -     cmp     $(VEC_SIZE + 1), %r8
> > +     testw   %cx, %cx
> >  #  endif
> > -     jbe     L(CopyVecSizeTail1Case2OrCase3)
> > -# endif
> > -     test    %edx, %edx
> > -     jnz     L(CopyVecSizeTail1)
> > +     jz      L(copy_16_31)
> >
> > -     VMOVU   %YMM3, (%rdi)
> > -     vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > -     kmovd   %k0, %edx
> >
> > -# ifdef USE_AS_STRNCPY
> > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > -     cmp     $(VEC_SIZE * 2), %r8
> > +#  ifdef USE_AS_WCSCPY
> > +     testb   $0x3, %cl
> >  #  else
> > -     cmp     $((VEC_SIZE * 2) + 1), %r8
> > +     testb   %cl, %cl
> >  #  endif
> > -     jbe     L(CopyTwoVecSize1Case2OrCase3)
> > -# endif
> > -     test    %edx, %edx
> > -     jnz     L(CopyTwoVecSize1)
> > -
> > -     and     $-VEC_SIZE, %rsi
> > -     and     $(VEC_SIZE - 1), %ecx
> > -     jmp     L(UnalignVecSizeBoth)
> > +     jz      L(copy_8_15)
> >
> > -/*------End of main part with loops---------------------*/
> >
> > -/* Case1 */
> > +#  ifdef USE_AS_WCSCPY
> > +     vmovd   %VMM_128(0), (%rdi)
> > +     /* No need to copy, we know it's zero.  */
> > +     movl    $0, (%END_REG)
> >
> > -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> > -     .p2align 4
> > -L(CopyVecSize):
> > -     add     %rcx, %rdi
> > -# endif
> > -L(CopyVecSizeTail):
> > -     add     %rcx, %rsi
> > -L(CopyVecSizeTail1):
> > -     bsf     %edx, %edx
> > -L(CopyVecSizeExit):
> > -     cmp     $32, %edx
> > -     jae     L(Exit32_63)
> > -     cmp     $16, %edx
> > -     jae     L(Exit16_31)
> > -     cmp     $8, %edx
> > -     jae     L(Exit8_15)
> > -     cmp     $4, %edx
> > -     jae     L(Exit4_7)
> > -     cmp     $3, %edx
> > -     je      L(Exit3)
> > -     cmp     $1, %edx
> > -     ja      L(Exit2)
> > -     je      L(Exit1)
> > -     movb    $0, (%rdi)
> > -# ifdef USE_AS_STPCPY
> > -     lea     (%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     sub     $1, %r8
> > -     lea     1(%rdi), %rdi
> > -     jnz     L(StrncpyFillTailWithZero)
> > -# endif
> >       ret
> > +#  else
> >
> > -     .p2align 4
> > -L(CopyTwoVecSize1):
> > -     add     $VEC_SIZE, %rsi
> > -     add     $VEC_SIZE, %rdi
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     sub     $VEC_SIZE, %r8
> > -# endif
> > -     jmp     L(CopyVecSizeTail1)
> > -
> > -     .p2align 4
> > -L(CopyTwoVecSize):
> > -     bsf     %edx, %edx
> > -     add     %rcx, %rsi
> > -     add     $VEC_SIZE, %edx
> > -     sub     %ecx, %edx
> > -     jmp     L(CopyVecSizeExit)
> > -
> > -     .p2align 4
> > -L(CopyVecSizeUnaligned_0):
> > -     bsf     %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > -     lea     (%rdi, %rdx), %rax
> > -# endif
> > -     VMOVU   %YMM4, (%rdi)
> > -     add     $((VEC_SIZE * 4) - 1), %r8
> > -     sub     %rdx, %r8
> > -     lea     1(%rdi, %rdx), %rdi
> > -     jmp     L(StrncpyFillTailWithZero)
> > -# else
> > -     jmp     L(CopyVecSizeExit)
> > -# endif
> > +     testb   $0x7, %cl
> > +     jz      L(copy_4_7)
> >
> > -     .p2align 4
> > -L(CopyVecSizeUnaligned_16):
> > -     bsf     %ecx, %edx
> > -     VMOVU   %YMM4, (%rdi)
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > -     lea     VEC_SIZE(%rdi, %rdx), %rax
> > -# endif
> > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > -     add     $((VEC_SIZE * 3) - 1), %r8
> > -     sub     %rdx, %r8
> > -     lea     (VEC_SIZE + 1)(%rdi, %rdx), %rdi
> > -     jmp     L(StrncpyFillTailWithZero)
> > -# else
> > -     add     $VEC_SIZE, %rsi
> > -     add     $VEC_SIZE, %rdi
> > -     jmp     L(CopyVecSizeExit)
> > -# endif
> >
> > -     .p2align 4
> > -L(CopyVecSizeUnaligned_32):
> > -     bsf     %edx, %edx
> > -     VMOVU   %YMM4, (%rdi)
> > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > -     lea     (VEC_SIZE * 2)(%rdi, %rdx), %rax
> > -# endif
> > -     VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > -     add     $((VEC_SIZE * 2) - 1), %r8
> > -     sub     %rdx, %r8
> > -     lea     ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> > -     jmp     L(StrncpyFillTailWithZero)
> > -# else
> > -     add     $(VEC_SIZE * 2), %rsi
> > -     add     $(VEC_SIZE * 2), %rdi
> > -     jmp     L(CopyVecSizeExit)
> > -# endif
> > +     test    %edx, %edx
> > +     jz      L(set_null_term)
> >
> > -# ifdef USE_AS_STRNCPY
> > -#  ifndef USE_AS_STRCAT
> > -     .p2align 4
> > -L(CopyVecSizeUnalignedVec6):
> > -     VMOVU   %YMM6, (%rdi, %rcx)
> > -     jmp     L(CopyVecSizeVecExit)
> > -
> > -     .p2align 4
> > -L(CopyVecSizeUnalignedVec5):
> > -     VMOVU   %YMM5, (%rdi, %rcx)
> > -     jmp     L(CopyVecSizeVecExit)
> > -
> > -     .p2align 4
> > -L(CopyVecSizeUnalignedVec4):
> > -     VMOVU   %YMM4, (%rdi, %rcx)
> > -     jmp     L(CopyVecSizeVecExit)
> > -
> > -     .p2align 4
> > -L(CopyVecSizeUnalignedVec3):
> > -     VMOVU   %YMM3, (%rdi, %rcx)
> > -     jmp     L(CopyVecSizeVecExit)
> > +     /* NB: make this `vmovw` if support for AVX512-FP16 is added.
> > +      */
> > +     vmovd   %VMM_128(0), %esi
> > +     movw    %si, (%rdi)
> > +
> > +     .p2align 4,, 1
> > +L(set_null_term):
> > +     /* No need to copy, we know it's zero.  */
> > +     movb    $0, (%END_REG)
> > +     ret
> >  #  endif
> >
> > -/* Case2 */
> > -
> > -     .p2align 4
> > -L(CopyVecSizeCase2):
> > -     add     $VEC_SIZE, %r8
> > -     add     %rcx, %rdi
> > -     add     %rcx, %rsi
> > -     bsf     %edx, %edx
> > -     cmp     %r8d, %edx
> > -     jb      L(CopyVecSizeExit)
> > -     jmp     L(StrncpyExit)
> > -
> > -     .p2align 4
> > -L(CopyTwoVecSizeCase2):
> > -     add     %rcx, %rsi
> > -     bsf     %edx, %edx
> > -     add     $VEC_SIZE, %edx
> > -     sub     %ecx, %edx
> > -     cmp     %r8d, %edx
> > -     jb      L(CopyVecSizeExit)
> > -     jmp     L(StrncpyExit)
> > -
> > -L(CopyVecSizeTailCase2):
> > -     add     %rcx, %rsi
> > -     bsf     %edx, %edx
> > -     cmp     %r8d, %edx
> > -     jb      L(CopyVecSizeExit)
> > -     jmp     L(StrncpyExit)
> > -
> > -L(CopyVecSizeTail1Case2):
> > -     bsf     %edx, %edx
> > -     cmp     %r8d, %edx
> > -     jb      L(CopyVecSizeExit)
> > -     jmp     L(StrncpyExit)
> > -
> > -/* Case2 or Case3,  Case3 */
> > -
> > -     .p2align 4
> > -L(CopyVecSizeCase2OrCase3):
> > -     test    %rdx, %rdx
> > -     jnz     L(CopyVecSizeCase2)
> > -L(CopyVecSizeCase3):
> > -     add     $VEC_SIZE, %r8
> > -     add     %rcx, %rdi
> > -     add     %rcx, %rsi
> > -     jmp     L(StrncpyExit)
> > -
> > -     .p2align 4
> > -L(CopyTwoVecSizeCase2OrCase3):
> > -     test    %rdx, %rdx
> > -     jnz     L(CopyTwoVecSizeCase2)
> > -     add     %rcx, %rsi
> > -     jmp     L(StrncpyExit)
> > -
> > -     .p2align 4
> > -L(CopyVecSizeTailCase2OrCase3):
> > -     test    %rdx, %rdx
> > -     jnz     L(CopyVecSizeTailCase2)
> > -     add     %rcx, %rsi
> > -     jmp     L(StrncpyExit)
> > -
> > -     .p2align 4
> > -L(CopyTwoVecSize1Case2OrCase3):
> > -     add     $VEC_SIZE, %rdi
> > -     add     $VEC_SIZE, %rsi
> > -     sub     $VEC_SIZE, %r8
> > -L(CopyVecSizeTail1Case2OrCase3):
> > -     test    %rdx, %rdx
> > -     jnz     L(CopyVecSizeTail1Case2)
> > -     jmp     L(StrncpyExit)
> > +#  if VEC_SIZE == 64
> > +     .p2align 4,, 6
> > +L(copy_32_63):
> > +     VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > +     VMOVU   %VMM_256(0), (%rdi)
> > +     VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
> > +     ret
> > +#  endif
> > +
> > +
> > +     .p2align 4,, 6
> > +L(copy_16_31):
> > +     /* Use xmm1 explicitly here as it won't require a `vzeroupper`
> > +        and will save code size.  */
> > +     vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > +     VMOVU   %VMM_128(0), (%rdi)
> > +     vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> > +     ret
> > +
> > +     .p2align 4,, 8
> > +L(copy_8_15):
> > +#  ifdef USE_AS_WCSCPY
> > +     movl    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> > +#  else
> > +     movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
> > +#  endif
> > +     vmovq   %VMM_128(0), (%rdi)
> > +     movq    %rcx, -(8 - CHAR_SIZE)(%END_REG)
> > +     ret
> >  # endif
> >
> > -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
> >
> > -     .p2align 4
> > -L(Exit1):
> > -     movzwl  (%rsi), %edx
> > -     mov     %dx, (%rdi)
> > -# ifdef USE_AS_STPCPY
> > -     lea     1(%rdi), %rax
> > +# ifndef USE_AS_WCSCPY
> > +     .p2align 4,, 12
> > +L(copy_4_7):
> > +     movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> > +     vmovd   %VMM_128(0), (%rdi)
> > +     movl    %ecx, -(4 - CHAR_SIZE)(%END_REG)
> > +     ret
> >  # endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     sub     $2, %r8
> > -     lea     2(%rdi), %rdi
> > -     jnz     L(StrncpyFillTailWithZero)
> > +
> > +
> > +     .p2align 4,, 8
> > +L(more_1x_vec):
> > +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > +     VMOVU   %VMM(0), (%rdi)
> >  # endif
> > -     ret
> > +     subq    %rsi, %rdi
> > +     andq    $-(VEC_SIZE), %rsi
> > +     addq    %rsi, %rdi
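> > +     /* rsi is now aligned down to VEC_SIZE and rdi was adjusted by
> > +        the same amount, so the aligned loads below pair with the
> > +        unaligned stores at matching offsets from rdi.  */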
> > +     VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> >
> > -     .p2align 4
> > -L(Exit2):
> > -     movzwl  (%rsi), %ecx
> > -     mov     %cx, (%rdi)
> > -     movb    $0, 2(%rdi)
> > +     /* Ideally we store after moves to minimize impact of potential
> > +        false-dependencies.  */
> > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > +     VMOVU   %VMM(0), (%rax)
> > +# endif
> > +
> > +     VPTESTN %VMM(1), %VMM(1), %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(ret_vec_x1)
> > +
> > +     VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > +     VMOVU   %VMM(1), VEC_SIZE(%rdi)
> > +
> > +     VPTESTN %VMM(2), %VMM(2), %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(ret_vec_x2)
> > +
> > +     VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> > +     VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > +
> > +     VPTESTN %VMM(3), %VMM(3), %k0
> > +     KMOV    %k0, %VRDX
> > +     test    %VRDX, %VRDX
> > +     jnz     L(ret_vec_x3)
> > +
> > +     VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > +     VPTESTN %VMM(4), %VMM(4), %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(ret_vec_x4)
> > +
> > +     VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> > +
> > +
> > +     /* Align for 4x loop.  */
> > +     subq    %rsi, %rdi
> > +
> > +     /* + VEC_SIZE * 5 because we never added the original VEC_SIZE
> > +        we covered before aligning.  */
> > +     subq    $-(VEC_SIZE * 5), %rsi
> > +     andq    $-(VEC_SIZE * 4), %rsi
> > +
> > +
> > +     /* Load first half of the loop before entry.  */
> > +     VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +     VPTESTN %VMM(4), %VMM(4), %k2
> > +     VPTESTN %VMM(6), %VMM(6), %k4
> > +     KORTEST %k2, %k4
> > +     jnz     L(loop_4x_done)
> > +
> > +     .p2align 4,, 11
> > +L(loop_4x_vec):
> > +
> > +     VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
> > +     VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > +     VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> > +     VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> > +
> > +     subq    $(VEC_SIZE * -4), %rsi
> > +
> > +     VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > +
> > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +     VPTESTN %VMM(4), %VMM(4), %k2
> > +     VPTESTN %VMM(6), %VMM(6), %k4
> > +     KORTEST %k2, %k4
> > +     jz      L(loop_4x_vec)
> > +
> > +L(loop_4x_done):
> > +     VPTESTN %VMM(0), %VMM(0), %k0
> > +     KMOV    %k0, %VRCX
> > +     /* Restore rdi (dst).  */
> > +     addq    %rsi, %rdi
> > +     test    %VRCX, %VRCX
> > +     jnz     L(ret_vec_x0_end)
> > +     VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> > +
> > +     KMOV    %k2, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(ret_vec_x1)
> > +     VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> > +
> > +     VPTESTN %VMM(2), %VMM(2), %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(ret_vec_x2)
> > +     VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> > +     /* Place L(ret_vec_x4) here to save code size.  We get a
> > +        meaningful benefit doing this for stpcpy.  */
> > +     KMOV    %k4, %VRDX
> > +L(ret_vec_x3):
> > +     bsf     %VRDX, %VRDX
> > +     VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > +     VMOVU   %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> >  # ifdef USE_AS_STPCPY
> > -     lea     2(%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     sub     $3, %r8
> > -     lea     3(%rdi), %rdi
> > -     jnz     L(StrncpyFillTailWithZero)
> > +     leaq    (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
> >  # endif
> > +L(return_end):
> >       ret
> >
> > -     .p2align 4
> > -L(Exit3):
> > -     mov     (%rsi), %edx
> > -     mov     %edx, (%rdi)
> > +     .p2align 4,, 6
> > +L(ret_vec_x0_end):
> > +     bsf     %VRCX, %VRCX
> >  # ifdef USE_AS_STPCPY
> > -     lea     3(%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     sub     $4, %r8
> > -     lea     4(%rdi), %rdi
> > -     jnz     L(StrncpyFillTailWithZero)
> > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rax
> >  # endif
> > +     inc     %VRCX
> > +     VMOVU   (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +     VMOVU   %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> >       ret
> >
> > -     .p2align 4
> > -L(Exit4_7):
> > -     mov     (%rsi), %ecx
> > -     mov     %ecx, (%rdi)
> > -     mov     -3(%rsi, %rdx), %ecx
> > -     mov     %ecx, -3(%rdi, %rdx)
> > +     .p2align 4,, 8
> > +L(ret_vec_x1):
> > +     bsf     %VRCX, %VRCX
> > +     VMOVU   (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +     VMOVU   %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> >  # ifdef USE_AS_STPCPY
> > -     lea     (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     sub     %rdx, %r8
> > -     sub     $1, %r8
> > -     lea     1(%rdi, %rdx), %rdi
> > -     jnz     L(StrncpyFillTailWithZero)
> > +     leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
> >  # endif
> >       ret
> >
> > -     .p2align 4
> > -L(Exit8_15):
> > -     mov     (%rsi), %rcx
> > -     mov     -7(%rsi, %rdx), %r9
> > -     mov     %rcx, (%rdi)
> > -     mov     %r9, -7(%rdi, %rdx)
> > +     .p2align 4,, 4
> > +L(ret_vec_x2):
> > +     bsf     %VRCX, %VRCX
> > +     VMOVU   ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +     VMOVU   %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> >  # ifdef USE_AS_STPCPY
> > -     lea     (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     sub     %rdx, %r8
> > -     sub     $1, %r8
> > -     lea     1(%rdi, %rdx), %rdi
> > -     jnz     L(StrncpyFillTailWithZero)
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
> >  # endif
> >       ret
> >
> > -     .p2align 4
> > -L(Exit16_31):
> > -     VMOVU   (%rsi), %XMM2
> > -     VMOVU   -15(%rsi, %rdx), %XMM3
> > -     VMOVU   %XMM2, (%rdi)
> > -     VMOVU   %XMM3, -15(%rdi, %rdx)
> > +     /* ret_vec_x3 reuses return code after the loop.  */
> > +     .p2align 4,, 6
> > +L(ret_vec_x4):
> > +     bsf     %VRCX, %VRCX
> > +     VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +     VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> >  # ifdef USE_AS_STPCPY
> > -     lea     (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     sub %rdx, %r8
> > -     sub $1, %r8
> > -     lea 1(%rdi, %rdx), %rdi
> > -     jnz L(StrncpyFillTailWithZero)
> > +     leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
> >  # endif
> >       ret
> >
> > -     .p2align 4
> > -L(Exit32_63):
> > -     VMOVU   (%rsi), %YMM2
> > -     VMOVU   -31(%rsi, %rdx), %YMM3
> > -     VMOVU   %YMM2, (%rdi)
> > -     VMOVU   %YMM3, -31(%rdi, %rdx)
> > -# ifdef USE_AS_STPCPY
> > -     lea     (%rdi, %rdx), %rax
> > +
> > +     .p2align 4,, 4
> > +L(page_cross):
> > +# ifndef USE_AS_STRCAT
> > +     vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> >  # endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -     sub     %rdx, %r8
> > -     sub     $1, %r8
> > -     lea     1(%rdi, %rdx), %rdi
> > -     jnz     L(StrncpyFillTailWithZero)
> > +     movq    %rsi, %rcx
> > +     andq    $(VEC_SIZE * -1), %rcx
> > +
> > +     VPCMPEQ (%rcx), %VZERO, %k0
> > +     KMOV    %k0, %VRCX
> > +# ifdef USE_AS_WCSCPY
> > +     andl    $(VEC_SIZE - 1), %PAGE_ALIGN_REG
> > +     shrl    $2, %PAGE_ALIGN_REG
> >  # endif
> > -     ret
> > +     shrx    %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
> >
> > -# ifdef USE_AS_STRNCPY
> > +# if USE_MOVSB_IN_PAGE_CROSS
> > +     /* Optimizing more aggressively for space as this is very cold
> > +        code. This saves 2x cache lines.  */
> >
> > -     .p2align 4
> > -L(StrncpyExit1):
> > -     movzbl  (%rsi), %edx
> > -     mov     %dl, (%rdi)
> > -#  ifdef USE_AS_STPCPY
> > -     lea     1(%rdi), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -     movb    $0, 1(%rdi)
> > +     /* This adds one to the later result, which gives the correct
> > +        copy bounds. NB: this can never zero-out a non-zero RCX as,
> > +        to be in the page cross case, rsi cannot be aligned and we
> > +        have already right-shifted rcx by the misalignment.  */
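> > +     /* E.g. with the null CHAR at (shifted) bit 3, rcx is 0b1000;
> > +        after the `shl` the lowest set bit is bit 4, so the `bsf`
> > +        below yields 4 and REP_MOVS copies three CHARs plus the
> > +        null terminator.  */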
> > +     shl     %VRCX
> > +     jz      L(page_cross_continue)
> > +#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > +     movq    %rdi, %rax
> >  #  endif
> > -     ret
> > +     bsf     %VRCX, %VRCX
> > +     REP_MOVS
> >
> > -     .p2align 4
> > -L(StrncpyExit2):
> > -     movzwl  (%rsi), %edx
> > -     mov     %dx, (%rdi)
> >  #  ifdef USE_AS_STPCPY
> > -     lea     2(%rdi), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -     movb    $0, 2(%rdi)
> > +     leaq    -CHAR_SIZE(%rdi), %rax
> >  #  endif
> >       ret
> >
> > -     .p2align 4
> > -L(StrncpyExit3_4):
> > -     movzwl  (%rsi), %ecx
> > -     movzwl  -2(%rsi, %r8), %edx
> > -     mov     %cx, (%rdi)
> > -     mov     %dx, -2(%rdi, %r8)
> > -#  ifdef USE_AS_STPCPY
> > -     lea     (%rdi, %r8), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -     movb    $0, (%rdi, %r8)
> > -#  endif
> > -     ret
> >
> > -     .p2align 4
> > -L(StrncpyExit5_8):
> > -     mov     (%rsi), %ecx
> > -     mov     -4(%rsi, %r8), %edx
> > -     mov     %ecx, (%rdi)
> > -     mov     %edx, -4(%rdi, %r8)
> > -#  ifdef USE_AS_STPCPY
> > -     lea     (%rdi, %r8), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -     movb    $0, (%rdi, %r8)
> > -#  endif
> > -     ret
> > +# else
> > +     /* Check if we found a zero-CHAR before the end of the page.  */
> > +     test    %VRCX, %VRCX
> > +     jz      L(page_cross_continue)
> >
> > -     .p2align 4
> > -L(StrncpyExit9_16):
> > -     mov     (%rsi), %rcx
> > -     mov     -8(%rsi, %r8), %rdx
> > -     mov     %rcx, (%rdi)
> > -     mov     %rdx, -8(%rdi, %r8)
> > -#  ifdef USE_AS_STPCPY
> > -     lea     (%rdi, %r8), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -     movb    $0, (%rdi, %r8)
> > -#  endif
> > -     ret
> > +     /* Traditional copy case, essentially the same as the non-
> > +        page-cross case, but since we can't reuse VMM(0) we need
> > +        twice as many loads from rsi.  */
> >
> > -     .p2align 4
> > -L(StrncpyExit17_32):
> > -     VMOVU   (%rsi), %XMM2
> > -     VMOVU   -16(%rsi, %r8), %XMM3
> > -     VMOVU   %XMM2, (%rdi)
> > -     VMOVU   %XMM3, -16(%rdi, %r8)
> > -#  ifdef USE_AS_STPCPY
> > -     lea     (%rdi, %r8), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -     movb    $0, (%rdi, %r8)
> > +#  ifndef USE_AS_STRCAT
> > +     xorl    %edx, %edx
> >  #  endif
> > -     ret
> > -
> > -     .p2align 4
> > -L(StrncpyExit33_64):
> > -     /*  0/32, 31/16 */
> > -     VMOVU   (%rsi), %YMM2
> > -     VMOVU   -VEC_SIZE(%rsi, %r8), %YMM3
> > -     VMOVU   %YMM2, (%rdi)
> > -     VMOVU   %YMM3, -VEC_SIZE(%rdi, %r8)
> > +     /* Dependency on rdi must already have been satisfied.  */
> > +     bsf     %VRCX, %VRDX
> >  #  ifdef USE_AS_STPCPY
> > -     lea     (%rdi, %r8), %rax
> > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#  elif !defined USE_AS_STRCAT
> > +     movq    %rdi, %rax
> >  #  endif
> > -#  ifdef USE_AS_STRCAT
> > -     movb    $0, (%rdi, %r8)
> > -#  endif
> > -     ret
> >
> > -     .p2align 4
> > -L(StrncpyExit65):
> > -     /* 0/32, 32/32, 64/1 */
> > -     VMOVU   (%rsi), %YMM2
> > -     VMOVU   32(%rsi), %YMM3
> > -     mov     64(%rsi), %cl
> > -     VMOVU   %YMM2, (%rdi)
> > -     VMOVU   %YMM3, 32(%rdi)
> > -     mov     %cl, 64(%rdi)
> > -#  ifdef USE_AS_STPCPY
> > -     lea     65(%rdi), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -     movb    $0, 65(%rdi)
> > +#  if VEC_SIZE == 64
> > +#   ifdef USE_AS_WCSCPY
> > +     testb   %cl, %cl
> > +#   else
> > +     test    %ecx, %ecx
> > +#   endif
> > +     jz      L(page_cross_copy_32_63)
> >  #  endif
> > -     ret
> > -
> > -#  ifndef USE_AS_STRCAT
> >
> > -     .p2align 4
> > -L(Fill1):
> > -     mov     %dl, (%rdi)
> > -     ret
> > +#  ifdef USE_AS_WCSCPY
> > +     testb   $0xf, %cl
> > +#  else
> > +     testw   %cx, %cx
> > +#  endif
> > +     jz      L(page_cross_copy_16_31)
> >
> > -     .p2align 4
> > -L(Fill2):
> > -     mov     %dx, (%rdi)
> > -     ret
> > +#  ifdef USE_AS_WCSCPY
> > +     testb   $0x3, %cl
> > +#  else
> > +     testb   %cl, %cl
> > +#  endif
> > +     jz      L(page_cross_copy_8_15)
> >
> > -     .p2align 4
> > -L(Fill3_4):
> > -     mov     %dx, (%rdi)
> > -     mov     %dx, -2(%rdi, %r8)
> > +#  ifdef USE_AS_WCSCPY
> > +     movl    (%rsi), %esi
> > +     movl    %esi, (%rdi)
> > +     movl    $0, (%END_REG)
> >       ret
> > +#  else
> >
> > -     .p2align 4
> > -L(Fill5_8):
> > -     mov     %edx, (%rdi)
> > -     mov     %edx, -4(%rdi, %r8)
> > -     ret
> > +     testb   $0x7, %cl
> > +     jz      L(page_cross_copy_4_7)
> >
> > -     .p2align 4
> > -L(Fill9_16):
> > -     mov     %rdx, (%rdi)
> > -     mov     %rdx, -8(%rdi, %r8)
> > +     test    %edx, %edx
> > +     jz      L(page_cross_set_null_term)
> > +     movzwl  (%rsi), %ecx
> > +     movw    %cx, (%rdi)
> > +L(page_cross_set_null_term):
> > +     movb    $0, (%END_REG)
> >       ret
> >
> > -     .p2align 4
> > -L(Fill17_32):
> > -     VMOVU   %XMMZERO, (%rdi)
> > -     VMOVU   %XMMZERO, -16(%rdi, %r8)
> > -     ret
> >
> > -     .p2align 4
> > -L(CopyVecSizeUnalignedVec2):
> > -     VMOVU   %YMM2, (%rdi, %rcx)
> > -
> > -     .p2align 4
> > -L(CopyVecSizeVecExit):
> > -     bsf     %edx, %edx
> > -     add     $(VEC_SIZE - 1), %r8
> > -     add     %rcx, %rdi
> > -#   ifdef USE_AS_STPCPY
> > -     lea     (%rdi, %rdx), %rax
> > -#   endif
> > -     sub     %rdx, %r8
> > -     lea     1(%rdi, %rdx), %rdi
> > -
> > -     .p2align 4
> > -L(StrncpyFillTailWithZero):
> > -     xor     %edx, %edx
> > -     sub     $VEC_SIZE, %r8
> > -     jbe     L(StrncpyFillExit)
> > -
> > -     VMOVU   %YMMZERO, (%rdi)
> > -     add     $VEC_SIZE, %rdi
> > -
> > -     mov     %rdi, %rsi
> > -     and     $(VEC_SIZE - 1), %esi
> > -     sub     %rsi, %rdi
> > -     add     %rsi, %r8
> > -     sub     $(VEC_SIZE * 4), %r8
> > -     jb      L(StrncpyFillLessFourVecSize)
> > -
> > -L(StrncpyFillLoopVmovdqa):
> > -     VMOVA   %YMMZERO, (%rdi)
> > -     VMOVA   %YMMZERO, VEC_SIZE(%rdi)
> > -     VMOVA   %YMMZERO, (VEC_SIZE * 2)(%rdi)
> > -     VMOVA   %YMMZERO, (VEC_SIZE * 3)(%rdi)
> > -     add     $(VEC_SIZE * 4), %rdi
> > -     sub     $(VEC_SIZE * 4), %r8
> > -     jae     L(StrncpyFillLoopVmovdqa)
> > -
> > -L(StrncpyFillLessFourVecSize):
> > -     add     $(VEC_SIZE * 2), %r8
> > -     jl      L(StrncpyFillLessTwoVecSize)
> > -     VMOVA   %YMMZERO, (%rdi)
> > -     VMOVA   %YMMZERO, VEC_SIZE(%rdi)
> > -     add     $(VEC_SIZE * 2), %rdi
> > -     sub     $VEC_SIZE, %r8
> > -     jl      L(StrncpyFillExit)
> > -     VMOVA   %YMMZERO, (%rdi)
> > -     add     $VEC_SIZE, %rdi
> > -     jmp     L(Fill)
> > -
> > -     .p2align 4
> > -L(StrncpyFillLessTwoVecSize):
> > -     add     $VEC_SIZE, %r8
> > -     jl      L(StrncpyFillExit)
> > -     VMOVA   %YMMZERO, (%rdi)
> > -     add     $VEC_SIZE, %rdi
> > -     jmp     L(Fill)
> > -
> > -     .p2align 4
> > -L(StrncpyFillExit):
> > -     add     $VEC_SIZE, %r8
> > -L(Fill):
> > -     cmp     $17, %r8d
> > -     jae     L(Fill17_32)
> > -     cmp     $9, %r8d
> > -     jae     L(Fill9_16)
> > -     cmp     $5, %r8d
> > -     jae     L(Fill5_8)
> > -     cmp     $3, %r8d
> > -     jae     L(Fill3_4)
> > -     cmp     $1, %r8d
> > -     ja      L(Fill2)
> > -     je      L(Fill1)
> > +     .p2align 4,, 4
> > +L(page_cross_copy_4_7):
> > +     movl    (%rsi), %ecx
> > +     movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
> > +     movl    %ecx, (%rdi)
> > +     movl    %esi, -(4 - CHAR_SIZE)(%END_REG)
> >       ret
> > -
> > -/* end of ifndef USE_AS_STRCAT */
> >  #  endif
> >
> > -     .p2align 4
> > -L(UnalignedLeaveCase2OrCase3):
> > -     test    %rdx, %rdx
> > -     jnz     L(UnalignedFourVecSizeLeaveCase2)
> > -L(UnalignedFourVecSizeLeaveCase3):
> > -     lea     (VEC_SIZE * 4)(%r8), %rcx
> > -     and     $-VEC_SIZE, %rcx
> > -     add     $(VEC_SIZE * 3), %r8
> > -     jl      L(CopyVecSizeCase3)
> > -     VMOVU   %YMM4, (%rdi)
> > -     sub     $VEC_SIZE, %r8
> > -     jb      L(CopyVecSizeCase3)
> > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > -     sub     $VEC_SIZE, %r8
> > -     jb      L(CopyVecSizeCase3)
> > -     VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > -     sub     $VEC_SIZE, %r8
> > -     jb      L(CopyVecSizeCase3)
> > -     VMOVU   %YMM7, (VEC_SIZE * 3)(%rdi)
> > -#  ifdef USE_AS_STPCPY
> > -     lea     (VEC_SIZE * 4)(%rdi), %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -     movb    $0, (VEC_SIZE * 4)(%rdi)
> > -#  endif
> > +#  if VEC_SIZE == 64
> > +     .p2align 4,, 4
> > +L(page_cross_copy_32_63):
> > +     VMOVU   (%rsi), %VMM_256(0)
> > +     VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > +     VMOVU   %VMM_256(0), (%rdi)
> > +     VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
> >       ret
> > -
> > -     .p2align 4
> > -L(UnalignedFourVecSizeLeaveCase2):
> > -     xor     %ecx, %ecx
> > -     vpcmpb  $0, %YMM4, %YMMZERO, %k1
> > -     kmovd   %k1, %edx
> > -     add     $(VEC_SIZE * 3), %r8
> > -     jle     L(CopyVecSizeCase2OrCase3)
> > -     test    %edx, %edx
> > -#  ifndef USE_AS_STRCAT
> > -     jnz     L(CopyVecSizeUnalignedVec4)
> > -#  else
> > -     jnz     L(CopyVecSize)
> > -#  endif
> > -     vpcmpb  $0, %YMM5, %YMMZERO, %k2
> > -     kmovd   %k2, %edx
> > -     VMOVU   %YMM4, (%rdi)
> > -     add     $VEC_SIZE, %rcx
> > -     sub     $VEC_SIZE, %r8
> > -     jbe     L(CopyVecSizeCase2OrCase3)
> > -     test    %edx, %edx
> > -#  ifndef USE_AS_STRCAT
> > -     jnz     L(CopyVecSizeUnalignedVec5)
> > -#  else
> > -     jnz     L(CopyVecSize)
> >  #  endif
> >
> > -     vpcmpb  $0, %YMM6, %YMMZERO, %k3
> > -     kmovd   %k3, %edx
> > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > -     add     $VEC_SIZE, %rcx
> > -     sub     $VEC_SIZE, %r8
> > -     jbe     L(CopyVecSizeCase2OrCase3)
> > -     test    %edx, %edx
> > -#  ifndef USE_AS_STRCAT
> > -     jnz     L(CopyVecSizeUnalignedVec6)
> > -#  else
> > -     jnz     L(CopyVecSize)
> > -#  endif
> > -
> > -     vpcmpb  $0, %YMM7, %YMMZERO, %k4
> > -     kmovd   %k4, %edx
> > -     VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > -     lea     VEC_SIZE(%rdi, %rcx), %rdi
> > -     lea     VEC_SIZE(%rsi, %rcx), %rsi
> > -     bsf     %edx, %edx
> > -     cmp     %r8d, %edx
> > -     jb      L(CopyVecSizeExit)
> > -L(StrncpyExit):
> > -     cmp     $65, %r8d
> > -     je      L(StrncpyExit65)
> > -     cmp     $33, %r8d
> > -     jae     L(StrncpyExit33_64)
> > -     cmp     $17, %r8d
> > -     jae     L(StrncpyExit17_32)
> > -     cmp     $9, %r8d
> > -     jae     L(StrncpyExit9_16)
> > -     cmp     $5, %r8d
> > -     jae     L(StrncpyExit5_8)
> > -     cmp     $3, %r8d
> > -     jae     L(StrncpyExit3_4)
> > -     cmp     $1, %r8d
> > -     ja      L(StrncpyExit2)
> > -     je      L(StrncpyExit1)
> > -#  ifdef USE_AS_STPCPY
> > -     mov     %rdi, %rax
> > -#  endif
> > -#  ifdef USE_AS_STRCAT
> > -     movb    $0, (%rdi)
> > -#  endif
> > +     .p2align 4,, 4
> > +L(page_cross_copy_16_31):
> > +     vmovdqu (%rsi), %xmm0
> > +     vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > +     vmovdqu %xmm0, (%rdi)
> > +     vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> >       ret
> >
> > -     .p2align 4
> > -L(ExitZero):
> > -#  ifndef USE_AS_STRCAT
> > -     mov     %rdi, %rax
> > -#  endif
> > +     .p2align 4,, 4
> > +L(page_cross_copy_8_15):
> > +     movq    (%rsi), %rcx
> > +     movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> > +     movq    %rcx, (%rdi)
> > +     movq    %rsi, -(8 - CHAR_SIZE)(%END_REG)
> >       ret
> > -
> > -# endif
> > -
> > -# ifndef USE_AS_STRCAT
> > -END (STRCPY)
> > -# else
> > -END (STRCAT)
> >  # endif
> > +END(STRCPY)
> >  #endif
> > diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
> > index 203a19bf21..d648ba5cfe 100644
> > --- a/sysdeps/x86_64/multiarch/strncat-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strncat-evex.S
> > @@ -1,7 +1,520 @@
> > -#ifndef STRNCAT
> > -# define STRNCAT     __strncat_evex
> > -#endif
> > +/* {wcs|str}ncat  with 256/512-bit EVEX.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if ISA_SHOULD_BUILD (4)
> > +
> > +     /* Use evex-masked stores for small sizes. Turned off at the
> > +        moment.  */
> > +# define USE_EVEX_MASKED_STORE       0
> > +
> > +# include <sysdep.h>
> > +
> > +# ifndef VEC_SIZE
> > +#  include "x86-evex256-vecs.h"
> > +# endif
> > +
> > +# ifndef STRNCAT
> > +#  define STRNCAT    __strncat_evex
> > +# endif
> > +
> > +
> > +# ifdef USE_AS_WCSCPY
> > +#  define movNULL    movl
> > +#  define VMOVU_MASK vmovdqu32
> > +#  define VPMIN      vpminud
> > +#  define VPTESTN    vptestnmd
> > +#  define VPTEST     vptestmd
> > +#  define VPCMPEQ    vpcmpeqd
> > +#  define CHAR_SIZE  4
> > +
> > +#  define REP_MOVS   rep movsd
> > +
> > +#  define VMASK_REG  VR10
> > +#  define FIND_FIRST_ONE(src, dst)   movl $CHAR_PER_VEC, %dst; bsf %src, %dst
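> > +     /* NB: this relies on `bsf` leaving its destination unmodified
> > +        when the source is zero, so an all-zero mask yields
> > +        CHAR_PER_VEC ("no zero-CHAR found"), matching what `tzcnt`
> > +        gives in the byte-mask case.  */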
> > +
> > +#  define USE_WIDE_CHAR
> > +# else
> > +#  define movNULL    movb
> > +#  define VMOVU_MASK vmovdqu8
> > +#  define VPMIN      vpminub
> > +#  define VPTESTN    vptestnmb
> > +#  define VPTEST     vptestmb
> > +#  define VPCMPEQ    vpcmpeqb
> > +#  define CHAR_SIZE  1
> > +
> > +#  define REP_MOVS   rep movsb
> > +
> > +#  define VMASK_REG  VRCX
> > +#  define FIND_FIRST_ONE(src, dst)   tzcnt %src, %dst
> > +
> > +# endif
> > +
> > +# include "strncpy-or-cat-overflow-def.h"
> > +
> > +# include "reg-macros.h"
> > +
> > +
> > +# define VZERO       VMM(7)
> > +# define VZERO_128   VMM_128(7)
> > +
> > +# define PAGE_SIZE   4096
> > +# define CHAR_PER_VEC        (VEC_SIZE / CHAR_SIZE)
> > +
> > +     .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRNCAT)
> > +     movq    %rdi, %rax
> > +
> > +     /* NB: It's safe to filter out zero-length strings WITHOUT
> > +        setting null-term. Destination MUST be a null-terminated
> > +        string so essentially the work is already done.  */
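> > +     /* For the wcsncat path, `len - 1` underflows for len == 0, so
> > +        the `shr $56` check catches both zero lengths and lengths
> > +        beyond the supported address space; L(zero_len) then returns
> > +        for the former and branches to OVERFLOW_STRCAT for the
> > +        latter.  */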
> > +# ifdef USE_AS_WCSCPY
> > +     leaq    -1(%rdx), %rcx
> > +     shrq    $56, %rcx
> > +     jnz     L(zero_len)
> > +# else
> > +     test    %rdx, %rdx
> > +     jle     L(zero_len)
> > +# endif
> > +
> > +# include "strcat-strlen-evex.S"
> > +
> > +     movl    %esi, %ecx
> > +     andl    $(PAGE_SIZE - 1), %ecx
> > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > +     ja      L(page_cross)
> > +L(page_cross_continue):
> > +     VMOVU   (%rsi), %VMM(0)
> > +     VPTESTN %VMM(0), %VMM(0), %k0
> > +
> > +     /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> > +        <= CHAR_PER_VEC with masked instructions (which have
> > +        potential for dramatically bad perf if dst splits a page and
> > +        is not in the TLB).  */
> > +# if USE_EVEX_MASKED_STORE
> > +     KMOV    %k0, %VRCX
> > +     FIND_FIRST_ONE (VRCX, VR8)
> > +     cmpq    %r8, %rdx
> > +     jbe     L(less_1x_vec)
> > +
> > +     test    %VRCX, %VRCX
> > +     jz      L(more_1x_vec)
> > +
> > +     blsmsk  %VRCX, %VRCX
> > +     KMOV    %VRCX, %k1
> > +     VMOVU_MASK %VMM(0), (%rdi){%k1}
> > +     ret
> > +
> > +L(less_1x_vec):
> > +     mov     $-1, %VRCX
> > +     bzhi    %VRDX, %VRCX, %VRCX
> > +     KMOV    %VRCX, %k1
> > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > +     VMOVU_MASK %VMM(0), (%rdi){%k1}
> > +
> > +     ret
> > +# else
> > +     KMOV    %k0, %VMASK_REG
> > +     /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
> > +        %VMASK_REG, %VRCX` for wcsncat.  */
> > +     FIND_FIRST_ONE (VMASK_REG, VRCX)
> > +     cmpq    %rcx, %rdx
> > +     jbe     L(less_1x_vec)
> > +
> > +     /* If there were no zero-CHARs (rcx was zero before
> > +        FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> > +     cmpl    $CHAR_PER_VEC, %ecx
> > +     je      L(more_1x_vec)
> > +
> > +     movl    %ecx, %edx
> > +
> > +L(less_1x_vec):
> > +#  if VEC_SIZE == 64
> > +     cmpl    $(32 / CHAR_SIZE), %edx
> > +     jae     L(copy_32_63)
> > +#  endif
> > +
> > +     cmpl    $(16 / CHAR_SIZE), %edx
> > +     jae     L(copy_16_31)
> > +
> > +
> > +     cmpl    $(8 / CHAR_SIZE), %edx
> > +     jae     L(copy_8_15)
> > +
> > +#  ifdef USE_AS_WCSCPY
> > +     vmovd   %VMM_128(0), (%rdi)
> > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > +     ret
> > +#  else
> > +
> > +     cmpl    $4, %edx
> > +     jae     L(copy_4_7)
> > +
> > +     movzbl  (%rsi), %ecx
> > +     cmpl    $1, %edx
> > +     jbe     L(set_null_term)
> > +
> > +     movzwl  1(%rsi), %esi
> > +     movw    %si, 1(%rdi)
> > +
> > +     .p2align 4,, 1
> > +L(set_null_term):
> > +     movb    %cl, (%rdi)
> > +     movNULL $0, (%rdi, %rdx)
> > +     ret
> > +#  endif
> > +
> > +#  if VEC_SIZE == 64
> > +     .p2align 4,, 6
> > +L(copy_32_63):
> > +     VMOVU   -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > +     VMOVU   %VMM_256(0), (%rdi)
> > +     VMOVU   %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
> > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > +     ret
> > +#  endif
> > +     .p2align 4,, 6
> > +L(copy_16_31):
> > +     /* Use xmm1 explicitly here as it won't require a `vzeroupper`
> > +        and will save code size.  */
> > +     vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > +     VMOVU   %VMM_128(0), (%rdi)
> > +     vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
> > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > +     ret
> > +
> > +     .p2align 4,, 2
> > +L(copy_8_15):
> > +     movq    -(8)(%rsi, %rdx, CHAR_SIZE), %rcx
> > +     vmovq   %VMM_128(0), (%rdi)
> > +     movq    %rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
> > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > +     ret
> > +
> > +#  ifndef USE_AS_WCSCPY
> > +     .p2align 4,, 12
> > +L(copy_4_7):
> > +     movl    -(4)(%rsi, %rdx, CHAR_SIZE), %ecx
> > +     vmovd   %VMM_128(0), (%rdi)
> > +     movl    %ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
> > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > +     ret
> > +#  endif
> > +
> > +# endif
> > +     .p2align 4,, 4
> > +L(zero_len):
> > +# ifdef USE_AS_WCSCPY
> > +     test    %rdx, %rdx
> > +# endif
> > +     jne     OVERFLOW_STRCAT
> > +     ret
> >
> > -#define USE_AS_STRNCAT
> > -#define STRCAT       STRNCAT
> > -#include "strcat-evex.S"
> > +     .p2align 4,, 8
> > +L(more_1x_vec):
> > +     VMOVU   %VMM(0), (%rdi)
> > +
> > +     /* We are going to align rsi here so we will need to be able to
> > +        re-adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> > +        so rsi + rdx * CHAR_SIZE cannot overflow.  */
> > +
> > +     leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> > +     subq    %rsi, %rdi
> > +     andq    $-(VEC_SIZE), %rsi
> > +L(loop_last_4x_vec):
> > +     addq    %rsi, %rdi
> > +     subq    %rsi, %rdx
> > +# ifdef USE_AS_WCSCPY
> > +     shrq    $2, %rdx
> > +# endif
> > +
> > +     /* Will need this regardless.  */
> > +     VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> > +     VPTESTN %VMM(1), %VMM(1), %k0
> > +     KMOV    %k0, %VMASK_REG
> > +
> > +     cmpq    $(CHAR_PER_VEC * 2), %rdx
> > +     ja      L(more_2x_vec)
> > +
> > +L(last_2x_vec):
> > +     FIND_FIRST_ONE (VMASK_REG, VRCX)
> > +     cmpl    %ecx, %edx
> > +     jbe     L(ret_vec_x1_len)
> > +
> > +     /* If there were no zero-CHARs (rcx was zero before
> > +        FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> > +     cmpl    $CHAR_PER_VEC, %ecx
> > +     jne     L(ret_vec_x1)
> > +
> > +     VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > +     VPTESTN %VMM(2), %VMM(2), %k0
> > +     KMOV    %k0, %VRCX
> > +     addl    $-CHAR_PER_VEC, %edx
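> > +     /* `bzhi` clears the mask bits at and above the remaining
> > +        length: if no zero-CHAR lies below it, keep rdx (the length)
> > +        as the copy bound, otherwise fall into L(ret_vec_x2) and use
> > +        the zero-CHAR position instead.  */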
> > +     bzhi    %VRDX, %VRCX, %VR8
> > +     jz      L(ret_vec_x2_len)
> > +L(ret_vec_x2):
> > +     bsf     %VRCX, %VRDX
> > +L(ret_vec_x2_len):
> > +     VMOVU   (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > +     movNULL $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> > +     VMOVU   %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +     ret
> > +
> > +     .p2align 4,, 4
> > +L(ret_vec_x1_len):
> > +     movl    %edx, %ecx
> > +L(ret_vec_x1):
> > +     VMOVU   (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +     movNULL $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
> > +     VMOVU   %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > +     VZEROUPPER_RETURN
> > +
> > +
> > +     .p2align 4,, 8
> > +L(last_4x_vec):
> > +     addl    $-(CHAR_PER_VEC * 4), %edx
> > +     VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
> > +     VPTESTN %VMM(1), %VMM(1), %k0
> > +     KMOV    %k0, %VMASK_REG
> > +     subq    $-(VEC_SIZE * 4), %rsi
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     cmpl    $(CHAR_PER_VEC * 2), %edx
> > +     jbe     L(last_2x_vec)
> > +     .p2align 4,, 8
> > +L(more_2x_vec):
> > +# ifdef USE_AS_WCSCPY
> > +     xorl    %ecx, %ecx
> > +# endif
> > +     bsf     %VMASK_REG, %VRCX
> > +     jnz     L(ret_vec_x1)
> > +
> > +     VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > +     VPTESTN %VMM(2), %VMM(2), %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(ret_vec_x2)
> > +
> > +     VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> > +     VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > +     VPTESTN %VMM(3), %VMM(3), %k0
> > +     KMOV    %k0, %VMASK_REG
> > +
> > +     cmpq    $(CHAR_PER_VEC * 4), %rdx
> > +     ja      L(more_4x_vec)
> > +
> > +     /* Adjust length before going to L(ret_vec_x3_len) or
> > +        L(ret_vec_x3).  */
> > +     addl    $(CHAR_PER_VEC * -2), %edx
> > +
> > +     FIND_FIRST_ONE (VMASK_REG, VRCX)
> > +     cmpl    %ecx, %edx
> > +     jbe     L(ret_vec_x3_len)
> > +
> > +     /* If there were no zero-CHARs (rcx was zero before
> > +        FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> > +     cmpl    $CHAR_PER_VEC, %ecx
> > +     jne     L(ret_vec_x3)
> > +
> > +     VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > +     VPTESTN %VMM(4), %VMM(4), %k0
> > +     KMOV    %k0, %VRCX
> > +     addl    $-CHAR_PER_VEC, %edx
> > +     bzhi    %VRDX, %VRCX, %VR8
> > +     jz      L(ret_vec_x4_len)
> > +L(ret_vec_x4):
> > +     bsf     %VRCX, %VRDX
> > +L(ret_vec_x4_len):
> > +     VMOVU   (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > +     movNULL $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
> > +     VMOVU   %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +     ret
> > +
> > +     .p2align 4,, 4
> > +L(ret_vec_x3_len):
> > +     movl    %edx, %ecx
> > +L(ret_vec_x3):
> > +     VMOVU   (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +     movNULL $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> > +     VMOVU   %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > +     ret
> > +
> > +     .p2align 4,, 8
> > +L(more_4x_vec):
> > +# ifdef USE_AS_WCSCPY
> > +     xorl    %ecx, %ecx
> > +# endif
> > +     bsf     %VMASK_REG, %VRCX
> > +     jnz     L(ret_vec_x3)
> > +
> > +     VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > +     VPTESTN %VMM(4), %VMM(4), %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(ret_vec_x4)
> > +
> > +     VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> > +
> > +     /* Check if we are near the end before aligning.  */
> > +     cmpq    $(CHAR_PER_VEC * 8), %rdx
> > +     jbe     L(last_4x_vec)
> > +
> > +
> > +     /* Add rsi to rdx (length) before aligning rsi. NB: Since we
> > +        filtered out huge lengths this cannot overflow.  */
> > +# ifdef USE_AS_WCSCPY
> > +     leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
> > +# else
> > +     addq    %rsi, %rdx
> > +# endif
> > +
> > +     /* Subtract rsi from rdi before aligning (adding it back
> > +        afterwards gives the correct rdi for the aligned rsi).  */
> > +     subq    %rsi, %rdi
> > +     subq    $-(VEC_SIZE * 5), %rsi
> > +     andq    $(VEC_SIZE * -4), %rsi
> > +
> > +     /* Load first half of the loop before entry.  */
> > +     VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +     VPTESTN %VMM(4), %VMM(4), %k2
> > +     VPTESTN %VMM(6), %VMM(6), %k4
> > +
> > +     /* Offset rsi by VEC_SIZE so that we can jump to
> > +        L(loop_last_4x_vec).  */
> > +     addq    $-(VEC_SIZE), %rsi
> > +     KORTEST %k2, %k4
> > +     jnz     L(loop_4x_done)
> > +
> > +     /* Store loop end in r9.  */
> > +     leaq    -(VEC_SIZE * 5)(%rdx), %r9
> > +
> > +     .p2align 4,, 11
> > +L(loop_4x_vec):
> > +     VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > +     VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> > +     VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> > +     VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> > +
> > +     subq    $(VEC_SIZE * -4), %rsi
> > +     cmpq    %rsi, %r9
> > +     jbe     L(loop_last_4x_vec)
> > +
> > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> > +     VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> > +
> > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +     VPTESTN %VMM(4), %VMM(4), %k2
> > +     VPTESTN %VMM(6), %VMM(6), %k4
> > +     KORTEST %k2, %k4
> > +     jz      L(loop_4x_vec)
> > +
> > +L(loop_4x_done):
> > +     VPTESTN %VMM(0), %VMM(0), %k0
> > +     KMOV    %k0, %VRCX
> > +     /* Restore rdi (dst).  */
> > +     addq    %rsi, %rdi
> > +
> > +     /* L(ret_vec_x1) expects rcx to hold the position of the
> > +        zero-CHAR, so test with bsf.  */
> > +     bsf     %VRCX, %VRCX
> > +     jnz     L(ret_vec_x1)
> > +     VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
> > +
> > +     KMOV    %k2, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(ret_vec_x2)
> > +     VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> > +
> > +     VPTESTN %VMM(2), %VMM(2), %k0
> > +     KMOV    %k0, %VRCX
> > +     bsf     %VRCX, %VRCX
> > +     jnz     L(ret_vec_x3)
> > +     VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> > +
> > +     KMOV    %k4, %VRCX
> > +     bsf     %VRCX, %VRCX
> > +     VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > +     VMOVU   %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > +     ret
> > +
> > +
> > +     .p2align 4,, 4
> > +L(page_cross):
> > +     movq    %rsi, %r8
> > +     andq    $(VEC_SIZE * -1), %r8
> > +     VPCMPEQ (%r8), %VZERO, %k0
> > +
> > +# ifdef USE_AS_WCSCPY
> > +     KMOV    %k0, %VR9
> > +     shrl    $2, %ecx
> > +     andl    $(CHAR_PER_VEC - 1), %ecx
> > +     shrx    %VRCX, %VR9, %VRCX
> > +# else
> > +     KMOV    %k0, %VRCX
> > +     shrx    %VRSI, %VRCX, %VRCX
> > +# endif
> > +
> > +     subl    %esi, %r8d
> > +     andl    $(VEC_SIZE - 1), %r8d
> > +# ifdef USE_AS_WCSCPY
> > +     shrl    $2, %r8d
> > +# endif
> > +     cmpq    %r8, %rdx
> > +     jbe     L(page_cross_small)
> > +     /* Optimizing more for space as this is very cold code. This
> > +        saves 2x cache lines.  */
> > +
> > +     /* This adds one to the later result, which gives the correct
> > +        copy bounds. NB: this can never zero-out a non-zero RCX as,
> > +        to be in the page cross case, rsi cannot be aligned and we
> > +        have already right-shifted rcx by the misalignment.  */
> > +     shl     %VRCX
> > +     jz      L(page_cross_continue)
> > +     bsf     %VRCX, %VRCX
> > +     REP_MOVS
> > +     ret
> > +
> > +L(page_cross_small):
> > +     tzcnt   %VRCX, %VRCX
> > +     jz      L(page_cross_setz)
> > +     cmpl    %edx, %ecx
> > +     cmova   %edx, %ecx
> > +
> > +# ifdef USE_AS_WCSCPY
> > +     rep     movsd
> > +# else
> > +     rep     movsb
> > +# endif
> > +L(page_cross_setz):
> > +     movNULL $0, (%rdi)
> > +     ret
> > +END(STRNCAT)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
> > index 1b3426d511..49eaf4cbd9 100644
> > --- a/sysdeps/x86_64/multiarch/strncpy-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
> > @@ -1,7 +1,990 @@
> > -#ifndef STRNCPY
> > -# define STRNCPY     __strncpy_evex
> > -#endif
> > +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if ISA_SHOULD_BUILD (4)
> > +
> > +     /* Use evex-masked stores for small sizes. Turned off at the
> > +        moment.  */
> > +# define USE_EVEX_MASKED_STORE       0
> > +
> > +
> > +# include <sysdep.h>
> > +# ifndef VEC_SIZE
> > +#  include "x86-evex256-vecs.h"
> > +# endif
> > +
> > +
> > +# ifndef STRNCPY
> > +#  define STRNCPY    __strncpy_evex
> > +# endif
> > +
> > +# ifdef USE_AS_WCSCPY
> > +#  define VMOVU_MASK vmovdqu32
> > +#  define VPCMPEQ    vpcmpeqd
> > +#  define VPMIN      vpminud
> > +#  define VPTESTN    vptestnmd
> > +#  define VPTEST     vptestmd
> > +#  define CHAR_SIZE  4
> > +
> > +#  define REP_MOVS   rep movsd
> > +#  define REP_STOS   rep stosl
> > +
> > +#  define USE_WIDE_CHAR
> > +
> > +# else
> > +#  define VMOVU_MASK vmovdqu8
> > +#  define VPCMPEQ    vpcmpeqb
> > +#  define VPMIN      vpminub
> > +#  define VPTESTN    vptestnmb
> > +#  define VPTEST     vptestmb
> > +#  define CHAR_SIZE  1
> > +
> > +#  define REP_MOVS   rep movsb
> > +#  define REP_STOS   rep stosb
> > +# endif
> > +
> > +# include "strncpy-or-cat-overflow-def.h"
> > +
> > +# define PAGE_SIZE   4096
> > +# define CHAR_PER_VEC        (VEC_SIZE / CHAR_SIZE)
> > +
> > +# include "reg-macros.h"
> > +
> > +
> > +# define VZERO       VMM(7)
> > +# define VZERO_256   VMM_256(7)
> > +# define VZERO_128   VMM_128(7)
> > +
> > +# if VEC_SIZE == 64
> > +#  define VZERO_HALF VZERO_256
> > +# else
> > +#  define VZERO_HALF VZERO_128
> > +# endif
> > +
> > +     .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRNCPY)
> > +     /* Filter zero length strings and very long strings.  Zero
> > +        length strings just return.  Very long strings are handled
> > +        by running rep stos{b|l} to zero-fill the destination (which
> > +        will almost certainly segfault); if that succeeds, then
> > +        OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy) is called.  */
> > +# ifdef USE_AS_WCSCPY
> > +     decq    %rdx
> > +     movq    %rdx, %rax
> > +     /* 56 is end of max supported address space.  */
> > +     shr     $56, %rax
> > +     jnz     L(zero_len)
> > +# else
> > +     decq    %rdx
> > +     /* If the branch below ever needs to become `jb`, replace
> > +        `dec` with `sub` (`dec` does not set the carry flag).  */
> > +     jl      L(zero_len)
> > +# endif
> > +
> > +     vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> > +     movl    %esi, %eax
> > +     andl    $(PAGE_SIZE - 1), %eax
> > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +     ja      L(page_cross)
> > +
> > +L(page_cross_continue):
> > +     VMOVU   (%rsi), %VMM(0)
> > +     VPTESTN %VMM(0), %VMM(0), %k0
> > +     KMOV    %k0, %VRCX
> > +
> > +     /* If not STPCPY just save the return value (dst) ahead of time.  */
> > +# ifndef USE_AS_STPCPY
> > +     movq    %rdi, %rax
> > +# endif
> > +
> > +
> > +     cmpq    $(CHAR_PER_VEC), %rdx
> > +
> > +     /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> > +        <= CHAR_PER_VEC with masked instructions (which have
> > +        potential for dramatically bad perf if dst splits a page and
> > +        is not in the TLB).  */
> > +# if USE_EVEX_MASKED_STORE
> > +     /* `jae` because length rdx is now length - 1.  */
> > +     jae     L(more_1x_vec)
> > +
> > +     /* If there were multiple zero-CHAR matches in the first VEC,
> > +        VRCX will be overset, but that's fine since any oversets
> > +        were at zero-positions anyway.  */
> > +
> > +#  ifdef USE_AS_STPCPY
> > +     tzcnt   %VRCX, %VRAX
> > +     cmpl    %eax, %edx
> > +     cmovb   %edx, %eax
> > +#   ifdef USE_AS_WCSCPY
> > +     adcl    $0, %eax
> > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +#   else
> > +     adcq    %rdi, %rax
> > +#   endif
> > +#  endif
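> > +     /* `dec` turns the zero-CHAR match mask into a mask covering
> > +        all CHARs before the first null (plus possibly later zero
> > +        CHARs), which the zero-masked move below uses to build in
> > +        VZERO the data that L(zfill_less_vec) then stores.  */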
> > +     dec     %VRCX
> > +
> > +     /* Zero out all non-zero CHAR's after the first zero match.  */
> > +     KMOV    %VRCX, %k1
> > +
> > +     /* Use VZERO as destination so this can be reused for
> > +        L(zfill_less_vec) (which, if jumped to by subsequent logic,
> > +        will have zeroed out VZERO).  */
> > +     VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
> > +L(zfill_less_vec):
> > +     /* Get mask for what we need to set.  */
> > +     incl    %edx
> > +     mov     $-1, %VRCX
> > +     bzhi    %VRDX, %VRCX, %VRCX
> > +     KMOV    %VRCX, %k1
> > +     VMOVU_MASK %VZERO, (%rdi){%k1}
> > +     ret
> > +
> > +     .p2align 4,, 4
> > +L(zero_len):
> > +     cmpq    $-1, %rdx
> > +     jne     L(best_effort_strncpy)
> > +     movq    %rdi, %rax
> > +     ret
> > +
> > +     .p2align 4,, 8
> > +L(more_1x_vec):
> > +# else
> > +     /* `jb` because length rdx is now length - 1.  */
> > +     jb      L(less_1x_vec)
> > +# endif
> > +
> > +
> > +     /* This may overset but that's fine because we still need to
> > +        zero fill.  */
> > +     VMOVU   %VMM(0), (%rdi)
> > +
> > +
> > +     /* Length must be >= CHAR_PER_VEC so a match here means we
> > +        must zero-fill.  */
> > +     test    %VRCX, %VRCX
> > +     jnz     L(zfill)
> > +
> > +
> > +     /* We are going to align rsi here so we will need to be able to
> > +        re-adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> > +        so rsi + rdx * CHAR_SIZE cannot overflow.  */
> > +     leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> > +     subq    %rsi, %rdi
> > +     andq    $-(VEC_SIZE), %rsi
> > +
> > +L(loop_last_4x_vec):
> > +     addq    %rsi, %rdi
> > +     subq    %rsi, %rdx
> > +# ifdef USE_AS_WCSCPY
> > +     shrq    $2, %rdx
> > +# endif
> > +
> > +     VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> > +     VPTESTN %VMM(1), %VMM(1), %k0
> > +     KMOV    %k0, %VRCX
> > +
> > +     /* -1 because of the `dec %rdx` earlier.  */
> > +     cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
> > +     ja      L(more_2x_vec)
> > +
> > +L(last_2x_vec):
> > +     /* This will need to be computed no matter what. We do it
> > +        ahead of time for CHAR_PER_VEC == 64 because we can't adjust
> > +        the value of `tzcnt` with a shift.  */
> > +# if CHAR_PER_VEC == 64
> > +     tzcntq  %rcx, %rcx
> > +# endif
> > +
> > +     cmpl    $(CHAR_PER_VEC), %edx
> > +     jb      L(ret_vec_x1_len)
> > +
> > +     /* Separate logic for CHAR_PER_VEC == 64 because we already did
> > +        `tzcnt` on VRCX.  */
> > +# if CHAR_PER_VEC == 64
> > +     /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
> > +     cmpb    $CHAR_PER_VEC, %cl
> > +     jnz     L(ret_vec_x1_no_bsf)
> > +# else
> > +     test    %VRCX, %VRCX
> > +     jnz     L(ret_vec_x1)
> > +# endif
> > +
> > +
> > +
> > +     VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
> > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > +     KMOV    %k0, %VRCX
> > +
> > +# if CHAR_PER_VEC < 64
> > +     /* This essentially adds CHAR_PER_VEC to the computed result
> > +        (the `tzcnt` below then yields index + CHAR_PER_VEC).  */
> > +     shlq    $CHAR_PER_VEC, %rcx
> > +# else
> > +     tzcntq  %rcx, %rcx
> > +     addl    $CHAR_PER_VEC, %ecx
> > +# endif
> > +
> > +     .p2align 4,, 4
> > +L(ret_vec_x1_len):
> > +     /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
> > +        already been done.  */
> > +# if CHAR_PER_VEC < 64
> > +     tzcntq  %rcx, %rcx
> > +# endif
> > +     cmpl    %ecx, %edx
> > +     jbe     L(ret_vec_x1_len_no_zfill)
> > +     /* Fall through (expectation) is copy len < buffer len.  */
> > +     VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +L(ret_vec_x1_len_no_zfill_mov):
> > +     movl    %ecx, %edx
> > +# ifdef USE_AS_STPCPY
> > +     /* clear flags.  */
> > +     xorl    %ecx, %ecx
> > +# endif
> > +L(ret_vec_x1_len_no_zfill):
> > +     VMOVU   ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > +     VMOVU   %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +# ifdef USE_AS_STPCPY
> > +#  ifdef USE_AS_WCSCPY
> > +     adcq    $0, %rdx
> > +     leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
> > +#  else
> > +     leal    (VEC_SIZE)(%rdx), %eax
> > +     adcq    %rdi, %rax
> > +#  endif
> > +# endif
> > +     ret
> > +
> > +
> > +     .p2align 4,, 10
> > +L(ret_vec_x1):
> > +     bsf     %VRCX, %VRCX
> > +L(ret_vec_x1_no_bsf):
> > +     VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +     subl    %ecx, %edx
> > +     cmpl    $CHAR_PER_VEC, %edx
> > +     jb      L(ret_vec_x1_len_no_zfill_mov)
> > +     /* Fall through (expectation) is copy len < buffer len.  */
> > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > +     VMOVU   %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
> > +# ifdef USE_AS_STPCPY
> > +     leaq    (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
> > +# endif
> > +     ret
> > +
> > +     .p2align 4,, 8
> > +L(last_4x_vec):
> > +     /* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
> > +        $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
> > +        using `movzbl`.  */
> > +# if CHAR_PER_VEC == 64
> > +     movzbl  %dl, %edx
> > +# else
> > +     andl    $(CHAR_PER_VEC * 4 - 1), %edx
> > +# endif
> > +     VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
> > +     VPTESTN %VMM(1), %VMM(1), %k0
> > +     KMOV    %k0, %VRCX
> > +     subq    $-(VEC_SIZE * 4), %rsi
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     cmpl    $(CHAR_PER_VEC * 2 - 1), %edx
> > +     jbe     L(last_2x_vec)
> > +     .p2align 4,, 8
> > +L(more_2x_vec):
> > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > +     test    %VRCX, %VRCX
> > +     /* Must fill at least 2x VEC.  */
> > +     jnz     L(zfill_vec1)
> > +
> > +     VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > +     VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > +     VPTESTN %VMM(2), %VMM(2), %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     /* Must fill at least 1x VEC.  */
> > +     jnz     L(zfill_vec2)
> > +
> > +     VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> > +     VPTESTN %VMM(3), %VMM(3), %k0
> > +     KMOV    %k0, %VRCX
> > +
> > +     /* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
> > +     cmpq    $(CHAR_PER_VEC * 4 - 1), %rdx
> > +     ja      L(more_4x_vec)
> > +
> > +     subl    $(CHAR_PER_VEC * 3), %edx
> > +     jb      L(ret_vec_x3_len)
> > +
> > +     test    %VRCX, %VRCX
> > +     jnz     L(ret_vec_x3)
> > +
> > +     VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
> > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > +     KMOV    %k0, %VRCX
> > +     tzcnt   %VRCX, %VRCX
> > +     cmpl    %ecx, %edx
> > +     jbe     L(ret_vec_x4_len_no_zfill)
> > +     /* Fall through (expectation) is copy len < buffer len.  */
> > +     VMOVU   %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +     movl    %ecx, %edx
> > +L(ret_vec_x4_len_no_zfill):
> > +     VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > +     VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +# ifdef USE_AS_STPCPY
> > +#  ifdef USE_AS_WCSCPY
> > +     adcq    $0, %rdx
> > +     leaq    (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
> > +#  else
> > +     leal    (VEC_SIZE * 4 + 0)(%rdx), %eax
> > +     adcq    %rdi, %rax
> > +#  endif
> > +# endif
> > +     ret
> > +
> > +
> > +L(ret_vec_x3_len):
> > +     addl    $(CHAR_PER_VEC * 1), %edx
> > +     tzcnt   %VRCX, %VRCX
> > +     cmpl    %ecx, %edx
> > +     jbe     L(ret_vec_x3_len_no_zfill)
> > +     /* Fall through (expectation) is copy len < buffer len.  */
> > +     VMOVU   %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +L(ret_vec_x3_len_no_zfill_mov):
> > +     movl    %ecx, %edx
> > +# ifdef USE_AS_STPCPY
> > +     /* clear flags.  */
> > +     xorl    %ecx, %ecx
> > +# endif
> > +     .p2align 4,, 4
> > +L(ret_vec_x3_len_no_zfill):
> > +     VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > +     VMOVU   %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > +# ifdef USE_AS_STPCPY
> > +#  ifdef USE_AS_WCSCPY
> > +     adcq    $0, %rdx
> > +     leaq    (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
> > +#  else
> > +     leal    (VEC_SIZE * 3 + 0)(%rdx), %eax
> > +     adcq    %rdi, %rax
> > +#  endif
> > +# endif
> > +     ret
> > +
> > +
> > +     .p2align 4,, 8
> > +L(ret_vec_x3):
> > +     bsf     %VRCX, %VRCX
> > +     VMOVU   %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
> > +     subl    %ecx, %edx
> > +     jl      L(ret_vec_x3_len_no_zfill_mov)
> > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > +     VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> > +# ifdef USE_AS_STPCPY
> > +     leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> > +# endif
> > +     ret
> > +
> > +     .p2align 4,, 8
> > +L(more_4x_vec):
> > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > +     test    %VRCX, %VRCX
> > +     jnz     L(zfill_vec3)
> > +
> > +     VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > +     VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> > +     VPTESTN %VMM(4), %VMM(4), %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(zfill_vec4)
> >
> > -#define USE_AS_STRNCPY
> > -#define STRCPY       STRNCPY
> > -#include "strcpy-evex.S"
> > +     /* Recheck length before aligning.  */
> > +     cmpq    $(CHAR_PER_VEC * 8 - 1), %rdx
> > +     jbe     L(last_4x_vec)
> > +
> > +     /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
> > +# ifdef USE_AS_WCSCPY
> > +     leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
> > +# else
> > +     addq    %rsi, %rdx
> > +# endif
> > +     subq    %rsi, %rdi
> > +     subq    $-(VEC_SIZE * 5), %rsi
> > +     andq    $(VEC_SIZE * -4), %rsi
> > +
> > +
> > +     /* Load first half of the loop before entry.  */
> > +     VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +     VPTESTN %VMM(4), %VMM(4), %k2
> > +     VPTESTN %VMM(6), %VMM(6), %k4
> > +
> > +
> > +     /* Offset rsi by VEC_SIZE so that we can jump to
> > +        L(loop_last_4x_vec).  */
> > +     addq    $-(VEC_SIZE), %rsi
> > +     KORTEST %k2, %k4
> > +     jnz     L(loop_4x_done)
> > +
> > +     /* Store loop end in r9.  */
> > +     leaq    -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
> > +
> > +     .p2align 4,, 11
> > +L(loop_4x_vec):
> > +     VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > +     VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> > +     VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> > +     VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> > +
> > +     subq    $(VEC_SIZE * -4), %rsi
> > +     cmpq    %rsi, %r9
> > +     jbe     L(loop_last_4x_vec)
> > +
> > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> > +     VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> > +
> > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > +     VPTESTN %VMM(4), %VMM(4), %k2
> > +     VPTESTN %VMM(6), %VMM(6), %k4
> > +     KORTEST %k2, %k4
> > +     jz      L(loop_4x_vec)
> > +
> > +L(loop_4x_done):
> > +     /* Restore rdx (length).  */
> > +     subq    %rsi, %rdx
> > +# ifdef USE_AS_WCSCPY
> > +     shrq    $2, %rdx
> > +# endif
> > +     VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > +     /* Restore rdi (dst).  */
> > +     addq    %rsi, %rdi
> > +     VPTESTN %VMM(0), %VMM(0), %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(zfill_vec1)
> > +
> > +     VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> > +     KMOV    %k2, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(zfill_vec2)
> > +
> > +     VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> > +     VPTESTN %VMM(2), %VMM(2), %k0
> > +     KMOV    %k0, %VRCX
> > +     test    %VRCX, %VRCX
> > +     jnz     L(zfill_vec3)
> > +
> > +     VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
> > +     KMOV    %k4, %VRCX
> > +     /* The zero-CHAR must be in VMM(3); fall through and zero-fill
> > +        from there.  */
> > +
> > +     .p2align 4,, 4
> > +L(zfill_vec4):
> > +     subq    $(VEC_SIZE * -2), %rdi
> > +     addq    $(CHAR_PER_VEC * -2), %rdx
> > +L(zfill_vec2):
> > +     subq    $(VEC_SIZE * -2), %rdi
> > +     addq    $(CHAR_PER_VEC * -1), %rdx
> > +L(zfill):
> > +     /* VRCX must be non-zero.  */
> > +     bsf     %VRCX, %VRCX
> > +
> > +     /* Adjust length / dst for zfill.  */
> > +     subq    %rcx, %rdx
> > +# ifdef USE_AS_WCSCPY
> > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > +# else
> > +     addq    %rcx, %rdi
> > +# endif
> > +# ifdef USE_AS_STPCPY
> > +     movq    %rdi, %rax
> > +# endif
> > +L(zfill_from_page_cross):
> > +
> > +	/* From here on out it's just memset(rdi, 0, rdx).  */
> > +     cmpq    $CHAR_PER_VEC, %rdx
> > +     jb      L(zfill_less_vec)
> > +
> > +L(zfill_more_1x_vec):
> > +     VMOVU   %VZERO, (%rdi)
> > +     VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +     cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
> > +     ja      L(zfill_more_2x_vec)
> > +L(zfill_done0):
> > +     ret
> > +
> > +     /* Coming from vec1/vec2 we must be able to zfill at least 2x
> > +        VEC.  */
> > +     .p2align 4,, 8
> > +L(zfill_vec3):
> > +     subq    $(VEC_SIZE * -2), %rdi
> > +     addq    $(CHAR_PER_VEC * -2), %rdx
> > +     .p2align 4,, 2
> > +L(zfill_vec1):
> > +     bsfq    %rcx, %rcx
> > +     /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
> > +      */
> > +     leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> > +     subq    %rcx, %rdx
> > +# ifdef USE_AS_STPCPY
> > +     movq    %rdi, %rax
> > +# endif
> > +
> > +
> > +     VMOVU   %VZERO, (%rdi)
> > +     VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +     cmpq    $(CHAR_PER_VEC * 2), %rdx
> > +     jb      L(zfill_done0)
> > +L(zfill_more_2x_vec):
> > +     VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> > +     VMOVU   %VZERO, (VEC_SIZE)(%rdi)
> > +     subq    $(CHAR_PER_VEC * 4 - 1), %rdx
> > +     jbe     L(zfill_done)
> > +
> > +# ifdef USE_AS_WCSCPY
> > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rdx
> > +# else
> > +     addq    %rdi, %rdx
> > +# endif
> > +
> > +     VMOVU   %VZERO, (VEC_SIZE * 2)(%rdi)
> > +     VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi)
> > +
> > +
> > +     VMOVU   %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> > +     VMOVU   %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> > +
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     cmpq    %rdi, %rdx
> > +     jbe     L(zfill_done)
> > +
> > +     /* Align rdi and zfill loop.  */
> > +     andq    $-(VEC_SIZE), %rdi
> > +     .p2align 4,, 12
> > +L(zfill_loop_4x_vec):
> > +     VMOVA   %VZERO, (VEC_SIZE * 0)(%rdi)
> > +     VMOVA   %VZERO, (VEC_SIZE * 1)(%rdi)
> > +     VMOVA   %VZERO, (VEC_SIZE * 2)(%rdi)
> > +     VMOVA   %VZERO, (VEC_SIZE * 3)(%rdi)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     cmpq    %rdi, %rdx
> > +     ja      L(zfill_loop_4x_vec)
> > +L(zfill_done):
> > +     ret
> > +
> > +
> > +     /* Less 1x VEC case if we are not using evex masked store.  */
> > +# if !USE_EVEX_MASKED_STORE
> > +     .p2align 4,, 8
> > +L(copy_1x):
> > +     /* Special case for copy 1x. It can be handled quickly and many
> > +        buffer sizes have convenient alignment.  */
> > +     VMOVU   %VMM(0), (%rdi)
> > +     /* If no zeros then we are done.  */
> > +     testl   %ecx, %ecx
> > +     jz      L(ret_1x_1x)
> > +
> > +	/* Need to zfill; note we know that length <= CHAR_PER_VEC so we
> > +	   only handle the small case here.  */
> > +     bsf     %VRCX, %VRCX
> > +L(zfill_less_vec_no_bsf):
> > +     /* Adjust length / dst then just zfill less_vec.  */
> > +     subq    %rcx, %rdx
> > +#  ifdef USE_AS_WCSCPY
> > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > +#  else
> > +     addq    %rcx, %rdi
> > +#  endif
> > +#  ifdef USE_AS_STPCPY
> > +     movq    %rdi, %rax
> > +#  endif
> > +
> > +L(zfill_less_vec):
> > +     cmpl    $((VEC_SIZE / 2) / CHAR_SIZE), %edx
> > +     jb      L(zfill_less_half)
> > +
> > +     VMOVU   %VZERO_HALF, (%rdi)
> > +     VMOVU   %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +     ret
> > +#  ifdef USE_AS_STPCPY
> > +L(ret_1x_1x):
> > +     leaq    CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
> > +     ret
> > +#  endif
> > +
> > +
> > +#  if VEC_SIZE == 64
> > +     .p2align 4,, 4
> > +L(copy_32_63):
> > +     /* Overfill to avoid branches.  */
> > +     VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > +     VMOVU   %VMM_256(0), (%rdi)
> > +     VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +
> > +	/* We are taking advantage of the fact that to be here we must
> > +	   be writing the null-term at (%rdi, %rcx), so we have a byte of
> > +	   leeway for overwriting.  */
> > +     cmpl    %ecx, %edx
> > +     ja      L(zfill_less_vec_no_bsf)
> > +#   ifndef USE_AS_STPCPY
> > +L(ret_1x_1x):
> > +#   else
> > +#    ifdef USE_AS_WCSCPY
> > +     adcq    $0, %rdx
> > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#    else
> > +     movl    %edx, %eax
> > +     adcq    %rdi, %rax
> > +#    endif
> > +#   endif
> > +     ret
> > +#  endif
> > +
> > +     .p2align 4,, 4
> > +L(copy_16_31):
> > +     /* Overfill to avoid branches.  */
> > +     vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > +     VMOVU   %VMM_128(0), (%rdi)
> > +     vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +     cmpl    %ecx, %edx
> > +
> > +	/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
> > +	   we have a larger copy block for 32-63 so this just falls
> > +	   through to zfill 16-31. If VEC_SIZE == 32 then we check for
> > +	   full zfill of less than 1x VEC.  */
> > +#  if VEC_SIZE == 64
> > +     jbe     L(ret_16_31)
> > +     subl    %ecx, %edx
> > +#   ifdef USE_AS_WCSCPY
> > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > +#   else
> > +     addq    %rcx, %rdi
> > +#   endif
> > +#   ifdef USE_AS_STPCPY
> > +     movq    %rdi, %rax
> > +#   endif
> > +L(zfill_less_half):
> > +L(zfill_less_32):
> > +     cmpl    $(16 / CHAR_SIZE), %edx
> > +     jb      L(zfill_less_16)
> > +     VMOVU   %VZERO_128, (%rdi)
> > +     VMOVU   %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +#   ifdef USE_AS_STPCPY
> > +     ret
> > +#   endif
> > +L(ret_16_31):
> > +#   ifdef USE_AS_STPCPY
> > +#    ifdef USE_AS_WCSCPY
> > +     adcq    $0, %rdx
> > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#    else
> > +     movl    %edx, %eax
> > +     adcq    %rdi, %rax
> > +#    endif
> > +#   endif
> > +     ret
> > +#  else
> > +     /* VEC_SIZE == 32 begins.  */
> > +     ja      L(zfill_less_vec_no_bsf)
> > +#   ifndef USE_AS_STPCPY
> > +L(ret_1x_1x):
> > +#   else
> > +#    ifdef USE_AS_WCSCPY
> > +     adcq    $0, %rdx
> > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#    else
> > +     movl    %edx, %eax
> > +     adcq    %rdi, %rax
> > +#    endif
> > +#   endif
> > +     ret
> > +#  endif
> > +
> > +
> > +     .p2align 4,, 4
> > +L(copy_8_15):
> > +     /* Overfill to avoid branches.  */
> > +     movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> > +     vmovq   %VMM_128(0), (%rdi)
> > +     movq    %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +     cmpl    %ecx, %edx
> > +     jbe     L(ret_8_15)
> > +     subl    %ecx, %edx
> > +#  ifdef USE_AS_WCSCPY
> > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > +#  else
> > +     addq    %rcx, %rdi
> > +#  endif
> > +#  ifdef USE_AS_STPCPY
> > +     movq    %rdi, %rax
> > +#  endif
> > +     .p2align 4,, 8
> > +#  if VEC_SIZE == 32
> > +L(zfill_less_half):
> > +#  endif
> > +L(zfill_less_16):
> > +     xorl    %ecx, %ecx
> > +     cmpl    $(8 / CHAR_SIZE), %edx
> > +     jb      L(zfill_less_8)
> > +     movq    %rcx, (%rdi)
> > +     movq    %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > +#  ifndef USE_AS_STPCPY
> > +L(ret_8_15):
> > +#  endif
> > +     ret
> > +
> > +     .p2align 4,, 8
> > +L(less_1x_vec):
> > +     je      L(copy_1x)
> > +
> > +     /* We will need `tzcnt` result for all other copy sizes.  */
> > +     tzcnt   %VRCX, %VRCX
> > +#  if VEC_SIZE == 64
> > +     cmpl    $(32 / CHAR_SIZE), %edx
> > +     jae     L(copy_32_63)
> > +#  endif
> > +
> > +     cmpl    $(16 / CHAR_SIZE), %edx
> > +     jae     L(copy_16_31)
> > +
> > +     cmpl    $(8 / CHAR_SIZE), %edx
> > +     jae     L(copy_8_15)
> > +#  ifdef USE_AS_WCSCPY
> > +     testl   %ecx, %ecx
> > +     jz      L(zfill_less_8_set_ret)
> > +
> > +     movl    (%rsi, %rdx, CHAR_SIZE), %esi
> > +     vmovd   %VMM_128(0), (%rdi)
> > +     movl    %esi, (%rdi, %rdx, CHAR_SIZE)
> > +#   ifdef USE_AS_STPCPY
> > +     cmpl    %ecx, %edx
> > +L(ret_8_15):
> > +     adcq    $0, %rdx
> > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#   endif
> > +     ret
> > +L(zfill_less_8_set_ret):
> > +     xorl    %ecx, %ecx
> > +#   ifdef USE_AS_STPCPY
> > +     movq    %rdi, %rax
> > +#   endif
> > +L(zfill_less_8):
> > +     movl    %ecx, (%rdi)
> > +     movl    %ecx, (%rdi, %rdx, CHAR_SIZE)
> > +     ret
> > +#  else
> > +     cmpl    $3, %edx
> > +     jb      L(copy_0_3)
> > +     /* Overfill to avoid branches.  */
> > +     movl    -3(%rsi, %rdx), %esi
> > +     vmovd   %VMM_128(0), (%rdi)
> > +     movl    %esi, -3(%rdi, %rdx)
> > +     cmpl    %ecx, %edx
> > +     jbe     L(ret_4_7)
> > +     subq    %rcx, %rdx
> > +     addq    %rcx, %rdi
> > +#   ifdef USE_AS_STPCPY
> > +     movq    %rdi, %rax
> > +#   endif
> > +     xorl    %ecx, %ecx
> > +     .p2align 4,, 8
> > +L(zfill_less_8):
> > +     cmpl    $3, %edx
> > +     jb      L(zfill_less_3)
> > +     movl    %ecx, (%rdi)
> > +     movl    %ecx, -3(%rdi, %rdx)
> > +#   ifdef USE_AS_STPCPY
> > +     ret
> > +#   endif
> > +
> > +L(ret_4_7):
> > +#   ifdef USE_AS_STPCPY
> > +L(ret_8_15):
> > +     movl    %edx, %eax
> > +     adcq    %rdi, %rax
> > +#   endif
> > +     ret
> > +
> > +     .p2align 4,, 4
> > +L(zfill_less_3):
> > +     testl   %edx, %edx
> > +     jz      L(zfill_1)
> > +     movw    %cx, (%rdi)
> > +L(zfill_1):
> > +     movb    %cl, (%rdi, %rdx)
> > +     ret
> > +
> > +     .p2align 4,, 8
> > +L(copy_0_3):
> > +     vmovd   %VMM_128(0), %r8d
> > +     testl   %edx, %edx
> > +     jz      L(copy_1)
> > +     movw    %r8w, (%rdi)
> > +     cmpl    %ecx, %edx
> > +     ja      L(zfill_from_1)
> > +     movzbl  (%rsi, %rdx), %r8d
> > +#   ifdef USE_AS_STPCPY
> > +     movl    %edx, %eax
> > +     adcq    %rdi, %rax
> > +     movb    %r8b, (%rdi, %rdx)
> > +     ret
> > +#   endif
> > +
> > +L(copy_1):
> > +#   ifdef USE_AS_STPCPY
> > +     movl    %edx, %eax
> > +     cmpl    %ecx, %edx
> > +     adcq    %rdi, %rax
> > +#   endif
> > +#   ifdef USE_AS_WCSCPY
> > +     vmovd   %VMM_128(0), (%rdi)
> > +#   else
> > +     movb    %r8b, (%rdi, %rdx)
> > +#   endif
> > +     ret
> > +#  endif
> > +
> > +
> > +#  ifndef USE_AS_WCSCPY
> > +     .p2align 4,, 8
> > +L(zfill_from_1):
> > +#   ifdef USE_AS_STPCPY
> > +     leaq    (%rdi, %rcx), %rax
> > +#   endif
> > +     movw    $0, -1(%rdi, %rdx)
> > +     ret
> > +#  endif
> > +
> > +     .p2align 4,, 4
> > +L(zero_len):
> > +     incq    %rdx
> > +     jne     L(best_effort_strncpy)
> > +     movq    %rdi, %rax
> > +     ret
> > +# endif
> > +
> > +
> > +     .p2align 4,, 4
> > +     .p2align 6,, 8
> > +L(page_cross):
> > +     movq    %rsi, %rax
> > +     andq    $(VEC_SIZE * -1), %rax
> > +     VPCMPEQ (%rax), %VZERO, %k0
> > +     KMOV    %k0, %VRCX
> > +# ifdef USE_AS_WCSCPY
> > +     movl    %esi, %r8d
> > +     shrl    $2, %r8d
> > +     andl    $(CHAR_PER_VEC - 1), %r8d
> > +     shrx    %VR8, %VRCX, %VRCX
> > +# else
> > +     shrx    %VRSI, %VRCX, %VRCX
> > +# endif
> > +
> > +	/* Compute the number of bytes we checked.  */
> > +     subl    %esi, %eax
> > +     andl    $(VEC_SIZE - 1), %eax
> > +# ifdef USE_AS_WCSCPY
> > +     shrl    $2, %eax
> > +# endif
> > +
> > +     /* If rax > rdx then we are finishing the copy at the end of the
> > +        page.  */
> > +     cmpq    %rax, %rdx
> > +     jb      L(page_cross_small)
> > +
> > +
> > +	/* If rcx is zero (no zero CHAR found) then continue.  */
> > +     test    %VRCX, %VRCX
> > +     jz      L(page_cross_continue)
> > +
> > +	/* We found a zero CHAR so we need to copy then zfill (we know
> > +	   we didn't cover all of the length here).  */
> > +     bsf     %VRCX, %VRCX
> > +L(movsb_and_zfill):
> > +     incl    %ecx
> > +     subq    %rcx, %rdx
> > +# ifdef USE_AS_STPCPY
> > +     leaq    -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
> > +# else
> > +     movq    %rdi, %rax
> > +# endif
> > +
> > +     REP_MOVS
> > +# ifdef USE_AS_WCSCPY
> > +     movl    $0, (%rdi)
> > +# else
> > +     movb    $0, (%rdi)
> > +# endif
> > +     jmp     L(zfill_from_page_cross)
> > +
> > +L(page_cross_small):
> > +     tzcnt   %VRCX, %VRCX
> > +     cmpl    %ecx, %edx
> > +     jbe     L(page_cross_copy_only)
> > +
> > +     /* Do a zfill of the tail before copying.  */
> > +     movq    %rdi, %r9
> > +     xorl    %eax, %eax
> > +
> > +     movl    %ecx, %r8d
> > +
> > +     subl    %ecx, %edx
> > +     leaq    CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> > +     movl    %edx, %ecx
> > +     REP_STOS
> > +     movq    %r9, %rdi
> > +     movl    %r8d, %edx
> > +L(page_cross_copy_only):
> > +     leal    1(%rdx), %ecx
> > +# ifdef USE_AS_STPCPY
> > +#  ifdef USE_AS_WCSCPY
> > +     adcl    $0, %edx
> > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > +#  else
> > +     movl    %edx, %eax
> > +     adcq    %rdi, %rax
> > +#  endif
> > +# else
> > +     movq    %rdi, %rax
> > +# endif
> > +     REP_MOVS
> > +     ret
> > +
> > +
> > +L(best_effort_strncpy):
> > +     movq    %rdx, %rcx
> > +     xorl    %eax, %eax
> > +     movq    %rdi, %r8
> > +	/* The length is >= 2^63. We fully expect to segfault at
> > +	   rep stos. If that doesn't happen then just strcpy to finish.
> > +	 */
> > +     REP_STOS
> > +     movq    %r8, %rdi
> > +     jmp     OVERFLOW_STRCPY
> > +END(STRNCPY)
> > +#endif
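
For reference, the many L(zfill*) paths above are just the vectorized form of
the ISO C strncpy contract: copy at most `len` CHARs, and if the source ends
early, pad the rest of the destination with zeros.  A minimal C sketch of that
contract (illustrative only; strncpy_ref is not part of the patch):

    #include <stddef.h>

    /* Reference strncpy: copy at most n chars, then zero-fill ("zfill")
       the tail of the destination if the source ended early.  */
    char *
    strncpy_ref (char *dst, const char *src, size_t n)
    {
      size_t i = 0;
      for (; i < n && src[i] != '\0'; i++)
        dst[i] = src[i];
      for (; i < n; i++)        /* The "zfill" part.  */
        dst[i] = '\0';
      return dst;
    }

The assembly above vectorizes both halves of this loop and special-cases the
short-length and page-cross paths.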
> > diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> > new file mode 100644
> > index 0000000000..d5ff4cbe50
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
>
> Please add a copyright notice.
>
> > @@ -0,0 +1,65 @@
> > +#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
> > +#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
> > +
> > +#if defined USE_MULTIARCH && IS_IN(libc)
> > +#  define UNDERSCORES __
> > +#  ifdef USE_WITH_SSE2
> > +#    define ISA_EXT _sse2
> > +#  elif defined USE_WITH_AVX
> > +#    ifdef USE_WITH_RTM
> > +#      define ISA_EXT _avx_rtm
> > +#    else
> > +#      define ISA_EXT _avx
> > +#    endif
> > +#  elif defined USE_WITH_AVX2
>
> Do we have a function with both AVX and AVX2 versions? If not, we should
> keep just one.
>
> > +#    ifdef USE_WITH_RTM
> > +#      define ISA_EXT _avx2_rtm
> > +#    else
> > +#      define ISA_EXT _avx2
> > +#    endif
> > +
> > +#  elif defined USE_WITH_EVEX256
> > +#    define ISA_EXT _evex
> > +#  elif defined USE_WITH_EVEX512
> > +#    define ISA_EXT _evex512
> > +#  endif
> > +#else
> > +#  define UNDERSCORES
> > +#  define ISA_EXT
> > +#endif
> > +
> > +#ifdef USE_AS_WCSCPY
> > +#  define STRCPY_PREFIX wc
> > +#  define STRCAT_PREFIX wcs
> > +#  ifdef USE_AS_STPCPY
> > +#    define STRCPY_POSTFIX pcpy
> > +#  else
> > +#    define STRCPY_POSTFIX scpy
> > +#  endif
> > +#else
> > +#  define STRCPY_PREFIX st
> > +#  define STRCAT_PREFIX str
> > +#  ifdef USE_AS_STPCPY
> > +#    define STRCPY_POSTFIX pcpy
> > +#  else
> > +#    define STRCPY_POSTFIX rcpy
> > +#  endif
> > +#endif
> > +#define STRCAT_POSTFIX cat
> > +
> > +#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
> > +  underscores##prefix##postfix##ext
> > +
> > +#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
> > +
> > +#ifndef OVERFLOW_STRCPY
> > +#  define OVERFLOW_STRCPY                                                     \
> > +    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
> > +#endif
> > +
> > +#ifndef OVERFLOW_STRCAT
> > +#  define OVERFLOW_STRCAT                                                     \
> > +    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
> > +#endif
> > +
> > +#endif
> > --
> > 2.34.1
> >
>
> H.J.
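
As a side note on strncpy-or-cat-overflow-def.h: the two-level OF_NAMER
wrapper is there so the configuration macros are expanded before token
pasting.  A tiny standalone demo of the same expansion (illustrative only,
not part of the patch; in the default non-wide, non-stp EVEX build it yields
__strcpy_evex, the symbol the overflow path tail-calls):

    #include <stdio.h>

    #define UNDERSCORES __
    #define STRCPY_PREFIX st
    #define STRCPY_POSTFIX rcpy
    #define ISA_EXT _evex

    #define PRIMITIVE_OF_NAMER(u, p, s, e) u##p##s##e
    #define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)

    /* Two-level stringification so the pasted name is fully expanded.  */
    #define STR_(x) #x
    #define STR(x) STR_ (x)

    int
    main (void)
    {
      /* Prints "__strcpy_evex".  */
      puts (STR (OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)));
      return 0;
    }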

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-04 22:27       ` Noah Goldstein
@ 2022-11-04 22:47         ` H.J. Lu
  2022-11-04 23:06           ` Noah Goldstein
  0 siblings, 1 reply; 42+ messages in thread
From: H.J. Lu @ 2022-11-04 22:47 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Nov 4, 2022 at 3:28 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Nov 4, 2022 at 2:46 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Nov 04, 2022 at 01:13:11PM -0700, Noah Goldstein wrote:
> > > Optimizations are:
> > >     1. Use more overlapping stores to avoid branches (a short C sketch
> > >        of this trick follows the diffstat below).
> > >     2. Reduce how unrolled the aligning copies are (this is more of a
> > >        code-size save, it's a negative for some sizes in terms of
> > >        perf).
> > >     3. Improve the loop a bit (similar to what we do in strlen with
> > >        2x vpminu + kortest instead of 3x vpminu + kmov + test).
> > >     4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
> > >        number that are taken.
> > >
> > > Performance Changes:
> > >
> > >     Times are from N = 10 runs of the benchmark suite and are
> > >     reported as geometric mean of all ratios of
> > >     New Implementation / Old Implementation.
> > >
> > >     stpcpy-evex      -> 0.922
> > >     strcat-evex      -> 0.985
> > >     strcpy-evex      -> 0.880
> > >
> > >     strncpy-evex     -> 0.831
> > >     stpncpy-evex     -> 0.780
> > >
> > >     strncat-evex     -> 0.958
> > >
> > > Code Size Changes:
> > >     function         -> Bytes New / Bytes Old -> Ratio
> > >
> > >     strcat-evex      ->  819 / 1874 -> 0.437
> > >     strcpy-evex      ->  700 / 1074 -> 0.652
> > >     stpcpy-evex      ->  735 / 1094 -> 0.672
> > >
> > >     strncpy-evex     -> 1397 / 2611 -> 0.535
> > >     stpncpy-evex     -> 1489 / 2691 -> 0.553
> > >
> > >     strncat-evex     -> 1184 / 2832 -> 0.418
> > >
> > > Notes:
> > >     1. Because of the significant difference between the
> > >        implementations they are split into three files.
> > >
> > >            strcpy-evex.S    -> strcpy, stpcpy, strcat
> > >            strncpy-evex.S   -> strncpy
> > >            strncat-evex.S   -> strncat
> > >
> > >        I couldn't find a way to merge them without making the
> > >        ifdefs incredibly difficult to follow.
> > >
> > >     2. All implementations can be made evex512 by including
> > >        "x86-evex512-vecs.h" at the top.
> > >
> > >     3. All implementations have an optional define:
> > >         `USE_EVEX_MASKED_STORE`
> > >        Setting to one uses evex-masked stores for handling short
> > >        strings.  This saves code size and branches.  It's disabled
> > >        for all implementations at the moment as there are some
> > >        serious drawbacks to masked stores in certain cases, but
> > >        that may be fixed on future architectures.
> > >
> > > Full check passes on x86-64 and build succeeds for all ISA levels w/
> > > and w/o multiarch.
> > > ---
> > >  sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
> > >  sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
> > >  sysdeps/x86_64/multiarch/strcat-strlen-evex.S |  110 ++
> > >  sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
> > >  sysdeps/x86_64/multiarch/strncat-evex.S       |  525 ++++++-
> > >  sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
> > >  .../multiarch/strncpy-or-cat-overflow-def.h   |   65 +
> > >  7 files changed, 2100 insertions(+), 1173 deletions(-)
> > >  create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> > >
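
For context on item 1 above (overlapping stores): when the length is known to
be in [N, 2N] CHARs, one store from the start of the buffer plus one store
ending at the last CHAR cover the whole range with no branch on the exact
size.  That is what the L(copy_8_15) / L(copy_16_31) blocks below do with
vmovq / vmovdqu.  A minimal C sketch of the idea (illustrative only; the
helper name is made up):

    #include <stdint.h>
    #include <string.h>

    /* Copy n bytes, 8 <= n <= 16, with two possibly-overlapping 8-byte
       moves instead of a branch per size class.  */
    static void
    copy_8_16 (char *dst, const char *src, size_t n)
    {
      uint64_t head, tail;
      memcpy (&head, src, 8);           /* First 8 bytes.  */
      memcpy (&tail, src + n - 8, 8);   /* Last 8 bytes, may overlap head.  */
      memcpy (dst, &head, 8);
      memcpy (dst + n - 8, &tail, 8);
    }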
> > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> > > index 99ea76a372..3693491baa 100644
> > > --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
> > > +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> > > @@ -3,6 +3,5 @@
> > >  #endif
> > >
> > >  #define USE_AS_STPCPY
> > > -#define USE_AS_STRNCPY
> > > -#define STRCPY       STPNCPY
> > > -#include "strcpy-evex.S"
> > > +#define STRNCPY      STPNCPY
> > > +#include "strncpy-evex.S"
> > > diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
> > > index 0e2df947e9..b4207b7889 100644
> > > --- a/sysdeps/x86_64/multiarch/strcat-evex.S
> > > +++ b/sysdeps/x86_64/multiarch/strcat-evex.S
> > > @@ -1,286 +1,7 @@
> > > -/* strcat with 256-bit EVEX instructions.
> > > -   Copyright (C) 2021-2022 Free Software Foundation, Inc.
> > > -   This file is part of the GNU C Library.
> > > -
> > > -   The GNU C Library is free software; you can redistribute it and/or
> > > -   modify it under the terms of the GNU Lesser General Public
> > > -   License as published by the Free Software Foundation; either
> > > -   version 2.1 of the License, or (at your option) any later version.
> > > -
> > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > -   Lesser General Public License for more details.
> > > -
> > > -   You should have received a copy of the GNU Lesser General Public
> > > -   License along with the GNU C Library; if not, see
> > > -   <https://www.gnu.org/licenses/>.  */
> > > -
> > > -#include <isa-level.h>
> > > -
> > > -#if ISA_SHOULD_BUILD (4)
> > > -
> > > -
> > > -# include <sysdep.h>
> > > -
> > > -# ifndef STRCAT
> > > -#  define STRCAT  __strcat_evex
> > > -# endif
> > > -
> > > -# define VMOVU               vmovdqu64
> > > -# define VMOVA               vmovdqa64
> > > -
> > > -/* zero register */
> > > -# define XMMZERO     xmm16
> > > -# define YMMZERO     ymm16
> > > -# define YMM0                ymm17
> > > -# define YMM1                ymm18
> > > -
> > > -# define USE_AS_STRCAT
> > > -
> > > -/* Number of bytes in a vector register */
> > > -# define VEC_SIZE    32
> > > -
> > > -     .section .text.evex,"ax",@progbits
> > > -ENTRY (STRCAT)
> > > -     mov     %rdi, %r9
> > > -# ifdef USE_AS_STRNCAT
> > > -     mov     %rdx, %r8
> > > -# endif
> > > -
> > > -     xor     %eax, %eax
> > > -     mov     %edi, %ecx
> > > -     and     $((VEC_SIZE * 4) - 1), %ecx
> > > -     vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> > > -     cmp     $(VEC_SIZE * 3), %ecx
> > > -     ja      L(fourth_vector_boundary)
> > > -     vpcmpb  $0, (%rdi), %YMMZERO, %k0
> > > -     kmovd   %k0, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_first_vector)
> > > -     mov     %rdi, %rax
> > > -     and     $-VEC_SIZE, %rax
> > > -     jmp     L(align_vec_size_start)
> > > -L(fourth_vector_boundary):
> > > -     mov     %rdi, %rax
> > > -     and     $-VEC_SIZE, %rax
> > > -     vpcmpb  $0, (%rax), %YMMZERO, %k0
> > > -     mov     $-1, %r10d
> > > -     sub     %rax, %rcx
> > > -     shl     %cl, %r10d
> > > -     kmovd   %k0, %edx
> > > -     and     %r10d, %edx
> > > -     jnz     L(exit)
> > > -
> > > -L(align_vec_size_start):
> > > -     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> > > -     kmovd   %k0, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_second_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > > -     kmovd   %k1, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_third_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > > -     kmovd   %k2, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_fourth_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > > -     kmovd   %k3, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_fifth_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > > -     add     $(VEC_SIZE * 4), %rax
> > > -     kmovd   %k4, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_second_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > > -     kmovd   %k1, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_third_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > > -     kmovd   %k2, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_fourth_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > > -     kmovd   %k3, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_fifth_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > > -     kmovd   %k4, %edx
> > > -     add     $(VEC_SIZE * 4), %rax
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_second_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > > -     kmovd   %k1, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_third_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > > -     kmovd   %k2, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_fourth_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > > -     kmovd   %k3, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_fifth_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > > -     add     $(VEC_SIZE * 4), %rax
> > > -     kmovd   %k4, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_second_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > > -     kmovd   %k1, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_third_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > > -     kmovd   %k2, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_fourth_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > > -     kmovd   %k3, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_fifth_vector)
> > > -
> > > -     test    $((VEC_SIZE * 4) - 1), %rax
> > > -     jz      L(align_four_vec_loop)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > > -     add     $(VEC_SIZE * 5), %rax
> > > -     kmovd   %k4, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit)
> > > -
> > > -     test    $((VEC_SIZE * 4) - 1), %rax
> > > -     jz      L(align_four_vec_loop)
> > > -
> > > -     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> > > -     add     $VEC_SIZE, %rax
> > > -     kmovd   %k0, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit)
> > > -
> > > -     test    $((VEC_SIZE * 4) - 1), %rax
> > > -     jz      L(align_four_vec_loop)
> > > -
> > > -     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> > > -     add     $VEC_SIZE, %rax
> > > -     kmovd   %k0, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit)
> > > -
> > > -     test    $((VEC_SIZE * 4) - 1), %rax
> > > -     jz      L(align_four_vec_loop)
> > > -
> > > -     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k1
> > > -     add     $VEC_SIZE, %rax
> > > -     kmovd   %k1, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit)
> > > -
> > > -     add     $VEC_SIZE, %rax
> > > -
> > > -     .p2align 4
> > > -L(align_four_vec_loop):
> > > -     VMOVA   (%rax), %YMM0
> > > -     VMOVA   (VEC_SIZE * 2)(%rax), %YMM1
> > > -     vpminub VEC_SIZE(%rax), %YMM0, %YMM0
> > > -     vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
> > > -     vpminub %YMM0, %YMM1, %YMM0
> > > -     /* If K0 != 0, there is a null byte.  */
> > > -     vpcmpb  $0, %YMM0, %YMMZERO, %k0
> > > -     add     $(VEC_SIZE * 4), %rax
> > > -     ktestd  %k0, %k0
> > > -     jz      L(align_four_vec_loop)
> > > -
> > > -     vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
> > > -     sub     $(VEC_SIZE * 5), %rax
> > > -     kmovd   %k0, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_second_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > > -     kmovd   %k1, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_third_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > > -     kmovd   %k2, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(exit_null_on_fourth_vector)
> > > -
> > > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > > -     kmovd   %k3, %edx
> > > -     sub     %rdi, %rax
> > > -     bsf     %rdx, %rdx
> > > -     add     %rdx, %rax
> > > -     add     $(VEC_SIZE * 4), %rax
> > > -     jmp     L(StartStrcpyPart)
> > > -
> > > -     .p2align 4
> > > -L(exit):
> > > -     sub     %rdi, %rax
> > > -L(exit_null_on_first_vector):
> > > -     bsf     %rdx, %rdx
> > > -     add     %rdx, %rax
> > > -     jmp     L(StartStrcpyPart)
> > > -
> > > -     .p2align 4
> > > -L(exit_null_on_second_vector):
> > > -     sub     %rdi, %rax
> > > -     bsf     %rdx, %rdx
> > > -     add     %rdx, %rax
> > > -     add     $VEC_SIZE, %rax
> > > -     jmp     L(StartStrcpyPart)
> > > -
> > > -     .p2align 4
> > > -L(exit_null_on_third_vector):
> > > -     sub     %rdi, %rax
> > > -     bsf     %rdx, %rdx
> > > -     add     %rdx, %rax
> > > -     add     $(VEC_SIZE * 2), %rax
> > > -     jmp     L(StartStrcpyPart)
> > > -
> > > -     .p2align 4
> > > -L(exit_null_on_fourth_vector):
> > > -     sub     %rdi, %rax
> > > -     bsf     %rdx, %rdx
> > > -     add     %rdx, %rax
> > > -     add     $(VEC_SIZE * 3), %rax
> > > -     jmp     L(StartStrcpyPart)
> > > -
> > > -     .p2align 4
> > > -L(exit_null_on_fifth_vector):
> > > -     sub     %rdi, %rax
> > > -     bsf     %rdx, %rdx
> > > -     add     %rdx, %rax
> > > -     add     $(VEC_SIZE * 4), %rax
> > > -
> > > -     .p2align 4
> > > -L(StartStrcpyPart):
> > > -     lea     (%r9, %rax), %rdi
> > > -     mov     %rsi, %rcx
> > > -     mov     %r9, %rax      /* save result */
> > > -
> > > -# ifdef USE_AS_STRNCAT
> > > -     test    %r8, %r8
> > > -     jz      L(ExitZero)
> > > -#  define USE_AS_STRNCPY
> > > -# endif
> > > -
> > > -# include "strcpy-evex.S"
> > > +#ifndef STRCAT
> > > +# define STRCAT      __strcat_evex
> > >  #endif
> > > +
> > > +#define USE_AS_STRCAT
> > > +#define STRCPY       STRCAT
> > > +#include "strcpy-evex.S"
> > > diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> > > new file mode 100644
> > > index 0000000000..9530d7b683
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> > > @@ -0,0 +1,110 @@
> > > +/* strlen used for beginning of str{n}cat using EVEX 256/512.
> > > +   Copyright (C) 2011-2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +
> > > +/* NOTE: This file is meant to be included by strcat-evex or
> > > +   strncat-evex and does not stand alone.  Before including, %rdi
> > > +   must be saved in %rax.  */
> >
> > Since this file isn't standalone, please rename it to .h.
>
> Can it be .h.S so it plays well with IDE modes?

It sounds reasonable.

> >
> > > +
> > > +
> > > +/* Simple strlen implementation that ends at
> > > +   L(strcat_strlen_done).  */
> > > +     vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> > > +     movq    %rdi, %r8
> > > +     andq    $(VEC_SIZE * -1), %r8
> > > +     VPCMPEQ (%r8), %VZERO, %k0
> > > +     KMOV    %k0, %VRCX
> > > +#ifdef USE_AS_WCSCPY
> > > +     subl    %r8d, %edi
> > > +     shrl    $2, %edi
> > > +#endif
> > > +     shrx    %VRDI, %VRCX, %VRCX
> > > +#ifdef USE_AS_WCSCPY
> > > +     movq    %rax, %rdi
> > > +#endif
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(bsf_and_done_v0)
> > > +
> > > +
> > > +     VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0
> > > +     KMOV    %k0, %VRCX
> > > +     leaq    (VEC_SIZE)(%r8), %rdi
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(bsf_and_done_v0)
> > > +
> > > +     VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(bsf_and_done_v1)
> > > +
> > > +     VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(bsf_and_done_v2)
> > > +
> > > +     VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(bsf_and_done_v3)
> > > +
> > > +     andq    $-(VEC_SIZE * 4), %rdi
> > > +     .p2align 4,, 8
> > > +L(loop_2x_vec):
> > > +     VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(0)
> > > +     VPMIN   (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
> > > +     VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(2)
> > > +     VPMIN   (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
> > > +     VPTESTN %VMM(1), %VMM(1), %k1
> > > +     VPTESTN %VMM(3), %VMM(3), %k3
> > > +     subq    $(VEC_SIZE * -4), %rdi
> > > +     KORTEST %k1, %k3
> > > +     jz      L(loop_2x_vec)
> > > +
> > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(bsf_and_done_v0)
> > > +
> > > +     KMOV    %k1, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(bsf_and_done_v1)
> > > +
> > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(bsf_and_done_v2)
> > > +
> > > +     KMOV    %k3, %VRCX
> > > +L(bsf_and_done_v3):
> > > +     addq    $VEC_SIZE, %rdi
> > > +L(bsf_and_done_v2):
> > > +     bsf     %VRCX, %VRCX
> > > +     leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
> > > +     jmp     L(strcat_strlen_done)
> > > +
> > > +     .p2align 4,, 4
> > > +L(bsf_and_done_v1):
> > > +     addq    $VEC_SIZE, %rdi
> > > +L(bsf_and_done_v0):
> > > +     bsf     %VRCX, %VRCX
> > > +#ifdef USE_AS_WCSCPY
> > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > +#else
> > > +     addq    %rcx, %rdi
> > > +#endif
> > > +L(strcat_strlen_done):
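
The contract of this included block is just "advance %rdi past the existing
string, with the original destination already saved in %rax", so the strcat
entry points reduce to the classic composition.  A reference-only C sketch of
what the strcat-evex.S wrapper plus this prologue amount to (strcat_ref is a
made-up name):

    #include <string.h>

    static char *
    strcat_ref (char *dst, const char *src)
    {
      char *end = dst + strlen (dst);   /* This block leaves 'end' in %rdi.  */
      strcpy (end, src);                /* The strcpy-evex.S body copies.  */
      return dst;                       /* Original dst preserved in %rax.  */
    }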
> > > diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
> > > index 82e45ac675..1ba0195ed2 100644
> > > --- a/sysdeps/x86_64/multiarch/strcpy-evex.S
> > > +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
> > > @@ -1,4 +1,4 @@
> > > -/* strcpy with 256-bit EVEX instructions.
> > > +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
> > >     Copyright (C) 2021-2022 Free Software Foundation, Inc.
> > >     This file is part of the GNU C Library.
> > >
> > > @@ -17,990 +17,526 @@
> > >     <https://www.gnu.org/licenses/>.  */
> > >
> > >  #include <isa-level.h>
> > > -
> > >  #if ISA_SHOULD_BUILD (4)
> > >
> > >
> > > -# ifndef USE_AS_STRCAT
> > > -#  include <sysdep.h>
> > > +     /* Use evex-masked stores for small sizes. Turned off at the
> > > +        moment.  */
> > > +# define USE_EVEX_MASKED_STORE       0
> > > +     /* Use movsb in page cross case to save code size.  */
> > > +# define USE_MOVSB_IN_PAGE_CROSS     1
> > >
> > > -#  ifndef STRCPY
> > > -#   define STRCPY  __strcpy_evex
> > > -#  endif
> > > +# include <sysdep.h>
> > >
> > > +# ifndef VEC_SIZE
> > > +#  include "x86-evex256-vecs.h"
> > >  # endif
> > >
> > > -# define VMOVU               vmovdqu64
> > > -# define VMOVA               vmovdqa64
> > > -
> > > -/* Number of bytes in a vector register */
> > > -# ifndef VEC_SIZE
> > > -#  define VEC_SIZE   32
> > > +# ifndef STRCPY
> > > +#  define STRCPY     __strcpy_evex
> > >  # endif
> > >
> > > -# define XMM2                xmm18
> > > -# define XMM3                xmm19
> > >
> > > -# define YMM2                ymm18
> > > -# define YMM3                ymm19
> > > -# define YMM4                ymm20
> > > -# define YMM5                ymm21
> > > -# define YMM6                ymm22
> > > -# define YMM7                ymm23
> > > +# ifdef USE_AS_WCSCPY
> > > +#  define VMOVU_MASK vmovdqu32
> > > +#  define VPMIN      vpminud
> > > +#  define VPTESTN    vptestnmd
> > > +#  define VPTEST     vptestmd
> > > +#  define VPCMPEQ    vpcmpeqd
> > > +#  define CHAR_SIZE  4
> > >
> > > -# ifndef USE_AS_STRCAT
> > > +#  define REP_MOVS   rep movsd
> > >
> > > -/* zero register */
> > > -#  define XMMZERO    xmm16
> > > -#  define YMMZERO    ymm16
> > > -#  define YMM1               ymm17
> > > -
> > > -     .section .text.evex,"ax",@progbits
> > > -ENTRY (STRCPY)
> > > -#  ifdef USE_AS_STRNCPY
> > > -     mov     %RDX_LP, %R8_LP
> > > -     test    %R8_LP, %R8_LP
> > > -     jz      L(ExitZero)
> > > -#  endif
> > > -     mov     %rsi, %rcx
> > > -#  ifndef USE_AS_STPCPY
> > > -     mov     %rdi, %rax      /* save result */
> > > -#  endif
> > > +#  define USE_WIDE_CHAR
> > > +# else
> > > +#  define VMOVU_MASK vmovdqu8
> > > +#  define VPMIN      vpminub
> > > +#  define VPTESTN    vptestnmb
> > > +#  define VPTEST     vptestmb
> > > +#  define VPCMPEQ    vpcmpeqb
> > > +#  define CHAR_SIZE  1
> > >
> > > -     vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> > > +#  define REP_MOVS   rep movsb
> > >  # endif
> > >
> > > -     and     $((VEC_SIZE * 4) - 1), %ecx
> > > -     cmp     $(VEC_SIZE * 2), %ecx
> > > -     jbe     L(SourceStringAlignmentLessTwoVecSize)
> > > -
> > > -     and     $-VEC_SIZE, %rsi
> > > -     and     $(VEC_SIZE - 1), %ecx
> > > -
> > > -     vpcmpb  $0, (%rsi), %YMMZERO, %k0
> > > -     kmovd   %k0, %edx
> > > -     shr     %cl, %rdx
> > > +# include "reg-macros.h"
> > >
> > > -# ifdef USE_AS_STRNCPY
> > > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > > -     mov     $VEC_SIZE, %r10
> > > -     sub     %rcx, %r10
> > > -     cmp     %r10, %r8
> > > -#  else
> > > -     mov     $(VEC_SIZE + 1), %r10
> > > -     sub     %rcx, %r10
> > > -     cmp     %r10, %r8
> > > -#  endif
> > > -     jbe     L(CopyVecSizeTailCase2OrCase3)
> > > -# endif
> > > -     test    %edx, %edx
> > > -     jnz     L(CopyVecSizeTail)
> > > -
> > > -     vpcmpb  $0, VEC_SIZE(%rsi), %YMMZERO, %k1
> > > -     kmovd   %k1, %edx
> > >
> > > -# ifdef USE_AS_STRNCPY
> > > -     add     $VEC_SIZE, %r10
> > > -     cmp     %r10, %r8
> > > -     jbe     L(CopyTwoVecSizeCase2OrCase3)
> > > -# endif
> > > -     test    %edx, %edx
> > > -     jnz     L(CopyTwoVecSize)
> > > -
> > > -     VMOVU   (%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
> > > -     VMOVU   %YMM2, (%rdi)
> > > -
> > > -/* If source address alignment != destination address alignment */
> > > -     .p2align 4
> > > -L(UnalignVecSizeBoth):
> > > -     sub     %rcx, %rdi
> > > -# ifdef USE_AS_STRNCPY
> > > -     add     %rcx, %r8
> > > -     sbb     %rcx, %rcx
> > > -     or      %rcx, %r8
> > > -# endif
> > > -     mov     $VEC_SIZE, %rcx
> > > -     VMOVA   (%rsi, %rcx), %YMM2
> > > -     VMOVU   %YMM2, (%rdi, %rcx)
> > > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> > > -     vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > > -     kmovd   %k0, %edx
> > > -     add     $VEC_SIZE, %rcx
> > > -# ifdef USE_AS_STRNCPY
> > > -     sub     $(VEC_SIZE * 3), %r8
> > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > -# endif
> > > -     test    %edx, %edx
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     jnz     L(CopyVecSizeUnalignedVec2)
> > > +# ifdef USE_AS_STPCPY
> > > +#  define END_REG    rax
> > >  # else
> > > -     jnz     L(CopyVecSize)
> > > +#  define END_REG    rdi, %rdx, CHAR_SIZE
> > >  # endif
> > >
> > > -     VMOVU   %YMM2, (%rdi, %rcx)
> > > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM3
> > > -     vpcmpb  $0, %YMM3, %YMMZERO, %k0
> > > -     kmovd   %k0, %edx
> > > -     add     $VEC_SIZE, %rcx
> > > -# ifdef USE_AS_STRNCPY
> > > -     sub     $VEC_SIZE, %r8
> > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > -# endif
> > > -     test    %edx, %edx
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     jnz     L(CopyVecSizeUnalignedVec3)
> > > +# ifdef USE_AS_STRCAT
> > > +#  define PAGE_ALIGN_REG     edx
> > > +#  define PAGE_ALIGN_REG_64  rdx
> > >  # else
> > > -     jnz     L(CopyVecSize)
> > > +#  define PAGE_ALIGN_REG     eax
> > > +#  define PAGE_ALIGN_REG_64  rax
> > >  # endif
> > >
> > > -     VMOVU   %YMM3, (%rdi, %rcx)
> > > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM4
> > > -     vpcmpb  $0, %YMM4, %YMMZERO, %k0
> > > -     kmovd   %k0, %edx
> > > -     add     $VEC_SIZE, %rcx
> > > -# ifdef USE_AS_STRNCPY
> > > -     sub     $VEC_SIZE, %r8
> > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > -# endif
> > > -     test    %edx, %edx
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     jnz     L(CopyVecSizeUnalignedVec4)
> > > -# else
> > > -     jnz     L(CopyVecSize)
> > > -# endif
> > > +# define VZERO       VMM(7)
> > > +# define VZERO_128   VMM_128(7)
> > >
> > > -     VMOVU   %YMM4, (%rdi, %rcx)
> > > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> > > -     vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > > -     kmovd   %k0, %edx
> > > -     add     $VEC_SIZE, %rcx
> > > -# ifdef USE_AS_STRNCPY
> > > -     sub     $VEC_SIZE, %r8
> > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > -# endif
> > > -     test    %edx, %edx
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     jnz     L(CopyVecSizeUnalignedVec2)
> > > -# else
> > > -     jnz     L(CopyVecSize)
> > > -# endif
> > >
> > > -     VMOVU   %YMM2, (%rdi, %rcx)
> > > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> > > -     vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > > -     kmovd   %k0, %edx
> > > -     add     $VEC_SIZE, %rcx
> > > -# ifdef USE_AS_STRNCPY
> > > -     sub     $VEC_SIZE, %r8
> > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > -# endif
> > > -     test    %edx, %edx
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     jnz     L(CopyVecSizeUnalignedVec2)
> > > -# else
> > > -     jnz     L(CopyVecSize)
> > > -# endif
> > > +# define PAGE_SIZE   4096
> > > +# define CHAR_PER_VEC        (VEC_SIZE / CHAR_SIZE)
> > >
> > > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM3
> > > -     VMOVU   %YMM2, (%rdi, %rcx)
> > > -     vpcmpb  $0, %YMM3, %YMMZERO, %k0
> > > -     kmovd   %k0, %edx
> > > -     add     $VEC_SIZE, %rcx
> > > -# ifdef USE_AS_STRNCPY
> > > -     sub     $VEC_SIZE, %r8
> > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > -# endif
> > > -     test    %edx, %edx
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     jnz     L(CopyVecSizeUnalignedVec3)
> > > -# else
> > > -     jnz     L(CopyVecSize)
> > > -# endif
> > >
> > > -     VMOVU   %YMM3, (%rdi, %rcx)
> > > -     mov     %rsi, %rdx
> > > -     lea     VEC_SIZE(%rsi, %rcx), %rsi
> > > -     and     $-(VEC_SIZE * 4), %rsi
> > > -     sub     %rsi, %rdx
> > > -     sub     %rdx, %rdi
> > > -# ifdef USE_AS_STRNCPY
> > > -     lea     (VEC_SIZE * 8)(%r8, %rdx), %r8
> > > -# endif
> > > -L(UnalignedFourVecSizeLoop):
> > > -     VMOVA   (%rsi), %YMM4
> > > -     VMOVA   VEC_SIZE(%rsi), %YMM5
> > > -     VMOVA   (VEC_SIZE * 2)(%rsi), %YMM6
> > > -     VMOVA   (VEC_SIZE * 3)(%rsi), %YMM7
> > > -     vpminub %YMM5, %YMM4, %YMM2
> > > -     vpminub %YMM7, %YMM6, %YMM3
> > > -     vpminub %YMM2, %YMM3, %YMM2
> > > -     /* If K7 != 0, there is a null byte.  */
> > > -     vpcmpb  $0, %YMM2, %YMMZERO, %k7
> > > -     kmovd   %k7, %edx
> > > -# ifdef USE_AS_STRNCPY
> > > -     sub     $(VEC_SIZE * 4), %r8
> > > -     jbe     L(UnalignedLeaveCase2OrCase3)
> > > +     .section SECTION(.text), "ax", @progbits
> > > +ENTRY(STRCPY)
> > > +# ifdef USE_AS_STRCAT
> > > +     movq    %rdi, %rax
> > > +#  include "strcat-strlen-evex.S"
> > >  # endif
> > > -     test    %edx, %edx
> > > -     jnz     L(UnalignedFourVecSizeLeave)
> > > -
> > > -L(UnalignedFourVecSizeLoop_start):
> > > -     add     $(VEC_SIZE * 4), %rdi
> > > -     add     $(VEC_SIZE * 4), %rsi
> > > -     VMOVU   %YMM4, -(VEC_SIZE * 4)(%rdi)
> > > -     VMOVA   (%rsi), %YMM4
> > > -     VMOVU   %YMM5, -(VEC_SIZE * 3)(%rdi)
> > > -     VMOVA   VEC_SIZE(%rsi), %YMM5
> > > -     vpminub %YMM5, %YMM4, %YMM2
> > > -     VMOVU   %YMM6, -(VEC_SIZE * 2)(%rdi)
> > > -     VMOVA   (VEC_SIZE * 2)(%rsi), %YMM6
> > > -     VMOVU   %YMM7, -VEC_SIZE(%rdi)
> > > -     VMOVA   (VEC_SIZE * 3)(%rsi), %YMM7
> > > -     vpminub %YMM7, %YMM6, %YMM3
> > > -     vpminub %YMM2, %YMM3, %YMM2
> > > -     /* If K7 != 0, there is a null byte.  */
> > > -     vpcmpb  $0, %YMM2, %YMMZERO, %k7
> > > -     kmovd   %k7, %edx
> > > -# ifdef USE_AS_STRNCPY
> > > -     sub     $(VEC_SIZE * 4), %r8
> > > -     jbe     L(UnalignedLeaveCase2OrCase3)
> > > +
> > > +     movl    %esi, %PAGE_ALIGN_REG
> > > +     andl    $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> > > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> > > +     ja      L(page_cross)
> > > +L(page_cross_continue):
> > > +     VMOVU   (%rsi), %VMM(0)
> > > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > > +     movq    %rdi, %rax
> > >  # endif
> > > -     test    %edx, %edx
> > > -     jz      L(UnalignedFourVecSizeLoop_start)
> > >
> > > -L(UnalignedFourVecSizeLeave):
> > > -     vpcmpb  $0, %YMM4, %YMMZERO, %k1
> > > -     kmovd   %k1, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(CopyVecSizeUnaligned_0)
> > >
> > > -     vpcmpb  $0, %YMM5, %YMMZERO, %k2
> > > -     kmovd   %k2, %ecx
> > > -     test    %ecx, %ecx
> > > -     jnz     L(CopyVecSizeUnaligned_16)
> > > +	/* Two short-string implementations. One with a traditional
> > > +	   branching approach and one with masked instructions (which
> > > +	   have the potential for dramatically bad perf if dst splits a
> > > +	   page and is not in the TLB).  */
> > > +# if USE_EVEX_MASKED_STORE
> > > +     VPTEST  %VMM(0), %VMM(0), %k0
> > > +     KMOV    %k0, %VRCX
> > > +#  ifdef USE_AS_WCSCPY
> > > +     subl    $((1 << CHAR_PER_VEC)- 1), %VRCX
> > > +#  else
> > > +     inc     %VRCX
> > > +#  endif
> > > +     jz      L(more_1x_vec)
> > > +     KMOV    %VRCX, %k1
> > > +     KXOR    %k0, %k1, %k1
> > >
> > > -     vpcmpb  $0, %YMM6, %YMMZERO, %k3
> > > -     kmovd   %k3, %edx
> > > -     test    %edx, %edx
> > > -     jnz     L(CopyVecSizeUnaligned_32)
> > > -
> > > -     vpcmpb  $0, %YMM7, %YMMZERO, %k4
> > > -     kmovd   %k4, %ecx
> > > -     bsf     %ecx, %edx
> > > -     VMOVU   %YMM4, (%rdi)
> > > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > > -     VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -# ifdef USE_AS_STPCPY
> > > -     lea     (VEC_SIZE * 3)(%rdi, %rdx), %rax
> > > -# endif
> > > -     VMOVU   %YMM7, (VEC_SIZE * 3)(%rdi)
> > > -     add     $(VEC_SIZE - 1), %r8
> > > -     sub     %rdx, %r8
> > > -     lea     ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> > > -     jmp     L(StrncpyFillTailWithZero)
> > > -# else
> > > -     add     $(VEC_SIZE * 3), %rsi
> > > -     add     $(VEC_SIZE * 3), %rdi
> > > -     jmp     L(CopyVecSizeExit)
> > > -# endif
> > > +     VMOVU_MASK %VMM(0), (%rdi){%k1}
> > >
> > > -/* If source address alignment == destination address alignment */
> > > +#  ifdef USE_AS_STPCPY
> > > +     bsf     %VRCX, %VRCX
> > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rax
> > > +#  endif
> > > +     ret
> > >
> > > -L(SourceStringAlignmentLessTwoVecSize):
> > > -     VMOVU   (%rsi), %YMM3
> > > -     VMOVU   VEC_SIZE(%rsi), %YMM2
> > > -     vpcmpb  $0, %YMM3, %YMMZERO, %k0
> > > -     kmovd   %k0, %edx
> > > +# else
> > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jz      L(more_1x_vec)
> > >
> > > -# ifdef USE_AS_STRNCPY
> > > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > > -     cmp     $VEC_SIZE, %r8
> > > +     xorl    %edx, %edx
> > > +     bsf     %VRCX, %VRDX
> > > +#  ifdef USE_AS_STPCPY
> > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > +#  endif
> > > +
> > > +     /* Use mask bits in rcx to detect which copy we need. If the low
> > > +        mask is zero then there must be a bit set in the upper half.
> > > +	   I.e. if rcx != 0 and ecx == 0, then the match must be in the
> > > +	   upper 32 bits, so we use L(copy_32_63).  */
> > > +#  if VEC_SIZE == 64
> > > +#   ifdef USE_AS_WCSCPY
> > > +     testb   %cl, %cl
> > > +#   else
> > > +     testl   %ecx, %ecx
> > > +#   endif
> > > +     jz      L(copy_32_63)
> > > +#  endif
> > > +
> > > +#  ifdef USE_AS_WCSCPY
> > > +     testb   $0xf, %cl
> > >  #  else
> > > -     cmp     $(VEC_SIZE + 1), %r8
> > > +     testw   %cx, %cx
> > >  #  endif
> > > -     jbe     L(CopyVecSizeTail1Case2OrCase3)
> > > -# endif
> > > -     test    %edx, %edx
> > > -     jnz     L(CopyVecSizeTail1)
> > > +     jz      L(copy_16_31)
> > >
> > > -     VMOVU   %YMM3, (%rdi)
> > > -     vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > > -     kmovd   %k0, %edx
> > >
> > > -# ifdef USE_AS_STRNCPY
> > > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > > -     cmp     $(VEC_SIZE * 2), %r8
> > > +#  ifdef USE_AS_WCSCPY
> > > +     testb   $0x3, %cl
> > >  #  else
> > > -     cmp     $((VEC_SIZE * 2) + 1), %r8
> > > +     testb   %cl, %cl
> > >  #  endif
> > > -     jbe     L(CopyTwoVecSize1Case2OrCase3)
> > > -# endif
> > > -     test    %edx, %edx
> > > -     jnz     L(CopyTwoVecSize1)
> > > -
> > > -     and     $-VEC_SIZE, %rsi
> > > -     and     $(VEC_SIZE - 1), %ecx
> > > -     jmp     L(UnalignVecSizeBoth)
> > > +     jz      L(copy_8_15)
> > >
> > > -/*------End of main part with loops---------------------*/
> > >
> > > -/* Case1 */
> > > +#  ifdef USE_AS_WCSCPY
> > > +     vmovd   %VMM_128(0), (%rdi)
> > > +	/* No need to copy, we know it's zero.  */
> > > +     movl    $0, (%END_REG)
> > >
> > > -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> > > -     .p2align 4
> > > -L(CopyVecSize):
> > > -     add     %rcx, %rdi
> > > -# endif
> > > -L(CopyVecSizeTail):
> > > -     add     %rcx, %rsi
> > > -L(CopyVecSizeTail1):
> > > -     bsf     %edx, %edx
> > > -L(CopyVecSizeExit):
> > > -     cmp     $32, %edx
> > > -     jae     L(Exit32_63)
> > > -     cmp     $16, %edx
> > > -     jae     L(Exit16_31)
> > > -     cmp     $8, %edx
> > > -     jae     L(Exit8_15)
> > > -     cmp     $4, %edx
> > > -     jae     L(Exit4_7)
> > > -     cmp     $3, %edx
> > > -     je      L(Exit3)
> > > -     cmp     $1, %edx
> > > -     ja      L(Exit2)
> > > -     je      L(Exit1)
> > > -     movb    $0, (%rdi)
> > > -# ifdef USE_AS_STPCPY
> > > -     lea     (%rdi), %rax
> > > -# endif
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     sub     $1, %r8
> > > -     lea     1(%rdi), %rdi
> > > -     jnz     L(StrncpyFillTailWithZero)
> > > -# endif
> > >       ret
> > > +#  else
> > >
> > > -     .p2align 4
> > > -L(CopyTwoVecSize1):
> > > -     add     $VEC_SIZE, %rsi
> > > -     add     $VEC_SIZE, %rdi
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     sub     $VEC_SIZE, %r8
> > > -# endif
> > > -     jmp     L(CopyVecSizeTail1)
> > > -
> > > -     .p2align 4
> > > -L(CopyTwoVecSize):
> > > -     bsf     %edx, %edx
> > > -     add     %rcx, %rsi
> > > -     add     $VEC_SIZE, %edx
> > > -     sub     %ecx, %edx
> > > -     jmp     L(CopyVecSizeExit)
> > > -
> > > -     .p2align 4
> > > -L(CopyVecSizeUnaligned_0):
> > > -     bsf     %edx, %edx
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -# ifdef USE_AS_STPCPY
> > > -     lea     (%rdi, %rdx), %rax
> > > -# endif
> > > -     VMOVU   %YMM4, (%rdi)
> > > -     add     $((VEC_SIZE * 4) - 1), %r8
> > > -     sub     %rdx, %r8
> > > -     lea     1(%rdi, %rdx), %rdi
> > > -     jmp     L(StrncpyFillTailWithZero)
> > > -# else
> > > -     jmp     L(CopyVecSizeExit)
> > > -# endif
> > > +     testb   $0x7, %cl
> > > +     jz      L(copy_4_7)
> > >
> > > -     .p2align 4
> > > -L(CopyVecSizeUnaligned_16):
> > > -     bsf     %ecx, %edx
> > > -     VMOVU   %YMM4, (%rdi)
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -# ifdef USE_AS_STPCPY
> > > -     lea     VEC_SIZE(%rdi, %rdx), %rax
> > > -# endif
> > > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > > -     add     $((VEC_SIZE * 3) - 1), %r8
> > > -     sub     %rdx, %r8
> > > -     lea     (VEC_SIZE + 1)(%rdi, %rdx), %rdi
> > > -     jmp     L(StrncpyFillTailWithZero)
> > > -# else
> > > -     add     $VEC_SIZE, %rsi
> > > -     add     $VEC_SIZE, %rdi
> > > -     jmp     L(CopyVecSizeExit)
> > > -# endif
> > >
> > > -     .p2align 4
> > > -L(CopyVecSizeUnaligned_32):
> > > -     bsf     %edx, %edx
> > > -     VMOVU   %YMM4, (%rdi)
> > > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -# ifdef USE_AS_STPCPY
> > > -     lea     (VEC_SIZE * 2)(%rdi, %rdx), %rax
> > > -# endif
> > > -     VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > > -     add     $((VEC_SIZE * 2) - 1), %r8
> > > -     sub     %rdx, %r8
> > > -     lea     ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> > > -     jmp     L(StrncpyFillTailWithZero)
> > > -# else
> > > -     add     $(VEC_SIZE * 2), %rsi
> > > -     add     $(VEC_SIZE * 2), %rdi
> > > -     jmp     L(CopyVecSizeExit)
> > > -# endif
> > > +     test    %edx, %edx
> > > +     jz      L(set_null_term)
> > >
> > > -# ifdef USE_AS_STRNCPY
> > > -#  ifndef USE_AS_STRCAT
> > > -     .p2align 4
> > > -L(CopyVecSizeUnalignedVec6):
> > > -     VMOVU   %YMM6, (%rdi, %rcx)
> > > -     jmp     L(CopyVecSizeVecExit)
> > > -
> > > -     .p2align 4
> > > -L(CopyVecSizeUnalignedVec5):
> > > -     VMOVU   %YMM5, (%rdi, %rcx)
> > > -     jmp     L(CopyVecSizeVecExit)
> > > -
> > > -     .p2align 4
> > > -L(CopyVecSizeUnalignedVec4):
> > > -     VMOVU   %YMM4, (%rdi, %rcx)
> > > -     jmp     L(CopyVecSizeVecExit)
> > > -
> > > -     .p2align 4
> > > -L(CopyVecSizeUnalignedVec3):
> > > -     VMOVU   %YMM3, (%rdi, %rcx)
> > > -     jmp     L(CopyVecSizeVecExit)
> > > +     /* NB: make this `vmovw` if support for AVX512-FP16 is added.
> > > +      */
> > > +     vmovd   %VMM_128(0), %esi
> > > +     movw    %si, (%rdi)
> > > +
> > > +     .p2align 4,, 1
> > > +L(set_null_term):
> > > +	/* No need to copy, we know it's zero.  */
> > > +     movb    $0, (%END_REG)
> > > +     ret
> > >  #  endif
> > >
> > > -/* Case2 */
> > > -
> > > -     .p2align 4
> > > -L(CopyVecSizeCase2):
> > > -     add     $VEC_SIZE, %r8
> > > -     add     %rcx, %rdi
> > > -     add     %rcx, %rsi
> > > -     bsf     %edx, %edx
> > > -     cmp     %r8d, %edx
> > > -     jb      L(CopyVecSizeExit)
> > > -     jmp     L(StrncpyExit)
> > > -
> > > -     .p2align 4
> > > -L(CopyTwoVecSizeCase2):
> > > -     add     %rcx, %rsi
> > > -     bsf     %edx, %edx
> > > -     add     $VEC_SIZE, %edx
> > > -     sub     %ecx, %edx
> > > -     cmp     %r8d, %edx
> > > -     jb      L(CopyVecSizeExit)
> > > -     jmp     L(StrncpyExit)
> > > -
> > > -L(CopyVecSizeTailCase2):
> > > -     add     %rcx, %rsi
> > > -     bsf     %edx, %edx
> > > -     cmp     %r8d, %edx
> > > -     jb      L(CopyVecSizeExit)
> > > -     jmp     L(StrncpyExit)
> > > -
> > > -L(CopyVecSizeTail1Case2):
> > > -     bsf     %edx, %edx
> > > -     cmp     %r8d, %edx
> > > -     jb      L(CopyVecSizeExit)
> > > -     jmp     L(StrncpyExit)
> > > -
> > > -/* Case2 or Case3,  Case3 */
> > > -
> > > -     .p2align 4
> > > -L(CopyVecSizeCase2OrCase3):
> > > -     test    %rdx, %rdx
> > > -     jnz     L(CopyVecSizeCase2)
> > > -L(CopyVecSizeCase3):
> > > -     add     $VEC_SIZE, %r8
> > > -     add     %rcx, %rdi
> > > -     add     %rcx, %rsi
> > > -     jmp     L(StrncpyExit)
> > > -
> > > -     .p2align 4
> > > -L(CopyTwoVecSizeCase2OrCase3):
> > > -     test    %rdx, %rdx
> > > -     jnz     L(CopyTwoVecSizeCase2)
> > > -     add     %rcx, %rsi
> > > -     jmp     L(StrncpyExit)
> > > -
> > > -     .p2align 4
> > > -L(CopyVecSizeTailCase2OrCase3):
> > > -     test    %rdx, %rdx
> > > -     jnz     L(CopyVecSizeTailCase2)
> > > -     add     %rcx, %rsi
> > > -     jmp     L(StrncpyExit)
> > > -
> > > -     .p2align 4
> > > -L(CopyTwoVecSize1Case2OrCase3):
> > > -     add     $VEC_SIZE, %rdi
> > > -     add     $VEC_SIZE, %rsi
> > > -     sub     $VEC_SIZE, %r8
> > > -L(CopyVecSizeTail1Case2OrCase3):
> > > -     test    %rdx, %rdx
> > > -     jnz     L(CopyVecSizeTail1Case2)
> > > -     jmp     L(StrncpyExit)
> > > +#  if VEC_SIZE == 64
> > > +     .p2align 4,, 6
> > > +L(copy_32_63):
> > > +     VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > > +     VMOVU   %VMM_256(0), (%rdi)
> > > +     VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
> > > +     ret
> > > +#  endif
> > > +
> > > +
> > > +     .p2align 4,, 6
> > > +L(copy_16_31):
> > > +     /* Use xmm1 explicitly here as it won't require a `vzeroupper`
> > > +        and will save code size.  */
> > > +     vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > > +     VMOVU   %VMM_128(0), (%rdi)
> > > +     vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> > > +     ret
> > > +
> > > +     .p2align 4,, 8
> > > +L(copy_8_15):
> > > +#  ifdef USE_AS_WCSCPY
> > > +     movl    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> > > +#  else
> > > +     movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
> > > +#  endif
> > > +     vmovq   %VMM_128(0), (%rdi)
> > > +     movq    %rcx, -(8 - CHAR_SIZE)(%END_REG)
> > > +     ret
> > >  # endif
> > >
> > > -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
> > >
> > > -     .p2align 4
> > > -L(Exit1):
> > > -     movzwl  (%rsi), %edx
> > > -     mov     %dx, (%rdi)
> > > -# ifdef USE_AS_STPCPY
> > > -     lea     1(%rdi), %rax
> > > +# ifndef USE_AS_WCSCPY
> > > +     .p2align 4,, 12
> > > +L(copy_4_7):
> > > +     movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> > > +     vmovd   %VMM_128(0), (%rdi)
> > > +     movl    %ecx, -(4 - CHAR_SIZE)(%END_REG)
> > > +     ret
> > >  # endif
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     sub     $2, %r8
> > > -     lea     2(%rdi), %rdi
> > > -     jnz     L(StrncpyFillTailWithZero)
> > > +
> > > +
> > > +     .p2align 4,, 8
> > > +L(more_1x_vec):
> > > +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > > +     VMOVU   %VMM(0), (%rdi)
> > >  # endif
> > > -     ret
> > > +     subq    %rsi, %rdi
> > > +     andq    $-(VEC_SIZE), %rsi
> > > +     addq    %rsi, %rdi
> > > +     VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> > >
> > > -     .p2align 4
> > > -L(Exit2):
> > > -     movzwl  (%rsi), %ecx
> > > -     mov     %cx, (%rdi)
> > > -     movb    $0, 2(%rdi)
> > > +     /* Ideally we store after moves to minimize impact of potential
> > > +        false-dependencies.  */
> > > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > > +     VMOVU   %VMM(0), (%rax)
> > > +# endif
> > > +
> > > +     VPTESTN %VMM(1), %VMM(1), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(ret_vec_x1)
> > > +
> > > +     VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > > +     VMOVU   %VMM(1), VEC_SIZE(%rdi)
> > > +
> > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(ret_vec_x2)
> > > +
> > > +     VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> > > +     VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > > +
> > > +     VPTESTN %VMM(3), %VMM(3), %k0
> > > +     KMOV    %k0, %VRDX
> > > +     test    %VRDX, %VRDX
> > > +     jnz     L(ret_vec_x3)
> > > +
> > > +     VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > > +     VPTESTN %VMM(4), %VMM(4), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(ret_vec_x4)
> > > +
> > > +     VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> > > +
> > > +
> > > +     /* Align for 4x loop.  */
> > > +     subq    %rsi, %rdi
> > > +
> > > +     /* + VEC_SIZE * 5 because we never added the original VEC_SIZE
> > > +        we covered before aligning.  */
> > > +     subq    $-(VEC_SIZE * 5), %rsi
> > > +     andq    $-(VEC_SIZE * 4), %rsi
> > > +
> > > +
> > > +     /* Load first half of the loop before entry.  */
> > > +     VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > > +
> > > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > > +     VPTESTN %VMM(4), %VMM(4), %k2
> > > +     VPTESTN %VMM(6), %VMM(6), %k4
> > > +     KORTEST %k2, %k4
> > > +     jnz     L(loop_4x_done)
> > > +
> > > +     .p2align 4,, 11
> > > +L(loop_4x_vec):
> > > +
> > > +     VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
> > > +     VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > > +     VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> > > +     VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> > > +
> > > +     subq    $(VEC_SIZE * -4), %rsi
> > > +
> > > +     VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > > +
> > > +
> > > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > > +     VPTESTN %VMM(4), %VMM(4), %k2
> > > +     VPTESTN %VMM(6), %VMM(6), %k4
> > > +     KORTEST %k2, %k4
> > > +     jz      L(loop_4x_vec)
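
As I read this loop, the pairwise VPMIN plus the VPTESTN/KORTEST pair is
what lets all four vectors be checked with a single branch.  A scalar
sketch of that check (my own names, not part of the patch):

#include <stddef.h>

/* min (a, b) == 0 iff a == 0 or b == 0 for unsigned bytes, so two
   VPMIN results cover four vectors and one KORTEST covers both
   VPTESTN masks.  */
static int
block_of_4_has_zero (const unsigned char *s, size_t vec_size)
{
  int any = 0;
  for (size_t i = 0; i < vec_size; i++)
    {
      unsigned char m01 = s[i] < s[vec_size + i] ? s[i] : s[vec_size + i];
      unsigned char m23 = s[2 * vec_size + i] < s[3 * vec_size + i]
                          ? s[2 * vec_size + i] : s[3 * vec_size + i];
      any |= (m01 == 0) | (m23 == 0);   /* VPTESTN mask bits */
    }
  return any;                           /* KORTEST across both masks */
}
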
> > > +
> > > +L(loop_4x_done):
> > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     /* Restore rdi (dst).  */
> > > +     addq    %rsi, %rdi
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(ret_vec_x0_end)
> > > +     VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> > > +
> > > +     KMOV    %k2, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(ret_vec_x1)
> > > +     VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> > > +
> > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(ret_vec_x2)
> > > +     VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> > > +     /* Place L(ret_vec_x4) here to save code size.  We get a
> > > +        meaningful benefit from doing this for stpcpy.  */
> > > +     KMOV    %k4, %VRDX
> > > +L(ret_vec_x3):
> > > +     bsf     %VRDX, %VRDX
> > > +     VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > > +     VMOVU   %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > >  # ifdef USE_AS_STPCPY
> > > -     lea     2(%rdi), %rax
> > > -# endif
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     sub     $3, %r8
> > > -     lea     3(%rdi), %rdi
> > > -     jnz     L(StrncpyFillTailWithZero)
> > > +     leaq    (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
> > >  # endif
> > > +L(return_end):
> > >       ret
> > >
> > > -     .p2align 4
> > > -L(Exit3):
> > > -     mov     (%rsi), %edx
> > > -     mov     %edx, (%rdi)
> > > +     .p2align 4,, 6
> > > +L(ret_vec_x0_end):
> > > +     bsf     %VRCX, %VRCX
> > >  # ifdef USE_AS_STPCPY
> > > -     lea     3(%rdi), %rax
> > > -# endif
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     sub     $4, %r8
> > > -     lea     4(%rdi), %rdi
> > > -     jnz     L(StrncpyFillTailWithZero)
> > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rax
> > >  # endif
> > > +     inc     %VRCX
> > > +     VMOVU   (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > +     VMOVU   %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > >       ret
> > >
> > > -     .p2align 4
> > > -L(Exit4_7):
> > > -     mov     (%rsi), %ecx
> > > -     mov     %ecx, (%rdi)
> > > -     mov     -3(%rsi, %rdx), %ecx
> > > -     mov     %ecx, -3(%rdi, %rdx)
> > > +     .p2align 4,, 8
> > > +L(ret_vec_x1):
> > > +     bsf     %VRCX, %VRCX
> > > +     VMOVU   (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > +     VMOVU   %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > >  # ifdef USE_AS_STPCPY
> > > -     lea     (%rdi, %rdx), %rax
> > > -# endif
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     sub     %rdx, %r8
> > > -     sub     $1, %r8
> > > -     lea     1(%rdi, %rdx), %rdi
> > > -     jnz     L(StrncpyFillTailWithZero)
> > > +     leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
> > >  # endif
> > >       ret
> > >
> > > -     .p2align 4
> > > -L(Exit8_15):
> > > -     mov     (%rsi), %rcx
> > > -     mov     -7(%rsi, %rdx), %r9
> > > -     mov     %rcx, (%rdi)
> > > -     mov     %r9, -7(%rdi, %rdx)
> > > +     .p2align 4,, 4
> > > +L(ret_vec_x2):
> > > +     bsf     %VRCX, %VRCX
> > > +     VMOVU   ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > +     VMOVU   %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > >  # ifdef USE_AS_STPCPY
> > > -     lea     (%rdi, %rdx), %rax
> > > -# endif
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     sub     %rdx, %r8
> > > -     sub     $1, %r8
> > > -     lea     1(%rdi, %rdx), %rdi
> > > -     jnz     L(StrncpyFillTailWithZero)
> > > +     leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
> > >  # endif
> > >       ret
> > >
> > > -     .p2align 4
> > > -L(Exit16_31):
> > > -     VMOVU   (%rsi), %XMM2
> > > -     VMOVU   -15(%rsi, %rdx), %XMM3
> > > -     VMOVU   %XMM2, (%rdi)
> > > -     VMOVU   %XMM3, -15(%rdi, %rdx)
> > > +     /* ret_vec_x3 reuses return code after the loop.  */
> > > +     .p2align 4,, 6
> > > +L(ret_vec_x4):
> > > +     bsf     %VRCX, %VRCX
> > > +     VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > +     VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > >  # ifdef USE_AS_STPCPY
> > > -     lea     (%rdi, %rdx), %rax
> > > -# endif
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     sub %rdx, %r8
> > > -     sub $1, %r8
> > > -     lea 1(%rdi, %rdx), %rdi
> > > -     jnz L(StrncpyFillTailWithZero)
> > > +     leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
> > >  # endif
> > >       ret
> > >
> > > -     .p2align 4
> > > -L(Exit32_63):
> > > -     VMOVU   (%rsi), %YMM2
> > > -     VMOVU   -31(%rsi, %rdx), %YMM3
> > > -     VMOVU   %YMM2, (%rdi)
> > > -     VMOVU   %YMM3, -31(%rdi, %rdx)
> > > -# ifdef USE_AS_STPCPY
> > > -     lea     (%rdi, %rdx), %rax
> > > +
> > > +     .p2align 4,, 4
> > > +L(page_cross):
> > > +# ifndef USE_AS_STRCAT
> > > +     vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> > >  # endif
> > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > -     sub     %rdx, %r8
> > > -     sub     $1, %r8
> > > -     lea     1(%rdi, %rdx), %rdi
> > > -     jnz     L(StrncpyFillTailWithZero)
> > > +     movq    %rsi, %rcx
> > > +     andq    $(VEC_SIZE * -1), %rcx
> > > +
> > > +     VPCMPEQ (%rcx), %VZERO, %k0
> > > +     KMOV    %k0, %VRCX
> > > +# ifdef USE_AS_WCSCPY
> > > +     andl    $(VEC_SIZE - 1), %PAGE_ALIGN_REG
> > > +     shrl    $2, %PAGE_ALIGN_REG
> > >  # endif
> > > -     ret
> > > +     shrx    %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
> > >
> > > -# ifdef USE_AS_STRNCPY
> > > +# if USE_MOVSB_IN_PAGE_CROSS
> > > +     /* Optimizing more aggressively for space as this is very cold
> > > +        code. This saves 2x cache lines.  */
> > >
> > > -     .p2align 4
> > > -L(StrncpyExit1):
> > > -     movzbl  (%rsi), %edx
> > > -     mov     %dl, (%rdi)
> > > -#  ifdef USE_AS_STPCPY
> > > -     lea     1(%rdi), %rax
> > > -#  endif
> > > -#  ifdef USE_AS_STRCAT
> > > -     movb    $0, 1(%rdi)
> > > +     /* This adds one to the later result, which gives the correct
> > > +        copy bounds. NB: this can never zero out a non-zero RCX, as
> > > +        to be in the page-cross case rsi cannot be aligned and we
> > > +        already right-shift rcx by the misalignment.  */
> > > +     shl     %VRCX
> > > +     jz      L(page_cross_continue)
> > > +#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > > +     movq    %rdi, %rax
> > >  #  endif
> > > -     ret
> > > +     bsf     %VRCX, %VRCX
> > > +     REP_MOVS
> > >
> > > -     .p2align 4
> > > -L(StrncpyExit2):
> > > -     movzwl  (%rsi), %edx
> > > -     mov     %dx, (%rdi)
> > >  #  ifdef USE_AS_STPCPY
> > > -     lea     2(%rdi), %rax
> > > -#  endif
> > > -#  ifdef USE_AS_STRCAT
> > > -     movb    $0, 2(%rdi)
> > > +     leaq    -CHAR_SIZE(%rdi), %rax
> > >  #  endif
> > >       ret
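
The shl-then-bsf sequence above took me a moment; a C sketch of how the
rep movs count ends up covering the null terminator (byte variant, my
paraphrase of the comment):

#include <string.h>

/* MASK has one bit per source byte, set where that byte is zero, and
   has already been shifted right by the misalignment of src, so the
   top bit can never be set (see the NB in the patch) and the shift
   below cannot lose information.  */
static void
page_cross_rep_movs (char *dst, const char *src, unsigned long mask)
{
  mask <<= 1;                   /* shl: bumps the later bsf result by one */
  if (mask == 0)                /* no null before the page end */
    return;                     /* i.e. take the L(page_cross_continue) path */
  /* ctz (mask) == index of the null + 1, so the copy includes the
     terminator.  */
  memcpy (dst, src, __builtin_ctzl (mask));     /* rep movsb */
}
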
> > >
> > > -     .p2align 4
> > > -L(StrncpyExit3_4):
> > > -     movzwl  (%rsi), %ecx
> > > -     movzwl  -2(%rsi, %r8), %edx
> > > -     mov     %cx, (%rdi)
> > > -     mov     %dx, -2(%rdi, %r8)
> > > -#  ifdef USE_AS_STPCPY
> > > -     lea     (%rdi, %r8), %rax
> > > -#  endif
> > > -#  ifdef USE_AS_STRCAT
> > > -     movb    $0, (%rdi, %r8)
> > > -#  endif
> > > -     ret
> > >
> > > -     .p2align 4
> > > -L(StrncpyExit5_8):
> > > -     mov     (%rsi), %ecx
> > > -     mov     -4(%rsi, %r8), %edx
> > > -     mov     %ecx, (%rdi)
> > > -     mov     %edx, -4(%rdi, %r8)
> > > -#  ifdef USE_AS_STPCPY
> > > -     lea     (%rdi, %r8), %rax
> > > -#  endif
> > > -#  ifdef USE_AS_STRCAT
> > > -     movb    $0, (%rdi, %r8)
> > > -#  endif
> > > -     ret
> > > +# else
> > > +     /* Check if we found zero-char before end of page.  */
> > > +     test    %VRCX, %VRCX
> > > +     jz      L(page_cross_continue)
> > >
> > > -     .p2align 4
> > > -L(StrncpyExit9_16):
> > > -     mov     (%rsi), %rcx
> > > -     mov     -8(%rsi, %r8), %rdx
> > > -     mov     %rcx, (%rdi)
> > > -     mov     %rdx, -8(%rdi, %r8)
> > > -#  ifdef USE_AS_STPCPY
> > > -     lea     (%rdi, %r8), %rax
> > > -#  endif
> > > -#  ifdef USE_AS_STRCAT
> > > -     movb    $0, (%rdi, %r8)
> > > -#  endif
> > > -     ret
> > > +     /* Traditional copy case, essentially the same as the non-page-
> > > +        cross case, but since we can't reuse VMM(0) we need twice as
> > > +        many loads from rsi.  */
> > >
> > > -     .p2align 4
> > > -L(StrncpyExit17_32):
> > > -     VMOVU   (%rsi), %XMM2
> > > -     VMOVU   -16(%rsi, %r8), %XMM3
> > > -     VMOVU   %XMM2, (%rdi)
> > > -     VMOVU   %XMM3, -16(%rdi, %r8)
> > > -#  ifdef USE_AS_STPCPY
> > > -     lea     (%rdi, %r8), %rax
> > > -#  endif
> > > -#  ifdef USE_AS_STRCAT
> > > -     movb    $0, (%rdi, %r8)
> > > +#  ifndef USE_AS_STRCAT
> > > +     xorl    %edx, %edx
> > >  #  endif
> > > -     ret
> > > -
> > > -     .p2align 4
> > > -L(StrncpyExit33_64):
> > > -     /*  0/32, 31/16 */
> > > -     VMOVU   (%rsi), %YMM2
> > > -     VMOVU   -VEC_SIZE(%rsi, %r8), %YMM3
> > > -     VMOVU   %YMM2, (%rdi)
> > > -     VMOVU   %YMM3, -VEC_SIZE(%rdi, %r8)
> > > +     /* Dependency on rdi must already have been satisfied.  */
> > > +     bsf     %VRCX, %VRDX
> > >  #  ifdef USE_AS_STPCPY
> > > -     lea     (%rdi, %r8), %rax
> > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > +#  elif !defined USE_AS_STRCAT
> > > +     movq    %rdi, %rax
> > >  #  endif
> > > -#  ifdef USE_AS_STRCAT
> > > -     movb    $0, (%rdi, %r8)
> > > -#  endif
> > > -     ret
> > >
> > > -     .p2align 4
> > > -L(StrncpyExit65):
> > > -     /* 0/32, 32/32, 64/1 */
> > > -     VMOVU   (%rsi), %YMM2
> > > -     VMOVU   32(%rsi), %YMM3
> > > -     mov     64(%rsi), %cl
> > > -     VMOVU   %YMM2, (%rdi)
> > > -     VMOVU   %YMM3, 32(%rdi)
> > > -     mov     %cl, 64(%rdi)
> > > -#  ifdef USE_AS_STPCPY
> > > -     lea     65(%rdi), %rax
> > > -#  endif
> > > -#  ifdef USE_AS_STRCAT
> > > -     movb    $0, 65(%rdi)
> > > +#  if VEC_SIZE == 64
> > > +#   ifdef USE_AS_WCSCPY
> > > +     testb   %cl, %cl
> > > +#   else
> > > +     test    %ecx, %ecx
> > > +#   endif
> > > +     jz      L(page_cross_copy_32_63)
> > >  #  endif
> > > -     ret
> > > -
> > > -#  ifndef USE_AS_STRCAT
> > >
> > > -     .p2align 4
> > > -L(Fill1):
> > > -     mov     %dl, (%rdi)
> > > -     ret
> > > +#  ifdef USE_AS_WCSCPY
> > > +     testb   $0xf, %cl
> > > +#  else
> > > +     testw   %cx, %cx
> > > +#  endif
> > > +     jz      L(page_cross_copy_16_31)
> > >
> > > -     .p2align 4
> > > -L(Fill2):
> > > -     mov     %dx, (%rdi)
> > > -     ret
> > > +#  ifdef USE_AS_WCSCPY
> > > +     testb   $0x3, %cl
> > > +#  else
> > > +     testb   %cl, %cl
> > > +#  endif
> > > +     jz      L(page_cross_copy_8_15)
> > >
> > > -     .p2align 4
> > > -L(Fill3_4):
> > > -     mov     %dx, (%rdi)
> > > -     mov     %dx, -2(%rdi, %r8)
> > > +#  ifdef USE_AS_WCSCPY
> > > +     movl    (%rsi), %esi
> > > +     movl    %esi, (%rdi)
> > > +     movl    $0, (%END_REG)
> > >       ret
> > > +#  else
> > >
> > > -     .p2align 4
> > > -L(Fill5_8):
> > > -     mov     %edx, (%rdi)
> > > -     mov     %edx, -4(%rdi, %r8)
> > > -     ret
> > > +     testb   $0x7, %cl
> > > +     jz      L(page_cross_copy_4_7)
> > >
> > > -     .p2align 4
> > > -L(Fill9_16):
> > > -     mov     %rdx, (%rdi)
> > > -     mov     %rdx, -8(%rdi, %r8)
> > > +     test    %edx, %edx
> > > +     jz      L(page_cross_set_null_term)
> > > +     movzwl  (%rsi), %ecx
> > > +     movw    %cx, (%rdi)
> > > +L(page_cross_set_null_term):
> > > +     movb    $0, (%END_REG)
> > >       ret
> > >
> > > -     .p2align 4
> > > -L(Fill17_32):
> > > -     VMOVU   %XMMZERO, (%rdi)
> > > -     VMOVU   %XMMZERO, -16(%rdi, %r8)
> > > -     ret
> > >
> > > -     .p2align 4
> > > -L(CopyVecSizeUnalignedVec2):
> > > -     VMOVU   %YMM2, (%rdi, %rcx)
> > > -
> > > -     .p2align 4
> > > -L(CopyVecSizeVecExit):
> > > -     bsf     %edx, %edx
> > > -     add     $(VEC_SIZE - 1), %r8
> > > -     add     %rcx, %rdi
> > > -#   ifdef USE_AS_STPCPY
> > > -     lea     (%rdi, %rdx), %rax
> > > -#   endif
> > > -     sub     %rdx, %r8
> > > -     lea     1(%rdi, %rdx), %rdi
> > > -
> > > -     .p2align 4
> > > -L(StrncpyFillTailWithZero):
> > > -     xor     %edx, %edx
> > > -     sub     $VEC_SIZE, %r8
> > > -     jbe     L(StrncpyFillExit)
> > > -
> > > -     VMOVU   %YMMZERO, (%rdi)
> > > -     add     $VEC_SIZE, %rdi
> > > -
> > > -     mov     %rdi, %rsi
> > > -     and     $(VEC_SIZE - 1), %esi
> > > -     sub     %rsi, %rdi
> > > -     add     %rsi, %r8
> > > -     sub     $(VEC_SIZE * 4), %r8
> > > -     jb      L(StrncpyFillLessFourVecSize)
> > > -
> > > -L(StrncpyFillLoopVmovdqa):
> > > -     VMOVA   %YMMZERO, (%rdi)
> > > -     VMOVA   %YMMZERO, VEC_SIZE(%rdi)
> > > -     VMOVA   %YMMZERO, (VEC_SIZE * 2)(%rdi)
> > > -     VMOVA   %YMMZERO, (VEC_SIZE * 3)(%rdi)
> > > -     add     $(VEC_SIZE * 4), %rdi
> > > -     sub     $(VEC_SIZE * 4), %r8
> > > -     jae     L(StrncpyFillLoopVmovdqa)
> > > -
> > > -L(StrncpyFillLessFourVecSize):
> > > -     add     $(VEC_SIZE * 2), %r8
> > > -     jl      L(StrncpyFillLessTwoVecSize)
> > > -     VMOVA   %YMMZERO, (%rdi)
> > > -     VMOVA   %YMMZERO, VEC_SIZE(%rdi)
> > > -     add     $(VEC_SIZE * 2), %rdi
> > > -     sub     $VEC_SIZE, %r8
> > > -     jl      L(StrncpyFillExit)
> > > -     VMOVA   %YMMZERO, (%rdi)
> > > -     add     $VEC_SIZE, %rdi
> > > -     jmp     L(Fill)
> > > -
> > > -     .p2align 4
> > > -L(StrncpyFillLessTwoVecSize):
> > > -     add     $VEC_SIZE, %r8
> > > -     jl      L(StrncpyFillExit)
> > > -     VMOVA   %YMMZERO, (%rdi)
> > > -     add     $VEC_SIZE, %rdi
> > > -     jmp     L(Fill)
> > > -
> > > -     .p2align 4
> > > -L(StrncpyFillExit):
> > > -     add     $VEC_SIZE, %r8
> > > -L(Fill):
> > > -     cmp     $17, %r8d
> > > -     jae     L(Fill17_32)
> > > -     cmp     $9, %r8d
> > > -     jae     L(Fill9_16)
> > > -     cmp     $5, %r8d
> > > -     jae     L(Fill5_8)
> > > -     cmp     $3, %r8d
> > > -     jae     L(Fill3_4)
> > > -     cmp     $1, %r8d
> > > -     ja      L(Fill2)
> > > -     je      L(Fill1)
> > > +     .p2align 4,, 4
> > > +L(page_cross_copy_4_7):
> > > +     movl    (%rsi), %ecx
> > > +     movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
> > > +     movl    %ecx, (%rdi)
> > > +     movl    %esi, -(4 - CHAR_SIZE)(%END_REG)
> > >       ret
> > > -
> > > -/* end of ifndef USE_AS_STRCAT */
> > >  #  endif
> > >
> > > -     .p2align 4
> > > -L(UnalignedLeaveCase2OrCase3):
> > > -     test    %rdx, %rdx
> > > -     jnz     L(UnalignedFourVecSizeLeaveCase2)
> > > -L(UnalignedFourVecSizeLeaveCase3):
> > > -     lea     (VEC_SIZE * 4)(%r8), %rcx
> > > -     and     $-VEC_SIZE, %rcx
> > > -     add     $(VEC_SIZE * 3), %r8
> > > -     jl      L(CopyVecSizeCase3)
> > > -     VMOVU   %YMM4, (%rdi)
> > > -     sub     $VEC_SIZE, %r8
> > > -     jb      L(CopyVecSizeCase3)
> > > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > > -     sub     $VEC_SIZE, %r8
> > > -     jb      L(CopyVecSizeCase3)
> > > -     VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > > -     sub     $VEC_SIZE, %r8
> > > -     jb      L(CopyVecSizeCase3)
> > > -     VMOVU   %YMM7, (VEC_SIZE * 3)(%rdi)
> > > -#  ifdef USE_AS_STPCPY
> > > -     lea     (VEC_SIZE * 4)(%rdi), %rax
> > > -#  endif
> > > -#  ifdef USE_AS_STRCAT
> > > -     movb    $0, (VEC_SIZE * 4)(%rdi)
> > > -#  endif
> > > +#  if VEC_SIZE == 64
> > > +     .p2align 4,, 4
> > > +L(page_cross_copy_32_63):
> > > +     VMOVU   (%rsi), %VMM_256(0)
> > > +     VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > > +     VMOVU   %VMM_256(0), (%rdi)
> > > +     VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
> > >       ret
> > > -
> > > -     .p2align 4
> > > -L(UnalignedFourVecSizeLeaveCase2):
> > > -     xor     %ecx, %ecx
> > > -     vpcmpb  $0, %YMM4, %YMMZERO, %k1
> > > -     kmovd   %k1, %edx
> > > -     add     $(VEC_SIZE * 3), %r8
> > > -     jle     L(CopyVecSizeCase2OrCase3)
> > > -     test    %edx, %edx
> > > -#  ifndef USE_AS_STRCAT
> > > -     jnz     L(CopyVecSizeUnalignedVec4)
> > > -#  else
> > > -     jnz     L(CopyVecSize)
> > > -#  endif
> > > -     vpcmpb  $0, %YMM5, %YMMZERO, %k2
> > > -     kmovd   %k2, %edx
> > > -     VMOVU   %YMM4, (%rdi)
> > > -     add     $VEC_SIZE, %rcx
> > > -     sub     $VEC_SIZE, %r8
> > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > -     test    %edx, %edx
> > > -#  ifndef USE_AS_STRCAT
> > > -     jnz     L(CopyVecSizeUnalignedVec5)
> > > -#  else
> > > -     jnz     L(CopyVecSize)
> > >  #  endif
> > >
> > > -     vpcmpb  $0, %YMM6, %YMMZERO, %k3
> > > -     kmovd   %k3, %edx
> > > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > > -     add     $VEC_SIZE, %rcx
> > > -     sub     $VEC_SIZE, %r8
> > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > -     test    %edx, %edx
> > > -#  ifndef USE_AS_STRCAT
> > > -     jnz     L(CopyVecSizeUnalignedVec6)
> > > -#  else
> > > -     jnz     L(CopyVecSize)
> > > -#  endif
> > > -
> > > -     vpcmpb  $0, %YMM7, %YMMZERO, %k4
> > > -     kmovd   %k4, %edx
> > > -     VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > > -     lea     VEC_SIZE(%rdi, %rcx), %rdi
> > > -     lea     VEC_SIZE(%rsi, %rcx), %rsi
> > > -     bsf     %edx, %edx
> > > -     cmp     %r8d, %edx
> > > -     jb      L(CopyVecSizeExit)
> > > -L(StrncpyExit):
> > > -     cmp     $65, %r8d
> > > -     je      L(StrncpyExit65)
> > > -     cmp     $33, %r8d
> > > -     jae     L(StrncpyExit33_64)
> > > -     cmp     $17, %r8d
> > > -     jae     L(StrncpyExit17_32)
> > > -     cmp     $9, %r8d
> > > -     jae     L(StrncpyExit9_16)
> > > -     cmp     $5, %r8d
> > > -     jae     L(StrncpyExit5_8)
> > > -     cmp     $3, %r8d
> > > -     jae     L(StrncpyExit3_4)
> > > -     cmp     $1, %r8d
> > > -     ja      L(StrncpyExit2)
> > > -     je      L(StrncpyExit1)
> > > -#  ifdef USE_AS_STPCPY
> > > -     mov     %rdi, %rax
> > > -#  endif
> > > -#  ifdef USE_AS_STRCAT
> > > -     movb    $0, (%rdi)
> > > -#  endif
> > > +     .p2align 4,, 4
> > > +L(page_cross_copy_16_31):
> > > +     vmovdqu (%rsi), %xmm0
> > > +     vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > > +     vmovdqu %xmm0, (%rdi)
> > > +     vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> > >       ret
> > >
> > > -     .p2align 4
> > > -L(ExitZero):
> > > -#  ifndef USE_AS_STRCAT
> > > -     mov     %rdi, %rax
> > > -#  endif
> > > +     .p2align 4,, 4
> > > +L(page_cross_copy_8_15):
> > > +     movq    (%rsi), %rcx
> > > +     movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> > > +     movq    %rcx, (%rdi)
> > > +     movq    %rsi, -(8 - CHAR_SIZE)(%END_REG)
> > >       ret
> > > -
> > > -# endif
> > > -
> > > -# ifndef USE_AS_STRCAT
> > > -END (STRCPY)
> > > -# else
> > > -END (STRCAT)
> > >  # endif
> > > +END(STRCPY)
> > >  #endif
> > > diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
> > > index 203a19bf21..d648ba5cfe 100644
> > > --- a/sysdeps/x86_64/multiarch/strncat-evex.S
> > > +++ b/sysdeps/x86_64/multiarch/strncat-evex.S
> > > @@ -1,7 +1,520 @@
> > > -#ifndef STRNCAT
> > > -# define STRNCAT     __strncat_evex
> > > -#endif
> > > +/* {wcs|str}ncat  with 256/512-bit EVEX.
> > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#include <isa-level.h>
> > > +
> > > +#if ISA_SHOULD_BUILD (4)
> > > +
> > > +     /* Use evex-masked stores for small sizes. Turned off at the
> > > +        moment.  */
> > > +# define USE_EVEX_MASKED_STORE       0
> > > +
> > > +# include <sysdep.h>
> > > +
> > > +# ifndef VEC_SIZE
> > > +#  include "x86-evex256-vecs.h"
> > > +# endif
> > > +
> > > +# ifndef STRNCAT
> > > +#  define STRNCAT    __strncat_evex
> > > +# endif
> > > +
> > > +
> > > +# ifdef USE_AS_WCSCPY
> > > +#  define movNULL    movl
> > > +#  define VMOVU_MASK vmovdqu32
> > > +#  define VPMIN      vpminud
> > > +#  define VPTESTN    vptestnmd
> > > +#  define VPTEST     vptestmd
> > > +#  define VPCMPEQ    vpcmpeqd
> > > +#  define CHAR_SIZE  4
> > > +
> > > +#  define REP_MOVS   rep movsd
> > > +
> > > +#  define VMASK_REG  VR10
> > > +#  define FIND_FIRST_ONE(src, dst)   movl $CHAR_PER_VEC, %dst; bsf %src, %dst
> > > +
> > > +#  define USE_WIDE_CHAR
> > > +# else
> > > +#  define movNULL    movb
> > > +#  define VMOVU_MASK vmovdqu8
> > > +#  define VPMIN      vpminub
> > > +#  define VPTESTN    vptestnmb
> > > +#  define VPTEST     vptestmb
> > > +#  define VPCMPEQ    vpcmpeqb
> > > +#  define CHAR_SIZE  1
> > > +
> > > +#  define REP_MOVS   rep movsb
> > > +
> > > +#  define VMASK_REG  VRCX
> > > +#  define FIND_FIRST_ONE(src, dst)   tzcnt %src, %dst
> > > +
> > > +# endif
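
Both FIND_FIRST_ONE expansions agree on the empty-mask case, which the
later `cmpl $CHAR_PER_VEC, %ecx` checks rely on; in C terms this is
roughly (my reading):

/* Index of the first zero-CHAR, or CHAR_PER_VEC when the mask is
   empty.  tzcnt gives that directly in the byte build (operand width
   == CHAR_PER_VEC); the wide-char build preloads CHAR_PER_VEC and
   relies on bsf not writing the destination for a zero source.  */
static unsigned int
find_first_one (unsigned long mask, unsigned int char_per_vec)
{
  return mask != 0 ? (unsigned int) __builtin_ctzl (mask) : char_per_vec;
}
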
> > > +
> > > +# include "strncpy-or-cat-overflow-def.h"
> > > +
> > > +# include "reg-macros.h"
> > > +
> > > +
> > > +# define VZERO       VMM(7)
> > > +# define VZERO_128   VMM_128(7)
> > > +
> > > +# define PAGE_SIZE   4096
> > > +# define CHAR_PER_VEC        (VEC_SIZE / CHAR_SIZE)
> > > +
> > > +     .section SECTION(.text), "ax", @progbits
> > > +ENTRY(STRNCAT)
> > > +     movq    %rdi, %rax
> > > +
> > > +     /* NB: It's safe to filter out zero-length strings WITHOUT
> > > +        setting null-term. Destination MUST be a null-terminated
> > > +        string so essentially the work is already done.  */
> > > +# ifdef USE_AS_WCSCPY
> > > +     leaq    -1(%rdx), %rcx
> > > +     shrq    $56, %rcx
> > > +     jnz     L(zero_len)
> > > +# else
> > > +     test    %rdx, %rdx
> > > +     jle     L(zero_len)
> > > +# endif
> > > +
> > > +# include "strcat-strlen-evex.S"
> > > +
> > > +     movl    %esi, %ecx
> > > +     andl    $(PAGE_SIZE - 1), %ecx
> > > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > > +     ja      L(page_cross)
> > > +L(page_cross_continue):
> > > +     VMOVU   (%rsi), %VMM(0)
> > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > +
> > > +     /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> > > +        <= CHAR_PER_VEC with masked instructions (which have
> > > +        potential for dramatically bad perf if dst splits a page and
> > > +        is not in the TLB).  */
> > > +# if USE_EVEX_MASKED_STORE
> > > +     KMOV    %k0, %VRCX
> > > +     FIND_FIRST_ONE (VRCX, VR8)
> > > +     cmpq    %r8, %rdx
> > > +     jbe     L(less_1x_vec)
> > > +
> > > +     test    %VRCX, %VRCX
> > > +     jz      L(more_1x_vec)
> > > +
> > > +     blsmsk  %VRCX, %VRCX
> > > +     KMOV    %VRCX, %k1
> > > +     VMOVU_MASK %VMM(0), (%rdi){%k1}
> > > +     ret
> > > +
> > > +L(less_1x_vec):
> > > +     mov     $-1, %VRCX
> > > +     bzhi    %VRDX, %VRCX, %VRCX
> > > +     KMOV    %VRCX, %k1
> > > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > > +     VMOVU_MASK %VMM(0), (%rdi){%k1}
> > > +
> > > +     ret
> > > +# else
> > > +     KMOV    %k0, %VMASK_REG
> > > +     /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
> > > +        %VMASK_REG, %VRCX` for wcsncat.  */
> > > +     FIND_FIRST_ONE (VMASK_REG, VRCX)
> > > +     cmpq    %rcx, %rdx
> > > +     jbe     L(less_1x_vec)
> > > +
> > > +     /* If there were no zero-CHARs (rcx was zero before
> > > +        FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> > > +     cmpl    $CHAR_PER_VEC, %ecx
> > > +     je      L(more_1x_vec)
> > > +
> > > +     movl    %ecx, %edx
> > > +
> > > +L(less_1x_vec):
> > > +#  if VEC_SIZE == 64
> > > +     cmpl    $(32 / CHAR_SIZE), %edx
> > > +     jae     L(copy_32_63)
> > > +#  endif
> > > +
> > > +     cmpl    $(16 / CHAR_SIZE), %edx
> > > +     jae     L(copy_16_31)
> > > +
> > > +
> > > +     cmpl    $(8 / CHAR_SIZE), %edx
> > > +     jae     L(copy_8_15)
> > > +
> > > +#  ifdef USE_AS_WCSCPY
> > > +     vmovd   %VMM_128(0), (%rdi)
> > > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > > +     ret
> > > +#  else
> > > +
> > > +     cmpl    $4, %edx
> > > +     jae     L(copy_4_7)
> > > +
> > > +     movzbl  (%rsi), %ecx
> > > +     cmpl    $1, %edx
> > > +     jbe     L(set_null_term)
> > > +
> > > +     movzwl  1(%rsi), %esi
> > > +     movw    %si, 1(%rdi)
> > > +
> > > +     .p2align 4,, 1
> > > +L(set_null_term):
> > > +     movb    %cl, (%rdi)
> > > +     movNULL $0, (%rdi, %rdx)
> > > +     ret
> > > +#  endif
> > > +
> > > +#  if VEC_SIZE == 64
> > > +     .p2align 4,, 6
> > > +L(copy_32_63):
> > > +     VMOVU   -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > > +     VMOVU   %VMM_256(0), (%rdi)
> > > +     VMOVU   %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
> > > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > > +     ret
> > > +#  endif
> > > +     .p2align 4,, 6
> > > +L(copy_16_31):
> > > +     /* Use xmm1 explicitly here as it won't require a `vzeroupper`
> > > +        and will save code size.  */
> > > +     vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > > +     VMOVU   %VMM_128(0), (%rdi)
> > > +     vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
> > > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > > +     ret
> > > +
> > > +     .p2align 4,, 2
> > > +L(copy_8_15):
> > > +     movq    -(8)(%rsi, %rdx, CHAR_SIZE), %rcx
> > > +     vmovq   %VMM_128(0), (%rdi)
> > > +     movq    %rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
> > > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > > +     ret
> > > +
> > > +#  ifndef USE_AS_WCSCPY
> > > +     .p2align 4,, 12
> > > +L(copy_4_7):
> > > +     movl    -(4)(%rsi, %rdx, CHAR_SIZE), %ecx
> > > +     vmovd   %VMM_128(0), (%rdi)
> > > +     movl    %ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
> > > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > > +     ret
> > > +#  endif
> > > +
> > > +# endif
> > > +     .p2align 4,, 4
> > > +L(zero_len):
> > > +# ifdef USE_AS_WCSCPY
> > > +     test    %rdx, %rdx
> > > +# endif
> > > +     jne     OVERFLOW_STRCAT
> > > +     ret
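
For what it's worth, this is how I read the zero-length / huge-length
filtering at entry together with L(zero_len), with overflow_strcat as a
stand-in for the OVERFLOW_STRCAT target pulled in from
strncpy-or-cat-overflow-def.h:

#include <stddef.h>

char *overflow_strcat (char *, const char *);   /* OVERFLOW_STRCAT stand-in */

static char *
strncat_entry (char *dst, const char *src, size_t n)
{
  if (n == 0)
    return dst;                 /* dst is already null-terminated */
  if ((n - 1) >> 56 != 0)       /* wcsncat form; the byte build instead
                                   branches on n <= 0 treated as signed */
    return overflow_strcat (dst, src);  /* bound can never be reached */
  /* ... bounded append path elided ... */
  return dst;
}
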
> > >
> > > -#define USE_AS_STRNCAT
> > > -#define STRCAT       STRNCAT
> > > -#include "strcat-evex.S"
> > > +     .p2align 4,, 8
> > > +L(more_1x_vec):
> > > +     VMOVU   %VMM(0), (%rdi)
> > > +
> > > +     /* We are going to align rsi here, so we will need to be able to
> > > +        re-adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> > > +        so rsi + rdx * CHAR_SIZE cannot overflow.  */
> > > +
> > > +     leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> > > +     subq    %rsi, %rdi
> > > +     andq    $-(VEC_SIZE), %rsi
> > > +L(loop_last_4x_vec):
> > > +     addq    %rsi, %rdi
> > > +     subq    %rsi, %rdx
> > > +# ifdef USE_AS_WCSCPY
> > > +     shrq    $2, %rdx
> > > +# endif
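
Sketch of the realignment bookkeeping above (my names; CHAR_SIZE and
VEC_SIZE folded in as parameters):

#include <stddef.h>
#include <stdint.h>

/* Keep dst as "src plus a fixed delta" so that after src is rounded
   down to a VEC_SIZE boundary, src + i and dst + i still address
   corresponding elements, and the saved end pointer turns back into a
   remaining length with one subtract (plus a shift for wide chars).  */
static void
realign_src (const char **srcp, char **dstp, size_t *np,
             size_t vec_size, size_t char_size)
{
  const char *src = *srcp;
  uintptr_t end = (uintptr_t) src + *np * char_size - vec_size; /* leaq */
  uintptr_t delta = (uintptr_t) *dstp - (uintptr_t) src;        /* subq */
  src = (const char *) ((uintptr_t) src & -(uintptr_t) vec_size); /* andq */
  *dstp = (char *) ((uintptr_t) src + delta);   /* addq at L(loop_last_4x_vec) */
  *np = (end - (uintptr_t) src) / char_size;    /* subq (+ shrq for wchars) */
  *srcp = src;
}
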
> > > +
> > > +     /* Will need this regardless.  */
> > > +     VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> > > +     VPTESTN %VMM(1), %VMM(1), %k0
> > > +     KMOV    %k0, %VMASK_REG
> > > +
> > > +     cmpq    $(CHAR_PER_VEC * 2), %rdx
> > > +     ja      L(more_2x_vec)
> > > +
> > > +L(last_2x_vec):
> > > +     FIND_FIRST_ONE (VMASK_REG, VRCX)
> > > +     cmpl    %ecx, %edx
> > > +     jbe     L(ret_vec_x1_len)
> > > +
> > > +     /* If there were no zero-CHARs (rcx was zero before
> > > +        FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> > > +     cmpl    $CHAR_PER_VEC, %ecx
> > > +     jne     L(ret_vec_x1)
> > > +
> > > +     VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     addl    $-CHAR_PER_VEC, %edx
> > > +     bzhi    %VRDX, %VRCX, %VR8
> > > +     jz      L(ret_vec_x2_len)
> > > +L(ret_vec_x2):
> > > +     bsf     %VRCX, %VRDX
> > > +L(ret_vec_x2_len):
> > > +     VMOVU   (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > > +     movNULL $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> > > +     VMOVU   %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > +     ret
> > > +
> > > +     .p2align 4,, 4
> > > +L(ret_vec_x1_len):
> > > +     movl    %edx, %ecx
> > > +L(ret_vec_x1):
> > > +     VMOVU   (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > +     movNULL $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
> > > +     VMOVU   %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > > +     VZEROUPPER_RETURN
> > > +
> > > +
> > > +     .p2align 4,, 8
> > > +L(last_4x_vec):
> > > +     addl    $-(CHAR_PER_VEC * 4), %edx
> > > +     VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
> > > +     VPTESTN %VMM(1), %VMM(1), %k0
> > > +     KMOV    %k0, %VMASK_REG
> > > +     subq    $-(VEC_SIZE * 4), %rsi
> > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > +     cmpl    $(CHAR_PER_VEC * 2), %edx
> > > +     jbe     L(last_2x_vec)
> > > +     .p2align 4,, 8
> > > +L(more_2x_vec):
> > > +# ifdef USE_AS_WCSCPY
> > > +     xorl    %ecx, %ecx
> > > +# endif
> > > +     bsf     %VMASK_REG, %VRCX
> > > +     jnz     L(ret_vec_x1)
> > > +
> > > +     VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(ret_vec_x2)
> > > +
> > > +     VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> > > +     VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > > +     VPTESTN %VMM(3), %VMM(3), %k0
> > > +     KMOV    %k0, %VMASK_REG
> > > +
> > > +     cmpq    $(CHAR_PER_VEC * 4), %rdx
> > > +     ja      L(more_4x_vec)
> > > +
> > > +     /* Adjust length before going to L(ret_vec_x3_len) or
> > > +        L(ret_vec_x3).  */
> > > +     addl    $(CHAR_PER_VEC * -2), %edx
> > > +
> > > +     FIND_FIRST_ONE (VMASK_REG, VRCX)
> > > +     cmpl    %ecx, %edx
> > > +     jbe     L(ret_vec_x3_len)
> > > +
> > > +     /* If there were no zero-CHARs (rcx was zero before
> > > +        FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> > > +     cmpl    $CHAR_PER_VEC, %ecx
> > > +     jne     L(ret_vec_x3)
> > > +
> > > +     VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > > +     VPTESTN %VMM(4), %VMM(4), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     addl    $-CHAR_PER_VEC, %edx
> > > +     bzhi    %VRDX, %VRCX, %VR8
> > > +     jz      L(ret_vec_x4_len)
> > > +L(ret_vec_x4):
> > > +     bsf     %VRCX, %VRDX
> > > +L(ret_vec_x4_len):
> > > +     VMOVU   (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > > +     movNULL $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
> > > +     VMOVU   %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > +     ret
> > > +
> > > +     .p2align 4,, 4
> > > +L(ret_vec_x3_len):
> > > +     movl    %edx, %ecx
> > > +L(ret_vec_x3):
> > > +     VMOVU   (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > +     movNULL $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> > > +     VMOVU   %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > > +     ret
> > > +
> > > +     .p2align 4,, 8
> > > +L(more_4x_vec):
> > > +# ifdef USE_AS_WCSCPY
> > > +     xorl    %ecx, %ecx
> > > +# endif
> > > +     bsf     %VMASK_REG, %VRCX
> > > +     jnz     L(ret_vec_x3)
> > > +
> > > +     VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > > +     VPTESTN %VMM(4), %VMM(4), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(ret_vec_x4)
> > > +
> > > +     VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> > > +
> > > +     /* Check if we are near the end before aligning.  */
> > > +     cmpq    $(CHAR_PER_VEC * 8), %rdx
> > > +     jbe     L(last_4x_vec)
> > > +
> > > +
> > > +     /* Add rsi to rdx (length) before aligning rsi. NB: Since we
> > > +        filtered out huge lengths this cannot overflow.  */
> > > +# ifdef USE_AS_WCSCPY
> > > +     leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
> > > +# else
> > > +     addq    %rsi, %rdx
> > > +# endif
> > > +
> > > +     /* Subtract rsi from rdi before aligning (add back will have
> > > +        correct rdi for aligned rsi).  */
> > > +     subq    %rsi, %rdi
> > > +     subq    $-(VEC_SIZE * 5), %rsi
> > > +     andq    $(VEC_SIZE * -4), %rsi
> > > +
> > > +     /* Load first half of the loop before entry.  */
> > > +     VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > > +
> > > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > > +     VPTESTN %VMM(4), %VMM(4), %k2
> > > +     VPTESTN %VMM(6), %VMM(6), %k4
> > > +
> > > +     /* Offset rsi by VEC_SIZE so that we can jump to
> > > +        L(loop_last_4x_vec).  */
> > > +     addq    $-(VEC_SIZE), %rsi
> > > +     KORTEST %k2, %k4
> > > +     jnz     L(loop_4x_done)
> > > +
> > > +     /* Store loop end in r9.  */
> > > +     leaq    -(VEC_SIZE * 5)(%rdx), %r9
> > > +
> > > +     .p2align 4,, 11
> > > +L(loop_4x_vec):
> > > +     VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > > +     VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> > > +     VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> > > +     VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> > > +
> > > +     subq    $(VEC_SIZE * -4), %rsi
> > > +     cmpq    %rsi, %r9
> > > +     jbe     L(loop_last_4x_vec)
> > > +
> > > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> > > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> > > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> > > +     VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> > > +
> > > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > > +     VPTESTN %VMM(4), %VMM(4), %k2
> > > +     VPTESTN %VMM(6), %VMM(6), %k4
> > > +     KORTEST %k2, %k4
> > > +     jz      L(loop_4x_vec)
> > > +
> > > +L(loop_4x_done):
> > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     /* Restore rdi (dst).  */
> > > +     addq    %rsi, %rdi
> > > +
> > > +     /* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
> > > +        test with bsf.  */
> > > +     bsf     %VRCX, %VRCX
> > > +     jnz     L(ret_vec_x1)
> > > +     VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
> > > +
> > > +     KMOV    %k2, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(ret_vec_x2)
> > > +     VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> > > +
> > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     bsf     %VRCX, %VRCX
> > > +     jnz     L(ret_vec_x3)
> > > +     VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> > > +
> > > +     KMOV    %k4, %VRCX
> > > +     bsf     %VRCX, %VRCX
> > > +     VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > +     VMOVU   %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > > +     ret
> > > +
> > > +
> > > +     .p2align 4,, 4
> > > +L(page_cross):
> > > +     movq    %rsi, %r8
> > > +     andq    $(VEC_SIZE * -1), %r8
> > > +     VPCMPEQ (%r8), %VZERO, %k0
> > > +
> > > +# ifdef USE_AS_WCSCPY
> > > +     KMOV    %k0, %VR9
> > > +     shrl    $2, %ecx
> > > +     andl    $(CHAR_PER_VEC - 1), %ecx
> > > +     shrx    %VRCX, %VR9, %VRCX
> > > +# else
> > > +     KMOV    %k0, %VRCX
> > > +     shrx    %VRSI, %VRCX, %VRCX
> > > +# endif
> > > +
> > > +     subl    %esi, %r8d
> > > +     andl    $(VEC_SIZE - 1), %r8d
> > > +# ifdef USE_AS_WCSCPY
> > > +     shrl    $2, %r8d
> > > +# endif
> > > +     cmpq    %r8, %rdx
> > > +     jbe     L(page_cross_small)
> > > +     /* Optimizing more for space as this is very cold code. This
> > > +        saves 2x cache lines.  */
> > > +
> > > +     /* This adds one to the later result, which gives the correct
> > > +        copy bounds. NB: this can never zero out a non-zero RCX, as
> > > +        to be in the page-cross case rsi cannot be aligned and we
> > > +        already right-shift rcx by the misalignment.  */
> > > +     shl     %VRCX
> > > +     jz      L(page_cross_continue)
> > > +     bsf     %VRCX, %VRCX
> > > +     REP_MOVS
> > > +     ret
> > > +
> > > +L(page_cross_small):
> > > +     tzcnt   %VRCX, %VRCX
> > > +     jz      L(page_cross_setz)
> > > +     cmpl    %edx, %ecx
> > > +     cmova   %edx, %ecx
> > > +
> > > +# ifdef USE_AS_WCSCPY
> > > +     rep     movsd
> > > +# else
> > > +     rep     movsb
> > > +# endif
> > > +L(page_cross_setz):
> > > +     movNULL $0, (%rdi)
> > > +     ret
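
Rough byte-variant model of this small page-cross append (names mine):

#include <string.h>
#include <stddef.h>

/* The whole bounded append fits before the page end: copy
   min (chars-before-null, n) with rep movsb, then write the terminator
   at the post-copy rdi.  */
static void
page_cross_small_append (char *dst_end, const char *src, size_t n,
                         size_t first_zero /* tzcnt of the zero mask */)
{
  size_t len = first_zero < n ? first_zero : n;  /* cmp + cmova */
  memcpy (dst_end, src, len);                    /* rep movsb */
  dst_end[len] = '\0';                           /* movNULL $0, (%rdi) */
}
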
> > > +END(STRNCAT)
> > > +#endif
> > > diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
> > > index 1b3426d511..49eaf4cbd9 100644
> > > --- a/sysdeps/x86_64/multiarch/strncpy-evex.S
> > > +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
> > > @@ -1,7 +1,990 @@
> > > -#ifndef STRNCPY
> > > -# define STRNCPY     __strncpy_evex
> > > -#endif
> > > +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
> > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#include <isa-level.h>
> > > +
> > > +#if ISA_SHOULD_BUILD (4)
> > > +
> > > +     /* Use evex-masked stores for small sizes. Turned off at the
> > > +        moment.  */
> > > +# define USE_EVEX_MASKED_STORE       0
> > > +
> > > +
> > > +# include <sysdep.h>
> > > +# ifndef VEC_SIZE
> > > +#  include "x86-evex256-vecs.h"
> > > +# endif
> > > +
> > > +
> > > +# ifndef STRNCPY
> > > +#  define STRNCPY    __strncpy_evex
> > > +# endif
> > > +
> > > +# ifdef USE_AS_WCSCPY
> > > +#  define VMOVU_MASK vmovdqu32
> > > +#  define VPCMPEQ    vpcmpeqd
> > > +#  define VPMIN      vpminud
> > > +#  define VPTESTN    vptestnmd
> > > +#  define VPTEST     vptestmd
> > > +#  define CHAR_SIZE  4
> > > +
> > > +#  define REP_MOVS   rep movsd
> > > +#  define REP_STOS   rep stosl
> > > +
> > > +#  define USE_WIDE_CHAR
> > > +
> > > +# else
> > > +#  define VMOVU_MASK vmovdqu8
> > > +#  define VPCMPEQ    vpcmpeqb
> > > +#  define VPMIN      vpminub
> > > +#  define VPTESTN    vptestnmb
> > > +#  define VPTEST     vptestmb
> > > +#  define CHAR_SIZE  1
> > > +
> > > +#  define REP_MOVS   rep movsb
> > > +#  define REP_STOS   rep stosb
> > > +# endif
> > > +
> > > +# include "strncpy-or-cat-overflow-def.h"
> > > +
> > > +# define PAGE_SIZE   4096
> > > +# define CHAR_PER_VEC        (VEC_SIZE / CHAR_SIZE)
> > > +
> > > +# include "reg-macros.h"
> > > +
> > > +
> > > +# define VZERO       VMM(7)
> > > +# define VZERO_256   VMM_256(7)
> > > +# define VZERO_128   VMM_128(7)
> > > +
> > > +# if VEC_SIZE == 64
> > > +#  define VZERO_HALF VZERO_256
> > > +# else
> > > +#  define VZERO_HALF VZERO_128
> > > +# endif
> > > +
> > > +     .section SECTION(.text), "ax", @progbits
> > > +ENTRY(STRNCPY)
> > > +     /* Filter zero-length strings and very long strings.  Zero-length
> > > +        strings just return; very long strings are handled by running
> > > +        rep stos{b|l} to zero out the destination (which will almost
> > > +        certainly segfault) and, if that succeeds, then calling
> > > +        OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
> > > +# ifdef USE_AS_WCSCPY
> > > +     decq    %rdx
> > > +     movq    %rdx, %rax
> > > +     /* 56 is end of max supported address space.  */
> > > +     shr     $56, %rax
> > > +     jnz     L(zero_len)
> > > +# else
> > > +     decq    %rdx
> > > +     /* If the branch needs to become `jb`, replace the `dec` with
> > > +        `sub $1` (dec does not set CF).  */
> > > +     jl      L(zero_len)
> > > +# endif
> > > +
> > > +     vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> > > +     movl    %esi, %eax
> > > +     andl    $(PAGE_SIZE - 1), %eax
> > > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > +     ja      L(page_cross)
> > > +
> > > +L(page_cross_continue):
> > > +     VMOVU   (%rsi), %VMM(0)
> > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > +     KMOV    %k0, %VRCX
> > > +
> > > +     /* If not STPCPY, the return value is just dst; save it now.  */
> > > +# ifndef USE_AS_STPCPY
> > > +     movq    %rdi, %rax
> > > +# endif
> > > +
> > > +
> > > +     cmpq    $(CHAR_PER_VEC), %rdx
> > > +
> > > +     /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> > > +        <= CHAR_PER_VEC with masked instructions (which have
> > > +        potential for dramatically bad perf if dst splits a page and
> > > +        is not in the TLB).  */
> > > +# if USE_EVEX_MASKED_STORE
> > > +     /* `jae` because length rdx is now length - 1.  */
> > > +     jae     L(more_1x_vec)
> > > +
> > > +     /* If there were multiple zero-CHAR matches in the first VEC,
> > > +        VRCX will be overset, but that's fine since any overset bits
> > > +        were at zero positions anyway.  */
> > > +
> > > +#  ifdef USE_AS_STPCPY
> > > +     tzcnt   %VRCX, %VRAX
> > > +     cmpl    %eax, %edx
> > > +     cmovb   %edx, %eax
> > > +#   ifdef USE_AS_WCSCPY
> > > +     adcl    $0, %eax
> > > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > +#   else
> > > +     adcq    %rdi, %rax
> > > +#   endif
> > > +#  endif
> > > +     dec     %VRCX
> > > +
> > > +     /* Zero out all non-zero CHAR's after the first zero match.  */
> > > +     KMOV    %VRCX, %k1
> > > +
> > > +     /* Use VZERO as the destination so this can be reused for
> > > +        L(zfill_less_vec) (which, if jumped to by subsequent logic,
> > > +        will have zeroed out VZERO).  */
> > > +     VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
> > > +L(zfill_less_vec):
> > > +     /* Get mask for what we need to set.  */
> > > +     incl    %edx
> > > +     mov     $-1, %VRCX
> > > +     bzhi    %VRDX, %VRCX, %VRCX
> > > +     KMOV    %VRCX, %k1
> > > +     VMOVU_MASK %VZERO, (%rdi){%k1}
> > > +     ret
> > > +
> > > +     .p2align 4,, 4
> > > +L(zero_len):
> > > +     cmpq    $-1, %rdx
> > > +     jne     L(best_effort_strncpy)
> > > +     movq    %rdi, %rax
> > > +     ret
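
As I read L(zero_len) here: rdx was pre-decremented, so -1 means the
caller passed n == 0; anything else that gets here is a huge bound and
takes the best-effort path described in the entry comment.  In C terms
(overflow_strcpy is my stand-in for OVERFLOW_STRCPY; memset models the
rep stos zero fill):

#include <string.h>
#include <stddef.h>

char *overflow_strcpy (char *, const char *);   /* OVERFLOW_STRCPY stand-in */

static char *
strncpy_huge_or_zero (char *dst, const char *src, size_t n)
{
  if (n == 0)                   /* rdx == -1 after the entry decq */
    return dst;
  /* Best effort: zero the destination (will almost certainly fault
     for such an n) and, if that somehow succeeds, fall back to the
     unbounded copy.  */
  memset (dst, 0, n);
  return overflow_strcpy (dst, src);
}
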
> > > +
> > > +     .p2align 4,, 8
> > > +L(more_1x_vec):
> > > +# else
> > > +     /* `jb` because length rdx is now length - 1.  */
> > > +     jb      L(less_1x_vec)
> > > +# endif
> > > +
> > > +
> > > +     /* This may overset, but that's fine because we still need to
> > > +        zero-fill.  */
> > > +     VMOVU   %VMM(0), (%rdi)
> > > +
> > > +
> > > +     /* Length must be >= CHAR_PER_VEC so match here means we must
> > > +        zero-fill.  */
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(zfill)
> > > +
> > > +
> > > +     /* We are going to align rsi here, so we will need to be able to
> > > +        re-adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> > > +        so rsi + rdx * CHAR_SIZE cannot overflow.  */
> > > +     leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> > > +     subq    %rsi, %rdi
> > > +     andq    $-(VEC_SIZE), %rsi
> > > +
> > > +L(loop_last_4x_vec):
> > > +     addq    %rsi, %rdi
> > > +     subq    %rsi, %rdx
> > > +# ifdef USE_AS_WCSCPY
> > > +     shrq    $2, %rdx
> > > +# endif
> > > +
> > > +     VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> > > +     VPTESTN %VMM(1), %VMM(1), %k0
> > > +     KMOV    %k0, %VRCX
> > > +
> > > +     /* -1 because of the `dec %rdx` earlier.  */
> > > +     cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
> > > +     ja      L(more_2x_vec)
> > > +
> > > +L(last_2x_vec):
> > > +     /* This will need to be computed no matter what. We do it ahead
> > > +        of time for CHAR_PER_VEC == 64 because we can't adjust the
> > > +        value of `tzcnt` with a shift.  */
> > > +# if CHAR_PER_VEC == 64
> > > +     tzcntq  %rcx, %rcx
> > > +# endif
> > > +
> > > +     cmpl    $(CHAR_PER_VEC), %edx
> > > +     jb      L(ret_vec_x1_len)
> > > +
> > > +     /* Separate logic for CHAR_PER_VEC == 64 because we already did
> > > +        `tzcnt` on VRCX.  */
> > > +# if CHAR_PER_VEC == 64
> > > +     /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
> > > +     cmpb    $CHAR_PER_VEC, %cl
> > > +     jnz     L(ret_vec_x1_no_bsf)
> > > +# else
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(ret_vec_x1)
> > > +# endif
> > > +
> > > +
> > > +
> > > +     VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
> > > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > > +     KMOV    %k0, %VRCX
> > > +
> > > +# if CHAR_PER_VEC < 64
> > > +     /* This essentially adds CHAR_PER_VEC to the computed result.  */
> > > +     shlq    $CHAR_PER_VEC, %rcx
> > > +# else
> > > +     tzcntq  %rcx, %rcx
> > > +     addl    $CHAR_PER_VEC, %ecx
> > > +# endif
> > > +
> > > +     .p2align 4,, 4
> > > +L(ret_vec_x1_len):
> > > +     /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
> > > +        already been done.  */
> > > +# if CHAR_PER_VEC < 64
> > > +     tzcntq  %rcx, %rcx
> > > +# endif
> > > +     cmpl    %ecx, %edx
> > > +     jbe     L(ret_vec_x1_len_no_zfill)
> > > +     /* Fall through (expectation) is copy len < buffer len.  */
> > > +     VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > +L(ret_vec_x1_len_no_zfill_mov):
> > > +     movl    %ecx, %edx
> > > +# ifdef USE_AS_STPCPY
> > > +     /* clear flags.  */
> > > +     xorl    %ecx, %ecx
> > > +# endif
> > > +L(ret_vec_x1_len_no_zfill):
> > > +     VMOVU   ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > > +     VMOVU   %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > +# ifdef USE_AS_STPCPY
> > > +#  ifdef USE_AS_WCSCPY
> > > +     adcq    $0, %rdx
> > > +     leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
> > > +#  else
> > > +     leal    (VEC_SIZE)(%rdx), %eax
> > > +     adcq    %rdi, %rax
> > > +#  endif
> > > +# endif
> > > +     ret
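
The cmp/cmovb/adc idiom for the STPCPY return value is dense; this is
how I understand it (byte variant, names mine, the VEC_SIZE base offset
left out):

#include <stddef.h>

/* stpncpy returns dst + strlen (src) when the string fits and dst + n
   when it is truncated (no null written).  With the bound minus one in
   one register and the null position in another, the borrow from the
   compare is exactly the extra +1 needed in the truncated case, and
   adc folds it into the final add.  */
static char *
stpncpy_end (char *dst, size_t null_pos, size_t n_minus_1)
{
  int truncated = n_minus_1 < null_pos;           /* cmp sets CF */
  size_t end = truncated ? n_minus_1 : null_pos;  /* cmovb / mov */
  return dst + end + truncated;                   /* adc */
}
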
> > > +
> > > +
> > > +     .p2align 4,, 10
> > > +L(ret_vec_x1):
> > > +     bsf     %VRCX, %VRCX
> > > +L(ret_vec_x1_no_bsf):
> > > +     VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > +     subl    %ecx, %edx
> > > +     cmpl    $CHAR_PER_VEC, %edx
> > > +     jb      L(ret_vec_x1_len_no_zfill_mov)
> > > +     /* Fall through (expectation) is copy len < buffer len.  */
> > > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > > +     VMOVU   %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
> > > +# ifdef USE_AS_STPCPY
> > > +     leaq    (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
> > > +# endif
> > > +     ret
> > > +
> > > +     .p2align 4,, 8
> > > +L(last_4x_vec):
> > > +     /* Separate logic for CHAR_PER_VEC == 64 because we can get the
> > > +        effect of `andl $(CHAR_PER_VEC * 4 - 1), %edx` with less code
> > > +        size by just using `movzbl`.  */
> > > +# if CHAR_PER_VEC == 64
> > > +     movzbl  %dl, %edx
> > > +# else
> > > +     andl    $(CHAR_PER_VEC * 4 - 1), %edx
> > > +# endif
> > > +     VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
> > > +     VPTESTN %VMM(1), %VMM(1), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     subq    $-(VEC_SIZE * 4), %rsi
> > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > +     cmpl    $(CHAR_PER_VEC * 2 - 1), %edx
> > > +     jbe     L(last_2x_vec)
> > > +     .p2align 4,, 8
> > > +L(more_2x_vec):
> > > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > > +     test    %VRCX, %VRCX
> > > +     /* Must fill at least 2x VEC.  */
> > > +     jnz     L(zfill_vec1)
> > > +
> > > +     VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > > +     VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     /* Must fill at least 1x VEC.  */
> > > +     jnz     L(zfill_vec2)
> > > +
> > > +     VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> > > +     VPTESTN %VMM(3), %VMM(3), %k0
> > > +     KMOV    %k0, %VRCX
> > > +
> > > +     /* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
> > > +     cmpq    $(CHAR_PER_VEC * 4 - 1), %rdx
> > > +     ja      L(more_4x_vec)
> > > +
> > > +     subl    $(CHAR_PER_VEC * 3), %edx
> > > +     jb      L(ret_vec_x3_len)
> > > +
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(ret_vec_x3)
> > > +
> > > +     VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
> > > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > > +     KMOV    %k0, %VRCX
> > > +     tzcnt   %VRCX, %VRCX
> > > +     cmpl    %ecx, %edx
> > > +     jbe     L(ret_vec_x4_len_no_zfill)
> > > +     /* Fall through (expectation) is copy len < buffer len.  */
> > > +     VMOVU   %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > +     movl    %ecx, %edx
> > > +L(ret_vec_x4_len_no_zfill):
> > > +     VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > > +     VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > +# ifdef USE_AS_STPCPY
> > > +#  ifdef USE_AS_WCSCPY
> > > +     adcq    $0, %rdx
> > > +     leaq    (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
> > > +#  else
> > > +     leal    (VEC_SIZE * 4 + 0)(%rdx), %eax
> > > +     adcq    %rdi, %rax
> > > +#  endif
> > > +# endif
> > > +     ret
> > > +
> > > +
> > > +L(ret_vec_x3_len):
> > > +     addl    $(CHAR_PER_VEC * 1), %edx
> > > +     tzcnt   %VRCX, %VRCX
> > > +     cmpl    %ecx, %edx
> > > +     jbe     L(ret_vec_x3_len_no_zfill)
> > > +     /* Fall through (expectation) is copy len < buffer len.  */
> > > +     VMOVU   %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > +L(ret_vec_x3_len_no_zfill_mov):
> > > +     movl    %ecx, %edx
> > > +# ifdef USE_AS_STPCPY
> > > +     /* clear flags.  */
> > > +     xorl    %ecx, %ecx
> > > +# endif
> > > +     .p2align 4,, 4
> > > +L(ret_vec_x3_len_no_zfill):
> > > +     VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > > +     VMOVU   %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > +# ifdef USE_AS_STPCPY
> > > +#  ifdef USE_AS_WCSCPY
> > > +     adcq    $0, %rdx
> > > +     leaq    (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
> > > +#  else
> > > +     leal    (VEC_SIZE * 3 + 0)(%rdx), %eax
> > > +     adcq    %rdi, %rax
> > > +#  endif
> > > +# endif
> > > +     ret
> > > +
> > > +
> > > +     .p2align 4,, 8
> > > +L(ret_vec_x3):
> > > +     bsf     %VRCX, %VRCX
> > > +     VMOVU   %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
> > > +     subl    %ecx, %edx
> > > +     jl      L(ret_vec_x3_len_no_zfill_mov)
> > > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > > +     VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> > > +# ifdef USE_AS_STPCPY
> > > +     leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> > > +# endif
> > > +     ret
> > > +
> > > +     .p2align 4,, 8
> > > +L(more_4x_vec):
> > > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(zfill_vec3)
> > > +
> > > +     VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > > +     VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> > > +     VPTESTN %VMM(4), %VMM(4), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(zfill_vec4)
> > >
> > > -#define USE_AS_STRNCPY
> > > -#define STRCPY       STRNCPY
> > > -#include "strcpy-evex.S"
> > > +     /* Recheck length before aligning.  */
> > > +     cmpq    $(CHAR_PER_VEC * 8 - 1), %rdx
> > > +     jbe     L(last_4x_vec)
> > > +
> > > +     /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
> > > +# ifdef USE_AS_WCSCPY
> > > +     leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
> > > +# else
> > > +     addq    %rsi, %rdx
> > > +# endif
> > > +     subq    %rsi, %rdi
> > > +     subq    $-(VEC_SIZE * 5), %rsi
> > > +     andq    $(VEC_SIZE * -4), %rsi
> > > +
> > > +
> > > +     /* Load first half of the loop before entry.  */
> > > +     VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > > +
> > > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > > +     VPTESTN %VMM(4), %VMM(4), %k2
> > > +     VPTESTN %VMM(6), %VMM(6), %k4
> > > +
> > > +
> > > +     /* Offset rsi by VEC_SIZE so that we can jump to
> > > +        L(loop_last_4x_vec).  */
> > > +     addq    $-(VEC_SIZE), %rsi
> > > +     KORTEST %k2, %k4
> > > +     jnz     L(loop_4x_done)
> > > +
> > > +     /* Store loop end in r9.  */
> > > +     leaq    -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
> > > +
> > > +     .p2align 4,, 11
> > > +L(loop_4x_vec):
> > > +     VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > > +     VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> > > +     VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> > > +     VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> > > +
> > > +     subq    $(VEC_SIZE * -4), %rsi
> > > +     cmpq    %rsi, %r9
> > > +     jbe     L(loop_last_4x_vec)
> > > +
> > > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> > > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> > > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> > > +     VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> > > +
> > > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > > +     VPTESTN %VMM(4), %VMM(4), %k2
> > > +     VPTESTN %VMM(6), %VMM(6), %k4
> > > +     KORTEST %k2, %k4
> > > +     jz      L(loop_4x_vec)
> > > +
> > > +L(loop_4x_done):
> > > +     /* Restore rdx (length).  */
> > > +     subq    %rsi, %rdx
> > > +# ifdef USE_AS_WCSCPY
> > > +     shrq    $2, %rdx
> > > +# endif
> > > +     VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > > +     /* Restore rdi (dst).  */
> > > +     addq    %rsi, %rdi
> > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(zfill_vec1)
> > > +
> > > +     VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> > > +     KMOV    %k2, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(zfill_vec2)
> > > +
> > > +     VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > +     KMOV    %k0, %VRCX
> > > +     test    %VRCX, %VRCX
> > > +     jnz     L(zfill_vec3)
> > > +
> > > +     VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
> > > +     KMOV    %k4, %VRCX
> > > +     // Zfill more....
> > > +
> > > +     .p2align 4,, 4
> > > +L(zfill_vec4):
> > > +     subq    $(VEC_SIZE * -2), %rdi
> > > +     addq    $(CHAR_PER_VEC * -2), %rdx
> > > +L(zfill_vec2):
> > > +     subq    $(VEC_SIZE * -2), %rdi
> > > +     addq    $(CHAR_PER_VEC * -1), %rdx
> > > +L(zfill):
> > > +     /* VRCX must be non-zero.  */
> > > +     bsf     %VRCX, %VRCX
> > > +
> > > +     /* Adjust length / dst for zfill.  */
> > > +     subq    %rcx, %rdx
> > > +# ifdef USE_AS_WCSCPY
> > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > +# else
> > > +     addq    %rcx, %rdi
> > > +# endif
> > > +# ifdef USE_AS_STPCPY
> > > +     movq    %rdi, %rax
> > > +# endif
> > > +L(zfill_from_page_cross):
> > > +
> > > +     /* From here on out it's just memset(rdi, 0, rdx).  */
> > > +     cmpq    $CHAR_PER_VEC, %rdx
> > > +     jb      L(zfill_less_vec)
> > > +
> > > +L(zfill_more_1x_vec):
> > > +     VMOVU   %VZERO, (%rdi)
> > > +     VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > +     cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
> > > +     ja      L(zfill_more_2x_vec)
> > > +L(zfill_done0):
> > > +     ret
> > > +
> > > +     /* Coming from vec1/vec2 we must be able to zfill at least 2x
> > > +        VEC.  */
> > > +     .p2align 4,, 8
> > > +L(zfill_vec3):
> > > +     subq    $(VEC_SIZE * -2), %rdi
> > > +     addq    $(CHAR_PER_VEC * -2), %rdx
> > > +     .p2align 4,, 2
> > > +L(zfill_vec1):
> > > +     bsfq    %rcx, %rcx
> > > +     /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
> > > +      */
> > > +     leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> > > +     subq    %rcx, %rdx
> > > +# ifdef USE_AS_STPCPY
> > > +     movq    %rdi, %rax
> > > +# endif
> > > +
> > > +
> > > +     VMOVU   %VZERO, (%rdi)
> > > +     VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > +     cmpq    $(CHAR_PER_VEC * 2), %rdx
> > > +     jb      L(zfill_done0)
> > > +L(zfill_more_2x_vec):
> > > +     VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> > > +     VMOVU   %VZERO, (VEC_SIZE)(%rdi)
> > > +     subq    $(CHAR_PER_VEC * 4 - 1), %rdx
> > > +     jbe     L(zfill_done)
> > > +
> > > +# ifdef USE_AS_WCSCPY
> > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rdx
> > > +# else
> > > +     addq    %rdi, %rdx
> > > +# endif
> > > +
> > > +     VMOVU   %VZERO, (VEC_SIZE * 2)(%rdi)
> > > +     VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi)
> > > +
> > > +
> > > +     VMOVU   %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> > > +     VMOVU   %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> > > +
> > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > +     cmpq    %rdi, %rdx
> > > +     jbe     L(zfill_done)
> > > +
> > > +     /* Align rdi and zfill loop.  */
> > > +     andq    $-(VEC_SIZE), %rdi
> > > +     .p2align 4,, 12
> > > +L(zfill_loop_4x_vec):
> > > +     VMOVA   %VZERO, (VEC_SIZE * 0)(%rdi)
> > > +     VMOVA   %VZERO, (VEC_SIZE * 1)(%rdi)
> > > +     VMOVA   %VZERO, (VEC_SIZE * 2)(%rdi)
> > > +     VMOVA   %VZERO, (VEC_SIZE * 3)(%rdi)
> > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > +     cmpq    %rdi, %rdx
> > > +     ja      L(zfill_loop_4x_vec)
> > > +L(zfill_done):
> > > +     ret
> > > +
> > > +
> > > +     /* Less 1x VEC case if we are not using evex masked store.  */
> > > +# if !USE_EVEX_MASKED_STORE
> > > +     .p2align 4,, 8
> > > +L(copy_1x):
> > > +     /* Special case for copy 1x. It can be handled quickly and many
> > > +        buffer sizes have convenient alignment.  */
> > > +     VMOVU   %VMM(0), (%rdi)
> > > +     /* If no zeros then we are done.  */
> > > +     testl   %ecx, %ecx
> > > +     jz      L(ret_1x_1x)
> > > +
> > > +     /* Need to zfill; note we know that length <= CHAR_PER_VEC so we
> > > +        only handle the small case here.  */
> > > +     bsf     %VRCX, %VRCX
> > > +L(zfill_less_vec_no_bsf):
> > > +     /* Adjust length / dst then just zfill less_vec.  */
> > > +     subq    %rcx, %rdx
> > > +#  ifdef USE_AS_WCSCPY
> > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > +#  else
> > > +     addq    %rcx, %rdi
> > > +#  endif
> > > +#  ifdef USE_AS_STPCPY
> > > +     movq    %rdi, %rax
> > > +#  endif
> > > +
> > > +L(zfill_less_vec):
> > > +     cmpl    $((VEC_SIZE / 2) / CHAR_SIZE), %edx
> > > +     jb      L(zfill_less_half)
> > > +
> > > +     VMOVU   %VZERO_HALF, (%rdi)
> > > +     VMOVU   %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > +     ret
> > > +#  ifdef USE_AS_STPCPY
> > > +L(ret_1x_1x):
> > > +     leaq    CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
> > > +     ret
> > > +#  endif
> > > +
> > > +
> > > +#  if VEC_SIZE == 64
> > > +     .p2align 4,, 4
> > > +L(copy_32_63):
> > > +     /* Overfill to avoid branches.  */
> > > +     VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > > +     VMOVU   %VMM_256(0), (%rdi)
> > > +     VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > +
> > > +     /* We are taking advantage of the fact that to be here we must
> > > +        be writing the null-term at (%rdi, %rcx), so we have a byte
> > > +        of leeway for overwriting.  */
> > > +     cmpl    %ecx, %edx
> > > +     ja      L(zfill_less_vec_no_bsf)
> > > +#   ifndef USE_AS_STPCPY
> > > +L(ret_1x_1x):
> > > +#   else
> > > +#    ifdef USE_AS_WCSCPY
> > > +     adcq    $0, %rdx
> > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > +#    else
> > > +     movl    %edx, %eax
> > > +     adcq    %rdi, %rax
> > > +#    endif
> > > +#   endif
> > > +     ret
> > > +#  endif
> > > +
> > > +     .p2align 4,, 4
> > > +L(copy_16_31):
> > > +     /* Overfill to avoid branches.  */
> > > +     vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > > +     VMOVU   %VMM_128(0), (%rdi)
> > > +     vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > +     cmpl    %ecx, %edx
> > > +
> > > +     /* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
> > > +        we have a larger copy block for 32-63 so this just falls
> > > +        through to zfill 16-31. If VEC_SIZE == 32 then we check for
> > > +        full zfill of less than 1x VEC.  */
> > > +#  if VEC_SIZE == 64
> > > +     jbe     L(ret_16_31)
> > > +     subl    %ecx, %edx
> > > +#   ifdef USE_AS_WCSCPY
> > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > +#   else
> > > +     addq    %rcx, %rdi
> > > +#   endif
> > > +#   ifdef USE_AS_STPCPY
> > > +     movq    %rdi, %rax
> > > +#   endif
> > > +L(zfill_less_half):
> > > +L(zfill_less_32):
> > > +     cmpl    $(16 / CHAR_SIZE), %edx
> > > +     jb      L(zfill_less_16)
> > > +     VMOVU   %VZERO_128, (%rdi)
> > > +     VMOVU   %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > +#   ifdef USE_AS_STPCPY
> > > +     ret
> > > +#   endif
> > > +L(ret_16_31):
> > > +#   ifdef USE_AS_STPCPY
> > > +#    ifdef USE_AS_WCSCPY
> > > +     adcq    $0, %rdx
> > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > +#    else
> > > +     movl    %edx, %eax
> > > +     adcq    %rdi, %rax
> > > +#    endif
> > > +#   endif
> > > +     ret
> > > +#  else
> > > +     /* VEC_SIZE == 32 begins.  */
> > > +     ja      L(zfill_less_vec_no_bsf)
> > > +#   ifndef USE_AS_STPCPY
> > > +L(ret_1x_1x):
> > > +#   else
> > > +#    ifdef USE_AS_WCSCPY
> > > +     adcq    $0, %rdx
> > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > +#    else
> > > +     movl    %edx, %eax
> > > +     adcq    %rdi, %rax
> > > +#    endif
> > > +#   endif
> > > +     ret
> > > +#  endif
> > > +
> > > +
> > > +     .p2align 4,, 4
> > > +L(copy_8_15):
> > > +     /* Overfill to avoid branches.  */
> > > +     movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> > > +     vmovq   %VMM_128(0), (%rdi)
> > > +     movq    %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > +     cmpl    %ecx, %edx
> > > +     jbe     L(ret_8_15)
> > > +     subl    %ecx, %edx
> > > +#  ifdef USE_AS_WCSCPY
> > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > +#  else
> > > +     addq    %rcx, %rdi
> > > +#  endif
> > > +#  ifdef USE_AS_STPCPY
> > > +     movq    %rdi, %rax
> > > +#  endif
> > > +     .p2align 4,, 8
> > > +#  if VEC_SIZE == 32
> > > +L(zfill_less_half):
> > > +#  endif
> > > +L(zfill_less_16):
> > > +     xorl    %ecx, %ecx
> > > +     cmpl    $(8 / CHAR_SIZE), %edx
> > > +     jb      L(zfill_less_8)
> > > +     movq    %rcx, (%rdi)
> > > +     movq    %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > +#  ifndef USE_AS_STPCPY
> > > +L(ret_8_15):
> > > +#  endif
> > > +     ret
> > > +
> > > +     .p2align 4,, 8
> > > +L(less_1x_vec):
> > > +     je      L(copy_1x)
> > > +
> > > +     /* We will need `tzcnt` result for all other copy sizes.  */
> > > +     tzcnt   %VRCX, %VRCX
> > > +#  if VEC_SIZE == 64
> > > +     cmpl    $(32 / CHAR_SIZE), %edx
> > > +     jae     L(copy_32_63)
> > > +#  endif
> > > +
> > > +     cmpl    $(16 / CHAR_SIZE), %edx
> > > +     jae     L(copy_16_31)
> > > +
> > > +     cmpl    $(8 / CHAR_SIZE), %edx
> > > +     jae     L(copy_8_15)
> > > +#  ifdef USE_AS_WCSCPY
> > > +     testl   %ecx, %ecx
> > > +     jz      L(zfill_less_8_set_ret)
> > > +
> > > +     movl    (%rsi, %rdx, CHAR_SIZE), %esi
> > > +     vmovd   %VMM_128(0), (%rdi)
> > > +     movl    %esi, (%rdi, %rdx, CHAR_SIZE)
> > > +#   ifdef USE_AS_STPCPY
> > > +     cmpl    %ecx, %edx
> > > +L(ret_8_15):
> > > +     adcq    $0, %rdx
> > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > +#   endif
> > > +     ret
> > > +L(zfill_less_8_set_ret):
> > > +     xorl    %ecx, %ecx
> > > +#   ifdef USE_AS_STPCPY
> > > +     movq    %rdi, %rax
> > > +#   endif
> > > +L(zfill_less_8):
> > > +     movl    %ecx, (%rdi)
> > > +     movl    %ecx, (%rdi, %rdx, CHAR_SIZE)
> > > +     ret
> > > +#  else
> > > +     cmpl    $3, %edx
> > > +     jb      L(copy_0_3)
> > > +     /* Overfill to avoid branches.  */
> > > +     movl    -3(%rsi, %rdx), %esi
> > > +     vmovd   %VMM_128(0), (%rdi)
> > > +     movl    %esi, -3(%rdi, %rdx)
> > > +     cmpl    %ecx, %edx
> > > +     jbe     L(ret_4_7)
> > > +     subq    %rcx, %rdx
> > > +     addq    %rcx, %rdi
> > > +#   ifdef USE_AS_STPCPY
> > > +     movq    %rdi, %rax
> > > +#   endif
> > > +     xorl    %ecx, %ecx
> > > +     .p2align 4,, 8
> > > +L(zfill_less_8):
> > > +     cmpl    $3, %edx
> > > +     jb      L(zfill_less_3)
> > > +     movl    %ecx, (%rdi)
> > > +     movl    %ecx, -3(%rdi, %rdx)
> > > +#   ifdef USE_AS_STPCPY
> > > +     ret
> > > +#   endif
> > > +
> > > +L(ret_4_7):
> > > +#   ifdef USE_AS_STPCPY
> > > +L(ret_8_15):
> > > +     movl    %edx, %eax
> > > +     adcq    %rdi, %rax
> > > +#   endif
> > > +     ret
> > > +
> > > +     .p2align 4,, 4
> > > +L(zfill_less_3):
> > > +     testl   %edx, %edx
> > > +     jz      L(zfill_1)
> > > +     movw    %cx, (%rdi)
> > > +L(zfill_1):
> > > +     movb    %cl, (%rdi, %rdx)
> > > +     ret
> > > +
> > > +     .p2align 4,, 8
> > > +L(copy_0_3):
> > > +     vmovd   %VMM_128(0), %r8d
> > > +     testl   %edx, %edx
> > > +     jz      L(copy_1)
> > > +     movw    %r8w, (%rdi)
> > > +     cmpl    %ecx, %edx
> > > +     ja      L(zfill_from_1)
> > > +     movzbl  (%rsi, %rdx), %r8d
> > > +#   ifdef USE_AS_STPCPY
> > > +     movl    %edx, %eax
> > > +     adcq    %rdi, %rax
> > > +     movb    %r8b, (%rdi, %rdx)
> > > +     ret
> > > +#   endif
> > > +
> > > +L(copy_1):
> > > +#   ifdef USE_AS_STPCPY
> > > +     movl    %edx, %eax
> > > +     cmpl    %ecx, %edx
> > > +     adcq    %rdi, %rax
> > > +#   endif
> > > +#   ifdef USE_AS_WCSCPY
> > > +     vmovd   %VMM_128(0), (%rdi)
> > > +#   else
> > > +     movb    %r8b, (%rdi, %rdx)
> > > +#   endif
> > > +     ret
> > > +#  endif
> > > +
> > > +
> > > +#  ifndef USE_AS_WCSCPY
> > > +     .p2align 4,, 8
> > > +L(zfill_from_1):
> > > +#   ifdef USE_AS_STPCPY
> > > +     leaq    (%rdi, %rcx), %rax
> > > +#   endif
> > > +     movw    $0, -1(%rdi, %rdx)
> > > +     ret
> > > +#  endif
> > > +
> > > +     .p2align 4,, 4
> > > +L(zero_len):
> > > +     incq    %rdx
> > > +     jne     L(best_effort_strncpy)
> > > +     movq    %rdi, %rax
> > > +     ret
> > > +# endif
> > > +
> > > +
> > > +     .p2align 4,, 4
> > > +     .p2align 6,, 8
> > > +L(page_cross):
> > > +     movq    %rsi, %rax
> > > +     andq    $(VEC_SIZE * -1), %rax
> > > +     VPCMPEQ (%rax), %VZERO, %k0
> > > +     KMOV    %k0, %VRCX
> > > +# ifdef USE_AS_WCSCPY
> > > +     movl    %esi, %r8d
> > > +     shrl    $2, %r8d
> > > +     andl    $(CHAR_PER_VEC - 1), %r8d
> > > +     shrx    %VR8, %VRCX, %VRCX
> > > +# else
> > > +     shrx    %VRSI, %VRCX, %VRCX
> > > +# endif
> > > +
> > > +     /* Compute amount of bytes we checked.  */
> > > +     subl    %esi, %eax
> > > +     andl    $(VEC_SIZE - 1), %eax
> > > +# ifdef USE_AS_WCSCPY
> > > +     shrl    $2, %eax
> > > +# endif
> > > +
> > > +     /* If rax > rdx then we are finishing the copy at the end of the
> > > +        page.  */
> > > +     cmpq    %rax, %rdx
> > > +     jb      L(page_cross_small)
> > > +
> > > +
> > > +     /* If rcx is non-zero then continue.  */
> > > +     test    %VRCX, %VRCX
> > > +     jz      L(page_cross_continue)
> > > +
> > > +     /* We found zero-CHAR so need to copy then zfill (we know we
> > > +        didn't cover all of length here).  */
> > > +     bsf     %VRCX, %VRCX
> > > +L(movsb_and_zfill):
> > > +     incl    %ecx
> > > +     subq    %rcx, %rdx
> > > +# ifdef USE_AS_STPCPY
> > > +     leaq    -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
> > > +# else
> > > +     movq    %rdi, %rax
> > > +# endif
> > > +
> > > +     REP_MOVS
> > > +# ifdef USE_AS_WCSCPY
> > > +     movl    $0, (%rdi)
> > > +# else
> > > +     movb    $0, (%rdi)
> > > +# endif
> > > +     jmp     L(zfill_from_page_cross)
> > > +
> > > +L(page_cross_small):
> > > +     tzcnt   %VRCX, %VRCX
> > > +     cmpl    %ecx, %edx
> > > +     jbe     L(page_cross_copy_only)
> > > +
> > > +     /* Do a zfill of the tail before copying.  */
> > > +     movq    %rdi, %r9
> > > +     xorl    %eax, %eax
> > > +
> > > +     movl    %ecx, %r8d
> > > +
> > > +     subl    %ecx, %edx
> > > +     leaq    CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> > > +     movl    %edx, %ecx
> > > +     REP_STOS
> > > +     movq    %r9, %rdi
> > > +     movl    %r8d, %edx
> > > +L(page_cross_copy_only):
> > > +     leal    1(%rdx), %ecx
> > > +# ifdef USE_AS_STPCPY
> > > +#  ifdef USE_AS_WCSCPY
> > > +     adcl    $0, %edx
> > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > +#  else
> > > +     movl    %edx, %eax
> > > +     adcq    %rdi, %rax
> > > +#  endif
> > > +# else
> > > +     movq    %rdi, %rax
> > > +# endif
> > > +     REP_MOVS
> > > +     ret
> > > +
> > > +
> > > +L(best_effort_strncpy):
> > > +     movq    %rdx, %rcx
> > > +     xorl    %eax, %eax
> > > +     movq    %rdi, %r8
> > > +     /* The length is >= 2^63. We very much expect to segfault at
> > > +        rep stos. If that doesn't happen then just strcpy to finish.
> > > +      */
> > > +     REP_STOS
> > > +     movq    %r8, %rdi
> > > +     jmp     OVERFLOW_STRCPY
> > > +END(STRNCPY)
> > > +#endif
> > > diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> > > new file mode 100644
> > > index 0000000000..d5ff4cbe50
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> >
> > Please add a copyright notice.
> >
> > > @@ -0,0 +1,65 @@
> > > +#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
> > > +#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
> > > +
> > > +#if defined USE_MULTIARCH && IS_IN(libc)
> > > +#  define UNDERSCORES __
> > > +#  ifdef USE_WITH_SSE2
> > > +#    define ISA_EXT _sse2
> > > +#  elif defined USE_WITH_AVX
> > > +#    ifdef USE_WITH_RTM
> > > +#      define ISA_EXT _avx_rtm
> > > +#    else
> > > +#      define ISA_EXT _avx
> > > +#    endif
> > > +#  elif defined USE_WITH_AVX2
> >
> > Do we have a function with both AVX and AVX2 versions? If not, should
> > keep just 1.
> >
> > > +#    ifdef USE_WITH_RTM
> > > +#      define ISA_EXT _avx2_rtm
> > > +#    else
> > > +#      define ISA_EXT _avx2
> > > +#    endif
> > > +
> > > +#  elif defined USE_WITH_EVEX256
> > > +#    define ISA_EXT _evex
> > > +#  elif defined USE_WITH_EVEX512
> > > +#    define ISA_EXT _evex512
> > > +#  endif
> > > +#else
> > > +#  define UNDERSCORES
> > > +#  define ISA_EXT
> > > +#endif
> > > +
> > > +#ifdef USE_AS_WCSCPY
> > > +#  define STRCPY_PREFIX wc
> > > +#  define STRCAT_PREFIX wcs
> > > +#  ifdef USE_AS_STPCPY
> > > +#    define STRCPY_POSTFIX pcpy
> > > +#  else
> > > +#    define STRCPY_POSTFIX scpy
> > > +#  endif
> > > +#else
> > > +#  define STRCPY_PREFIX st
> > > +#  define STRCAT_PREFIX str
> > > +#  ifdef USE_AS_STPCPY
> > > +#    define STRCPY_POSTFIX pcpy
> > > +#  else
> > > +#    define STRCPY_POSTFIX rcpy
> > > +#  endif
> > > +#endif
> > > +#define STRCAT_POSTFIX cat
> > > +
> > > +#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
> > > +  underscores##prefix##postfix##ext
> > > +
> > > +#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
> > > +
> > > +#ifndef OVERFLOW_STRCPY
> > > +#  define OVERFLOW_STRCPY                                                     \
> > > +    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
> > > +#endif
> > > +
> > > +#ifndef OVERFLOW_STRCAT
> > > +#  define OVERFLOW_STRCAT                                                     \
> > > +    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
> > > +#endif
> > > +
> > > +#endif
> > > --
> > > 2.34.1
> > >
> >
> > H.J.



-- 
H.J.
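
For reference, the naming machinery in strncpy-or-cat-overflow-def.h
quoted above can be exercised in isolation.  Below is a minimal,
self-contained sketch (a throwaway test program, not part of the
patch), assuming a multiarch EVEX256 build of plain strncpy, i.e.
UNDERSCORES = __, STRCPY_PREFIX = st, STRCPY_POSTFIX = rcpy and
ISA_EXT = _evex:

    #include <stdio.h>

    #define UNDERSCORES __
    #define STRCPY_PREFIX st
    #define STRCPY_POSTFIX rcpy
    #define ISA_EXT _evex

    /* Same two-level paste as in the header: the outer macro lets the
       arguments expand before ## glues them together.  */
    #define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \
      underscores##prefix##postfix##ext
    #define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
    #define OVERFLOW_STRCPY \
      OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)

    #define STRINGIFY_1(x) #x
    #define STRINGIFY(x) STRINGIFY_1 (x)

    int
    main (void)
    {
      /* Prints "__strcpy_evex": the fallback symbol the strncpy code
         jumps to when the length argument is implausibly large.  */
      puts (STRINGIFY (OVERFLOW_STRCPY));
      return 0;
    }

With USE_AS_STPCPY defined the same machinery would name __stpcpy_evex,
and with USE_AS_WCSCPY it would name __wcscpy_evex.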

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v4 1/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-03  8:53 ` [PATCH v1 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
  2022-11-03  8:55   ` Noah Goldstein
@ 2022-11-04 23:04   ` Noah Goldstein
  2022-11-04 23:04     ` [PATCH v4 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
                       ` (3 more replies)
  2022-11-09  1:38   ` [PATCH v5 " Noah Goldstein
  2 siblings, 4 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 23:04 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
    1. Use more overlapping stores to avoid branches (a short sketch
       of the idea follows this list).
    2. Reduce how unrolled the aligning copies are (this is more of a
       code-size save; it's a negative for some sizes in terms of
       perf).
    3. Improve the loop a bit (similar to what we do in strlen with
       2x vpminu + kortest instead of 3x vpminu + kmov + test).
    4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
       number that are taken.
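
As referenced in item 1, here is a standalone sketch of the
"overlapping stores" idea in plain C (not taken from the patch; the
helper name is made up): a copy of n bytes with 8 <= n <= 16 is done
with one store from the start and one store ending exactly at the
tail, with no branch on n.

        #include <stdio.h>
        #include <string.h>

        static void
        copy_8_16 (char *dst, const char *src, size_t n)
        {
          /* The two 8-byte chunks may overlap in the middle; together
             they cover exactly n bytes without branching on n.  */
          memcpy (dst, src, 8);
          memcpy (dst + n - 8, src + n - 8, 8);
        }

        int
        main (void)
        {
          char out[32] = { 0 };
          copy_8_16 (out, "overlapping!", 13);  /* 13 bytes incl. null.  */
          puts (out);                           /* -> overlapping!  */
          return 0;
        }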

Performance Changes:

    Times are from N = 10 runs of the benchmark suite and are
    reported as the geometric mean of all ratios of
    New Implementation / Old Implementation (a small sketch of this
    computation follows the ratios below).

    stpcpy-evex      -> 0.922
    strcat-evex      -> 0.985
    strcpy-evex      -> 0.880

    strncpy-evex     -> 0.831
    stpncpy-evex     -> 0.780

    strncat-evex     -> 0.958
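
    (Sketch of the reporting above; standalone C, not part of the
     benchmark suite, and the timing arrays are made-up placeholders.)

        #include <math.h>
        #include <stdio.h>

        int
        main (void)
        {
          /* Hypothetical per-entry timings, new vs. old implementation.  */
          double new_times[] = { 10.1, 23.4, 57.9 };
          double old_times[] = { 12.6, 27.0, 71.2 };
          size_t n = sizeof (new_times) / sizeof (new_times[0]);
          double log_sum = 0.0;

          for (size_t i = 0; i < n; i++)
            log_sum += log (new_times[i] / old_times[i]);

          /* Ratios below 1.0 mean the new implementation is faster.
             Build with: gcc geomean.c -lm  */
          printf ("geomean ratio: %f\n", exp (log_sum / n));
          return 0;
        }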

Code Size Changes:
    function         -> Bytes New / Bytes Old -> Ratio

    strcat-evex      ->  819 / 1874 -> 0.437
    strcpy-evex      ->  700 / 1074 -> 0.652
    stpcpy-evex      ->  735 / 1094 -> 0.672

    strncpy-evex     -> 1397 / 2611 -> 0.535
    stpncpy-evex     -> 1489 / 2691 -> 0.553

    strncat-evex     -> 1184 / 2832 -> 0.418

Notes:
    1. Because of the significant difference between the
       implementations they are split into three files.

           strcpy-evex.S    -> strcpy, stpcpy, strcat
           strncpy-evex.S   -> strncpy
           strncat-evex.S   -> strncat

       I couldn't find a way to merge them without making the
       ifdefs incredibly difficult to follow.

    2. All implementations can be made evex512 by including
       "x86-evex512-vecs.h" at the top (see the sketch after these
       notes).

    3. All implementations have an optional define:
        `USE_EVEX_MASKED_STORE`
       Setting to one uses evex-masked stores for handling short
       strings.  This saves code size and branches.  It's disabled
       for all implementations at the moment as there are some
       serious drawbacks to masked stores in certain cases, but
       that may be fixed on future architectures.
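
As a concrete illustration of note 2 (the file and symbol names here
are hypothetical, not added by this patch), an evex512 flavor would
follow the same wrapper pattern used by stpncpy-evex.S below, just
selecting the 512-bit VEC macros before pulling in the common body:

        #ifndef STRNCPY
        # define STRNCPY	__strncpy_evex512
        #endif
        /* Selects the 512-bit VEC macros; the evex256 header in the
           common code is only included #ifndef VEC_SIZE.  */
        #include "x86-evex512-vecs.h"
        #include "strncpy-evex.S"

For note 3, USE_EVEX_MASKED_STORE is the `# define ... 0` near the top
of strcpy-evex.S; flipping it to 1 switches the short-string handling
to the masked-store variants.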

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
 sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
 sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
 sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
 sysdeps/x86_64/multiarch/strncat-evex.S       |  525 ++++++-
 sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
 .../multiarch/strncpy-or-cat-overflow-def.h   |   65 +
 6 files changed, 1990 insertions(+), 1173 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h

diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
index 99ea76a372..3693491baa 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
@@ -3,6 +3,5 @@
 #endif
 
 #define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY	STPNCPY
-#include "strcpy-evex.S"
+#define STRNCPY	STPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
index 0e2df947e9..b4207b7889 100644
--- a/sysdeps/x86_64/multiarch/strcat-evex.S
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -1,286 +1,7 @@
-/* strcat with 256-bit EVEX instructions.
-   Copyright (C) 2021-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-#  define STRCAT  __strcat_evex
-# endif
-
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
-/* zero register */
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMM0		ymm17
-# define YMM1		ymm18
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE	32
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRCAT)
-	mov	%rdi, %r9
-# ifdef USE_AS_STRNCAT
-	mov	%rdx, %r8
-# endif
-
-	xor	%eax, %eax
-	mov	%edi, %ecx
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-	cmp	$(VEC_SIZE * 3), %ecx
-	ja	L(fourth_vector_boundary)
-	vpcmpb	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_first_vector)
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	jmp	L(align_vec_size_start)
-L(fourth_vector_boundary):
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	vpcmpb	$0, (%rax), %YMMZERO, %k0
-	mov	$-1, %r10d
-	sub	%rax, %rcx
-	shl	%cl, %r10d
-	kmovd	%k0, %edx
-	and	%r10d, %edx
-	jnz	L(exit)
-
-L(align_vec_size_start):
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 4), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	kmovd	%k4, %edx
-	add	$(VEC_SIZE * 4), %rax
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 4), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 5), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	add	$VEC_SIZE, %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	add	$VEC_SIZE, %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
-	add	$VEC_SIZE, %rax
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	add	$VEC_SIZE, %rax
-
-	.p2align 4
-L(align_four_vec_loop):
-	VMOVA	(%rax), %YMM0
-	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
-	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
-	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
-	vpminub	%YMM0, %YMM1, %YMM0
-	/* If K0 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM0, %YMMZERO, %k0
-	add	$(VEC_SIZE * 4), %rax
-	ktestd	%k0, %k0
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
-	sub	$(VEC_SIZE * 5), %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit):
-	sub	%rdi, %rax
-L(exit_null_on_first_vector):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_second_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$VEC_SIZE, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_third_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 2), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fourth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 3), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fifth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-
-	.p2align 4
-L(StartStrcpyPart):
-	lea	(%r9, %rax), %rdi
-	mov	%rsi, %rcx
-	mov	%r9, %rax      /* save result */
-
-# ifdef USE_AS_STRNCAT
-	test	%r8, %r8
-	jz	L(ExitZero)
-#  define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-evex.S"
+#ifndef STRCAT
+# define STRCAT	__strcat_evex
 #endif
+
+#define USE_AS_STRCAT
+#define STRCPY	STRCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
index 82e45ac675..932129ab40 100644
--- a/sysdeps/x86_64/multiarch/strcpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -1,4 +1,4 @@
-/* strcpy with 256-bit EVEX instructions.
+/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
    Copyright (C) 2021-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,990 +17,526 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <isa-level.h>
-
 #if ISA_SHOULD_BUILD (4)
 
 
-# ifndef USE_AS_STRCAT
-#  include <sysdep.h>
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+	/* Use movsb in page cross case to save code size.  */
+# define USE_MOVSB_IN_PAGE_CROSS	1
 
-#  ifndef STRCPY
-#   define STRCPY  __strcpy_evex
-#  endif
+# include <sysdep.h>
 
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
-/* Number of bytes in a vector register */
-# ifndef VEC_SIZE
-#  define VEC_SIZE	32
+# ifndef STRCPY
+#  define STRCPY	__strcpy_evex
 # endif
 
-# define XMM2		xmm18
-# define XMM3		xmm19
 
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-# define YMM7		ymm23
+# ifdef USE_AS_WCSCPY
+#  define VMOVU_MASK	vmovdqu32
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
 
-# ifndef USE_AS_STRCAT
+#  define REP_MOVS	rep movsd
 
-/* zero register */
-#  define XMMZERO	xmm16
-#  define YMMZERO	ymm16
-#  define YMM1		ymm17
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRCPY)
-#  ifdef USE_AS_STRNCPY
-	mov	%RDX_LP, %R8_LP
-	test	%R8_LP, %R8_LP
-	jz	L(ExitZero)
-#  endif
-	mov	%rsi, %rcx
-#  ifndef USE_AS_STPCPY
-	mov	%rdi, %rax      /* save result */
-#  endif
+#  define USE_WIDE_CHAR
+# else
+#  define VMOVU_MASK	vmovdqu8
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
 
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+#  define REP_MOVS	rep movsb
 # endif
 
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	cmp	$(VEC_SIZE * 2), %ecx
-	jbe	L(SourceStringAlignmentLessTwoVecSize)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-
-	vpcmpb	$0, (%rsi), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	shr	%cl, %rdx
+# include "reg-macros.h"
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	mov	$VEC_SIZE, %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  else
-	mov	$(VEC_SIZE + 1), %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  endif
-	jbe	L(CopyVecSizeTailCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail)
-
-	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
-	kmovd	%k1, %edx
 
-# ifdef USE_AS_STRNCPY
-	add	$VEC_SIZE, %r10
-	cmp	%r10, %r8
-	jbe	L(CopyTwoVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize)
-
-	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
-	VMOVU	%YMM2, (%rdi)
-
-/* If source address alignment != destination address alignment */
-	.p2align 4
-L(UnalignVecSizeBoth):
-	sub	%rcx, %rdi
-# ifdef USE_AS_STRNCPY
-	add	%rcx, %r8
-	sbb	%rcx, %rcx
-	or	%rcx, %r8
-# endif
-	mov	$VEC_SIZE, %rcx
-	VMOVA	(%rsi, %rcx), %YMM2
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 3), %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STPCPY
+#  define END_REG	rax
 # else
-	jnz	L(CopyVecSize)
+#  define END_REG	rdi, %rdx, CHAR_SIZE
 # endif
 
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
+# ifdef USE_AS_STRCAT
+#  define PAGE_ALIGN_REG	edx
+#  define PAGE_ALIGN_REG_64	rdx
 # else
-	jnz	L(CopyVecSize)
+#  define PAGE_ALIGN_REG	eax
+#  define PAGE_ALIGN_REG_64	rax
 # endif
 
-	VMOVU	%YMM3, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
-	vpcmpb	$0, %YMM4, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
 
-	VMOVU	%YMM4, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
 
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
-	VMOVU	%YMM2, (%rdi, %rcx)
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
 
-	VMOVU	%YMM3, (%rdi, %rcx)
-	mov	%rsi, %rdx
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	and	$-(VEC_SIZE * 4), %rsi
-	sub	%rsi, %rdx
-	sub	%rdx, %rdi
-# ifdef USE_AS_STRNCPY
-	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
-	VMOVA	(%rsi), %YMM4
-	VMOVA	VEC_SIZE(%rsi), %YMM5
-	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
-	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
-	vpminub	%YMM5, %YMM4, %YMM2
-	vpminub	%YMM7, %YMM6, %YMM3
-	vpminub	%YMM2, %YMM3, %YMM2
-	/* If K7 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM2, %YMMZERO, %k7
-	kmovd	%k7, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+# ifdef USE_AS_STRCAT
+	movq	%rdi, %rax
+#  include "strcat-strlen-evex.h.S"
 # endif
-	test	%edx, %edx
-	jnz	L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
-	add	$(VEC_SIZE * 4), %rdi
-	add	$(VEC_SIZE * 4), %rsi
-	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
-	VMOVA	(%rsi), %YMM4
-	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
-	VMOVA	VEC_SIZE(%rsi), %YMM5
-	vpminub	%YMM5, %YMM4, %YMM2
-	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
-	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
-	VMOVU	%YMM7, -VEC_SIZE(%rdi)
-	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
-	vpminub	%YMM7, %YMM6, %YMM3
-	vpminub	%YMM2, %YMM3, %YMM2
-	/* If K7 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM2, %YMMZERO, %k7
-	kmovd	%k7, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
+
+	movl	%esi, %PAGE_ALIGN_REG
+	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 # endif
-	test	%edx, %edx
-	jz	L(UnalignedFourVecSizeLoop_start)
 
-L(UnalignedFourVecSizeLeave):
-	vpcmpb	$0, %YMM4, %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_0)
 
-	vpcmpb	$0, %YMM5, %YMMZERO, %k2
-	kmovd	%k2, %ecx
-	test	%ecx, %ecx
-	jnz	L(CopyVecSizeUnaligned_16)
+	/* Two short string implementations. One with traditional
+	   branching approach and one with masked instructions (which
+	   have potential for dramatically bad perf if dst splits a
+	   page and is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	VPTEST	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+#  ifdef USE_AS_WCSCPY
+	subl	$((1 << CHAR_PER_VEC)- 1), %VRCX
+#  else
+	inc	%VRCX
+#  endif
+	jz	L(more_1x_vec)
+	KMOV	%VRCX, %k1
+	KXOR	%k0, %k1, %k1
 
-	vpcmpb	$0, %YMM6, %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_32)
-
-	vpcmpb	$0, %YMM7, %YMMZERO, %k4
-	kmovd	%k4, %ecx
-	bsf	%ecx, %edx
-	VMOVU	%YMM4, (%rdi)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 3), %rsi
-	add	$(VEC_SIZE * 3), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
 
-/* If source address alignment == destination address alignment */
+#  ifdef USE_AS_STPCPY
+	bsf	%VRCX, %VRCX
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
+#  endif
+	ret
 
-L(SourceStringAlignmentLessTwoVecSize):
-	VMOVU	(%rsi), %YMM3
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
+# else
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$VEC_SIZE, %r8
+	xorl	%edx, %edx
+	bsf	%VRCX, %VRDX
+#  ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  endif
+
+	/* Use mask bits in rcx to detect which copy we need. If the low
+	   mask is zero then there must be a bit set in the upper half.
+	   I.e. if rcx != 0 and ecx == 0, then the match must be in the upper 32
+	   bits so we use L(copy_32_63).  */
+#  if VEC_SIZE == 64
+#   ifdef USE_AS_WCSCPY
+	testb	%cl, %cl
+#   else
+	testl	%ecx, %ecx
+#   endif
+	jz	L(copy_32_63)
+#  endif
+
+#  ifdef USE_AS_WCSCPY
+	testb	$0xf, %cl
 #  else
-	cmp	$(VEC_SIZE + 1), %r8
+	testw	%cx, %cx
 #  endif
-	jbe	L(CopyVecSizeTail1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail1)
+	jz	L(copy_16_31)
 
-	VMOVU	%YMM3, (%rdi)
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$(VEC_SIZE * 2), %r8
+#  ifdef USE_AS_WCSCPY
+	testb	$0x3, %cl
 #  else
-	cmp	$((VEC_SIZE * 2) + 1), %r8
+	testb	%cl, %cl
 #  endif
-	jbe	L(CopyTwoVecSize1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize1)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-	jmp	L(UnalignVecSizeBoth)
+	jz	L(copy_8_15)
 
-/*------End of main part with loops---------------------*/
 
-/* Case1 */
+#  ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	/* No need to copy, we know it's zero.  */
+	movl	$0, (%END_REG)
 
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
-	.p2align 4
-L(CopyVecSize):
-	add	%rcx, %rdi
-# endif
-L(CopyVecSizeTail):
-	add	%rcx, %rsi
-L(CopyVecSizeTail1):
-	bsf	%edx, %edx
-L(CopyVecSizeExit):
-	cmp	$32, %edx
-	jae	L(Exit32_63)
-	cmp	$16, %edx
-	jae	L(Exit16_31)
-	cmp	$8, %edx
-	jae	L(Exit8_15)
-	cmp	$4, %edx
-	jae	L(Exit4_7)
-	cmp	$3, %edx
-	je	L(Exit3)
-	cmp	$1, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	movb	$0, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$1, %r8
-	lea	1(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
 	ret
+#  else
 
-	.p2align 4
-L(CopyTwoVecSize1):
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$VEC_SIZE, %r8
-# endif
-	jmp	L(CopyVecSizeTail1)
-
-	.p2align 4
-L(CopyTwoVecSize):
-	bsf	%edx, %edx
-	add	%rcx, %rsi
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	jmp	L(CopyVecSizeExit)
-
-	.p2align 4
-L(CopyVecSizeUnaligned_0):
-	bsf	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM4, (%rdi)
-	add	$((VEC_SIZE * 4) - 1), %r8
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	jmp	L(CopyVecSizeExit)
-# endif
+	testb	$0x7, %cl
+	jz	L(copy_4_7)
 
-	.p2align 4
-L(CopyVecSizeUnaligned_16):
-	bsf	%ecx, %edx
-	VMOVU	%YMM4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	VEC_SIZE(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	add	$((VEC_SIZE * 3) - 1), %r8
-	sub	%rdx, %r8
-	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
 
-	.p2align 4
-L(CopyVecSizeUnaligned_32):
-	bsf	%edx, %edx
-	VMOVU	%YMM4, (%rdi)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	add	$((VEC_SIZE * 2) - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 2), %rsi
-	add	$(VEC_SIZE * 2), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
+	test	%edx, %edx
+	jz	L(set_null_term)
 
-# ifdef USE_AS_STRNCPY
-#  ifndef USE_AS_STRCAT
-	.p2align 4
-L(CopyVecSizeUnalignedVec6):
-	VMOVU	%YMM6, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec5):
-	VMOVU	%YMM5, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec4):
-	VMOVU	%YMM4, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec3):
-	VMOVU	%YMM3, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
+	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
+	 */
+	vmovd	%VMM_128(0), %esi
+	movw	%si, (%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	/* No need to copy, we know it's zero.  */
+	movb	$0, (%END_REG)
+	ret
 #  endif
 
-/* Case2 */
-
-	.p2align 4
-L(CopyVecSizeCase2):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTailCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTail1Case2):
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-/* Case2 or Case3,  Case3 */
-
-	.p2align 4
-L(CopyVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeCase2)
-L(CopyVecSizeCase3):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyTwoVecSizeCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyVecSizeTailCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTailCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSize1Case2OrCase3):
-	add	$VEC_SIZE, %rdi
-	add	$VEC_SIZE, %rsi
-	sub	$VEC_SIZE, %r8
-L(CopyVecSizeTail1Case2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTail1Case2)
-	jmp	L(StrncpyExit)
+#  if VEC_SIZE == 64
+	.p2align 4,, 6
+L(copy_32_63):
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
+	ret
+#  endif
+
+
+	.p2align 4,, 6
+L(copy_16_31):
+	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
+	   and will save code size.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
+	ret
+
+	.p2align 4,, 8
+L(copy_8_15):
+#  ifdef USE_AS_WCSCPY
+	movl	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+#  else
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
+#  endif
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
+	ret
 # endif
 
-/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
 
-	.p2align 4
-L(Exit1):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
+# ifndef USE_AS_WCSCPY
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%ecx, -(4 - CHAR_SIZE)(%END_REG)
+	ret
 # endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$2, %r8
-	lea	2(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rdi)
 # endif
-	ret
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+	addq	%rsi, %rdi
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
 
-	.p2align 4
-L(Exit2):
-	movzwl	(%rsi), %ecx
-	mov	%cx, (%rdi)
-	movb	$0, 2(%rdi)
+	/* Ideally we store after moves to minimize impact of potential
+	   false-dependencies.  */
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rax)
+# endif
+
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+
+
+	/* Align for 4x loop.  */
+	subq	%rsi, %rdi
+
+	/* + VEC_SIZE * 5 because we never added the original VEC_SIZE
+	   we covered before aligning.  */
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$-(VEC_SIZE * 4), %rsi
+
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x0_end)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	/* Place L(ret_vec_x4) here to save code size.  We get a
+	   meaningful benefit doing this for stpcpy.  */
+	KMOV	%k4, %VRDX
+L(ret_vec_x3):
+	bsf	%VRDX, %VRDX
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$3, %r8
-	lea	3(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
 # endif
+L(return_end):
 	ret
 
-	.p2align 4
-L(Exit3):
-	mov	(%rsi), %edx
-	mov	%edx, (%rdi)
+	.p2align 4,, 6
+L(ret_vec_x0_end):
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_STPCPY
-	lea	3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$4, %r8
-	lea	4(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
+	inc	%VRCX
+	VMOVU	(-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
 	ret
 
-	.p2align 4
-L(Exit4_7):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	-3(%rsi, %rdx), %ecx
-	mov	%ecx, -3(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX
+	VMOVU	(VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit8_15):
-	mov	(%rsi), %rcx
-	mov	-7(%rsi, %rdx), %r9
-	mov	%rcx, (%rdi)
-	mov	%r9, -7(%rdi, %rdx)
+	.p2align 4,, 4
+L(ret_vec_x2):
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit16_31):
-	VMOVU	(%rsi), %XMM2
-	VMOVU	-15(%rsi, %rdx), %XMM3
-	VMOVU	%XMM2, (%rdi)
-	VMOVU	%XMM3, -15(%rdi, %rdx)
+	/* ret_vec_x3 reuses return code after the loop.  */
+	.p2align 4,, 6
+L(ret_vec_x4):
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub %rdx, %r8
-	sub $1, %r8
-	lea 1(%rdi, %rdx), %rdi
-	jnz L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit32_63):
-	VMOVU	(%rsi), %YMM2
-	VMOVU	-31(%rsi, %rdx), %YMM3
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, -31(%rdi, %rdx)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
+
+	.p2align 4,, 4
+L(page_cross):
+# ifndef USE_AS_STRCAT
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
 # endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	movq	%rsi, %rcx
+	andq	$(VEC_SIZE * -1), %rcx
+
+	VPCMPEQ	(%rcx), %VZERO, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSCPY
+	andl	$(VEC_SIZE - 1), %PAGE_ALIGN_REG
+	shrl	$2, %PAGE_ALIGN_REG
 # endif
-	ret
+	shrx	%VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
 
-# ifdef USE_AS_STRNCPY
+# if USE_MOVSB_IN_PAGE_CROSS
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
 
-	.p2align 4
-L(StrncpyExit1):
-	movzbl	(%rsi), %edx
-	mov	%dl, (%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 1(%rdi)
+	/* This adds one to the later result, which gives the correct
+	   copy bounds. NB: this can never zero out a non-zero RCX as,
+	   to be in the page cross case, rsi cannot be aligned and we
+	   already right-shifted rcx by the misalignment.  */
+	shl	%VRCX
+	jz	L(page_cross_continue)
+#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-	ret
+	bsf	%VRCX, %VRCX
+	REP_MOVS
 
-	.p2align 4
-L(StrncpyExit2):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
 #  ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 2(%rdi)
+	leaq	-CHAR_SIZE(%rdi), %rax
 #  endif
 	ret
 
-	.p2align 4
-L(StrncpyExit3_4):
-	movzwl	(%rsi), %ecx
-	movzwl	-2(%rsi, %r8), %edx
-	mov	%cx, (%rdi)
-	mov	%dx, -2(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
 
-	.p2align 4
-L(StrncpyExit5_8):
-	mov	(%rsi), %ecx
-	mov	-4(%rsi, %r8), %edx
-	mov	%ecx, (%rdi)
-	mov	%edx, -4(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
+# else
+	/* Check if we found zero-char before end of page.  */
+	test	%VRCX, %VRCX
+	jz	L(page_cross_continue)
 
-	.p2align 4
-L(StrncpyExit9_16):
-	mov	(%rsi), %rcx
-	mov	-8(%rsi, %r8), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
+	/* Traditional copy case, essentially same as used in non-page-
+	   cross case but since we can't reuse VMM(0) we need twice as
+	   many loads from rsi.  */
 
-	.p2align 4
-L(StrncpyExit17_32):
-	VMOVU	(%rsi), %XMM2
-	VMOVU	-16(%rsi, %r8), %XMM3
-	VMOVU	%XMM2, (%rdi)
-	VMOVU	%XMM3, -16(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
+#  ifndef USE_AS_STRCAT
+	xorl	%edx, %edx
 #  endif
-	ret
-
-	.p2align 4
-L(StrncpyExit33_64):
-	/*  0/32, 31/16 */
-	VMOVU	(%rsi), %YMM2
-	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
+	/* Dependency on rdi must already have been satisfied.  */
+	bsf	%VRCX, %VRDX
 #  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  elif !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
 
-	.p2align 4
-L(StrncpyExit65):
-	/* 0/32, 32/32, 64/1 */
-	VMOVU	(%rsi), %YMM2
-	VMOVU	32(%rsi), %YMM3
-	mov	64(%rsi), %cl
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, 32(%rdi)
-	mov	%cl, 64(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	65(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 65(%rdi)
+#  if VEC_SIZE == 64
+#   ifdef USE_AS_WCSCPY
+	testb	%cl, %cl
+#   else
+	test	%ecx, %ecx
+#   endif
+	jz	L(page_cross_copy_32_63)
 #  endif
-	ret
-
-#  ifndef USE_AS_STRCAT
 
-	.p2align 4
-L(Fill1):
-	mov	%dl, (%rdi)
-	ret
+#  ifdef USE_AS_WCSCPY
+	testb	$0xf, %cl
+#  else
+	testw	%cx, %cx
+#  endif
+	jz	L(page_cross_copy_16_31)
 
-	.p2align 4
-L(Fill2):
-	mov	%dx, (%rdi)
-	ret
+#  ifdef USE_AS_WCSCPY
+	testb	$0x3, %cl
+#  else
+	testb	%cl, %cl
+#  endif
+	jz	L(page_cross_copy_8_15)
 
-	.p2align 4
-L(Fill3_4):
-	mov	%dx, (%rdi)
-	mov     %dx, -2(%rdi, %r8)
+#  ifdef USE_AS_WCSCPY
+	movl	(%rsi), %esi
+	movl	%esi, (%rdi)
+	movl	$0, (%END_REG)
 	ret
+#  else
 
-	.p2align 4
-L(Fill5_8):
-	mov	%edx, (%rdi)
-	mov     %edx, -4(%rdi, %r8)
-	ret
+	testb	$0x7, %cl
+	jz	L(page_cross_copy_4_7)
 
-	.p2align 4
-L(Fill9_16):
-	mov	%rdx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
+	test	%edx, %edx
+	jz	L(page_cross_set_null_term)
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+L(page_cross_set_null_term):
+	movb	$0, (%END_REG)
 	ret
 
-	.p2align 4
-L(Fill17_32):
-	VMOVU	%XMMZERO, (%rdi)
-	VMOVU	%XMMZERO, -16(%rdi, %r8)
-	ret
 
-	.p2align 4
-L(CopyVecSizeUnalignedVec2):
-	VMOVU	%YMM2, (%rdi, %rcx)
-
-	.p2align 4
-L(CopyVecSizeVecExit):
-	bsf	%edx, %edx
-	add	$(VEC_SIZE - 1), %r8
-	add	%rcx, %rdi
-#   ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-#   endif
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-
-	.p2align 4
-L(StrncpyFillTailWithZero):
-	xor	%edx, %edx
-	sub	$VEC_SIZE, %r8
-	jbe	L(StrncpyFillExit)
-
-	VMOVU	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-
-	mov	%rdi, %rsi
-	and	$(VEC_SIZE - 1), %esi
-	sub	%rsi, %rdi
-	add	%rsi, %r8
-	sub	$(VEC_SIZE * 4), %r8
-	jb	L(StrncpyFillLessFourVecSize)
-
-L(StrncpyFillLoopVmovdqa):
-	VMOVA	%YMMZERO, (%rdi)
-	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
-	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
-	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE * 4), %rdi
-	sub	$(VEC_SIZE * 4), %r8
-	jae	L(StrncpyFillLoopVmovdqa)
-
-L(StrncpyFillLessFourVecSize):
-	add	$(VEC_SIZE * 2), %r8
-	jl	L(StrncpyFillLessTwoVecSize)
-	VMOVA	%YMMZERO, (%rdi)
-	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
-	add	$(VEC_SIZE * 2), %rdi
-	sub	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	VMOVA	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillLessTwoVecSize):
-	add	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	VMOVA	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillExit):
-	add	$VEC_SIZE, %r8
-L(Fill):
-	cmp	$17, %r8d
-	jae	L(Fill17_32)
-	cmp	$9, %r8d
-	jae	L(Fill9_16)
-	cmp	$5, %r8d
-	jae	L(Fill5_8)
-	cmp	$3, %r8d
-	jae	L(Fill3_4)
-	cmp	$1, %r8d
-	ja	L(Fill2)
-	je	L(Fill1)
+	.p2align 4,, 4
+L(page_cross_copy_4_7):
+	movl	(%rsi), %ecx
+	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, -(4 - CHAR_SIZE)(%END_REG)
 	ret
-
-/* end of ifndef USE_AS_STRCAT */
 #  endif
 
-	.p2align 4
-L(UnalignedLeaveCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(UnalignedFourVecSizeLeaveCase2)
-L(UnalignedFourVecSizeLeaveCase3):
-	lea	(VEC_SIZE * 4)(%r8), %rcx
-	and	$-VEC_SIZE, %rcx
-	add	$(VEC_SIZE * 3), %r8
-	jl	L(CopyVecSizeCase3)
-	VMOVU	%YMM4, (%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 4)(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (VEC_SIZE * 4)(%rdi)
-#  endif
+#  if VEC_SIZE == 64
+	.p2align 4,, 4
+L(page_cross_copy_32_63):
+	VMOVU	(%rsi), %VMM_256(0)
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
 	ret
-
-	.p2align 4
-L(UnalignedFourVecSizeLeaveCase2):
-	xor	%ecx, %ecx
-	vpcmpb	$0, %YMM4, %YMMZERO, %k1
-	kmovd	%k1, %edx
-	add	$(VEC_SIZE * 3), %r8
-	jle	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-	vpcmpb	$0, %YMM5, %YMMZERO, %k2
-	kmovd	%k2, %edx
-	VMOVU	%YMM4, (%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec5)
-#  else
-	jnz	L(CopyVecSize)
 #  endif
 
-	vpcmpb	$0, %YMM6, %YMMZERO, %k3
-	kmovd	%k3, %edx
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec6)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-
-	vpcmpb	$0, %YMM7, %YMMZERO, %k4
-	kmovd	%k4, %edx
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	lea	VEC_SIZE(%rdi, %rcx), %rdi
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-L(StrncpyExit):
-	cmp	$65, %r8d
-	je	L(StrncpyExit65)
-	cmp	$33, %r8d
-	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8d
-	jae	L(StrncpyExit17_32)
-	cmp	$9, %r8d
-	jae	L(StrncpyExit9_16)
-	cmp	$5, %r8d
-	jae	L(StrncpyExit5_8)
-	cmp	$3, %r8d
-	jae	L(StrncpyExit3_4)
-	cmp	$1, %r8d
-	ja	L(StrncpyExit2)
-	je	L(StrncpyExit1)
-#  ifdef USE_AS_STPCPY
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi)
-#  endif
+	.p2align 4,, 4
+L(page_cross_copy_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
 	ret
 
-	.p2align 4
-L(ExitZero):
-#  ifndef USE_AS_STRCAT
-	mov	%rdi, %rax
-#  endif
+	.p2align 4,, 4
+L(page_cross_copy_8_15):
+	movq	(%rsi), %rcx
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+	movq	%rcx, (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%END_REG)
 	ret
-
-# endif
-
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
 # endif
+END(STRCPY)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
index 203a19bf21..9aed2d9970 100644
--- a/sysdeps/x86_64/multiarch/strncat-evex.S
+++ b/sysdeps/x86_64/multiarch/strncat-evex.S
@@ -1,7 +1,520 @@
-#ifndef STRNCAT
-# define STRNCAT	__strncat_evex
-#endif
+/* {wcs|str}ncat  with 256/512-bit EVEX.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+# ifndef STRNCAT
+#  define STRNCAT	__strncat_evex
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+#  define movNULL	movl
+#  define VMOVU_MASK	vmovdqu32
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+
+#  define REP_MOVS	rep movsd
+
+#  define VMASK_REG	VR10
+#  define FIND_FIRST_ONE(src, dst)	movl $CHAR_PER_VEC, %dst; bsf %src, %dst
+
+#  define USE_WIDE_CHAR
+# else
+#  define movNULL	movb
+#  define VMOVU_MASK	vmovdqu8
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+
+#  define REP_MOVS	rep movsb
+
+#  define VMASK_REG	VRCX
+#  define FIND_FIRST_ONE(src, dst)	tzcnt %src, %dst
+
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# include "reg-macros.h"
+
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+	movq	%rdi, %rax
+
+	/* NB: It's safe to filter out zero-length strings WITHOUT
+	   setting null-term. Destination MUST be a null-terminated
+	   string so essentially the work is already done.  */
+# ifdef USE_AS_WCSCPY
+	leaq	-1(%rdx), %rcx
+	shrq	$56, %rcx
+	jnz	L(zero_len)
+# else
+	test	%rdx, %rdx
+	jle	L(zero_len)
+# endif
+
+# include "strcat-strlen-evex.h.S"
+
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPTESTN	%VMM(0), %VMM(0), %k0
+
+	/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
+	   <= CHAR_PER_VEC with masked instructions (which have
+	   potential for dramatically bad perf if dst splits a page and
+	   is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	KMOV	%k0, %VRCX
+	FIND_FIRST_ONE (VRCX, VR8)
+	cmpq	%r8, %rdx
+	jbe	L(less_1x_vec)
+
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
+
+	blsmsk	%VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
+	ret
+
+L(less_1x_vec):
+	mov	$-1, %VRCX
+	bzhi	%VRDX, %VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
+
+	ret
+# else
+	KMOV	%k0, %VMASK_REG
+	/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
+	   %VMASK_REG, %VRCX` for wcsncat.  */
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpq	%rcx, %rdx
+	jbe	L(less_1x_vec)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	je	L(more_1x_vec)
+
+	movl	%ecx, %edx
+
+L(less_1x_vec):
+#  if VEC_SIZE == 64
+	cmpl	$(32 / CHAR_SIZE), %edx
+	jae	L(copy_32_63)
+#  endif
+
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(copy_16_31)
+
+
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jae	L(copy_8_15)
+
+#  ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  else
+
+	cmpl	$4, %edx
+	jae	L(copy_4_7)
+
+	movzbl	(%rsi), %ecx
+	cmpl	$1, %edx
+	jbe	L(set_null_term)
+
+	movzwl	1(%rsi), %esi
+	movw	%si, 1(%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	movb	%cl, (%rdi)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+#  endif
+
+#  if VEC_SIZE == 64
+	.p2align 4,, 6
+L(copy_32_63):
+	VMOVU	-(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  endif
+	.p2align 4,, 6
+L(copy_16_31):
+	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
+	   and will save code size.  */
+	vmovdqu	-(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 2
+L(copy_8_15):
+	movq	-(8)(%rsi, %rdx, CHAR_SIZE), %rcx
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+
+#  ifndef USE_AS_WCSCPY
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-(4)(%rsi, %rdx, CHAR_SIZE), %ecx
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
+	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  endif
+
+# endif
+	.p2align 4,, 4
+L(zero_len):
+# ifdef USE_AS_WCSCPY
+	test	%rdx, %rdx
+# endif
+	jne	OVERFLOW_STRCAT
+	ret
 
-#define USE_AS_STRNCAT
-#define STRCAT	STRNCAT
-#include "strcat-evex.S"
+	.p2align 4,, 8
+L(more_1x_vec):
+	VMOVU	%VMM(0), (%rdi)
+
+	/* We are going to align rsi here so will need to be able to re-
+	   adjust rdi/rdx afterwards. NB: We filtered out huge lengths
+	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
+
+	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+L(loop_last_4x_vec):
+	addq	%rsi, %rdi
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+
+	/* Will need this regardless.  */
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VMASK_REG
+
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	jne	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	addl	$-CHAR_PER_VEC, %edx
+	bzhi	%VRDX, %VRCX, %VR8
+	jz	L(ret_vec_x2_len)
+L(ret_vec_x2):
+	bsf	%VRCX, %VRDX
+L(ret_vec_x2_len):
+	VMOVU	(VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 4
+L(ret_vec_x1_len):
+	movl	%edx, %ecx
+L(ret_vec_x1):
+	VMOVU	(VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	movNULL	$0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	addl	$-(CHAR_PER_VEC * 4), %edx
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VMASK_REG
+	subq	$-(VEC_SIZE * 4), %rsi
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+# ifdef USE_AS_WCSCPY
+	xorl	%ecx, %ecx
+# endif
+	bsf	%VMASK_REG, %VRCX
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VMASK_REG
+
+	cmpq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(more_4x_vec)
+
+	/* Adjust length before going to L(ret_vec_x3_len) or
+	   L(ret_vec_x3).  */
+	addl	$(CHAR_PER_VEC * -2), %edx
+
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	jne	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	addl	$-CHAR_PER_VEC, %edx
+	bzhi	%VRDX, %VRCX, %VR8
+	jz	L(ret_vec_x4_len)
+L(ret_vec_x4):
+	bsf	%VRCX, %VRDX
+L(ret_vec_x4_len):
+	VMOVU	(VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 4
+L(ret_vec_x3_len):
+	movl	%edx, %ecx
+L(ret_vec_x3):
+	VMOVU	(VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+# ifdef USE_AS_WCSCPY
+	xorl	%ecx, %ecx
+# endif
+	bsf	%VMASK_REG, %VRCX
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+
+	/* Check if we are near the end before aligning.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rdx
+	jbe	L(last_4x_vec)
+
+
+	/* Add rsi to rdx (length) before aligning rsi. NB: Since we
+	   filtered out huge lengths this cannot overflow.  */
+# ifdef USE_AS_WCSCPY
+	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rsi, %rdx
+# endif
+
+	/* Subtract rsi from rdi before aligning (add back will have
+	   correct rdi for aligned rsi).  */
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+
+	/* Offset rsi by VEC_SIZE so that we can jump to
+	   L(loop_last_4x_vec).  */
+	addq	$-(VEC_SIZE), %rsi
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	/* Store loop end in r9.  */
+	leaq	-(VEC_SIZE * 5)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+
+	/* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
+	   test with bsf.  */
+	bsf	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
+
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	bsf	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+
+	KMOV	%k4, %VRCX
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %k0
+
+# ifdef USE_AS_WCSCPY
+	KMOV	%k0, %VR9
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+	shrx	%VRCX, %VR9, %VRCX
+# else
+	KMOV	%k0, %VRCX
+	shrx	%VRSI, %VRCX, %VRCX
+# endif
+
+	subl	%esi, %r8d
+	andl	$(VEC_SIZE - 1), %r8d
+# ifdef USE_AS_WCSCPY
+	shrl	$2, %r8d
+# endif
+	cmpq	%r8, %rdx
+	jbe	L(page_cross_small)
+	/* Optimizing more for space as this is very cold code. This
+	   saves 2x cache lines.  */
+
+	/* This adds one to the later result which will get correct
+	   copy bounds. NB: this can never zero-out a non-zero RCX as
+	   to be in the page cross case rsi cannot be aligned and we
+	   already right-shift rcx by the misalignment.  */
+	shl	%VRCX
+	jz	L(page_cross_continue)
+	bsf	%VRCX, %VRCX
+	REP_MOVS
+	ret
+
+L(page_cross_small):
+	tzcnt	%VRCX, %VRCX
+	jz	L(page_cross_setz)
+	cmpl	%edx, %ecx
+	cmova	%edx, %ecx
+
+# ifdef USE_AS_WCSCPY
+	rep	movsd
+# else
+	rep	movsb
+# endif
+L(page_cross_setz):
+	movNULL	$0, (%rdi)
+	ret
+END(STRNCAT)
+#endif
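
A rough C rendering of the length filter at the top of the wide-char
(wcsncat) entry above, illustration only: (n - 1) >> 56 is non-zero
both when n == 0 (the subtraction wraps) and when n - 1 needs more
than 56 bits, so a single shift feeds both the zero-length early
return and the OVERFLOW_STRCAT fallback.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors `leaq -1(%rdx), %rcx; shrq $56, %rcx; jnz L(zero_len)`.  */
    static bool
    len_is_zero_or_huge (uint64_t n)
    {
      return ((n - 1) >> 56) != 0;
    }

    int
    main (void)
    {
      /* Prints "1 0 1": n == 0 and huge n are filtered, normal n is not.  */
      printf ("%d %d %d\n", len_is_zero_or_huge (0),
              len_is_zero_or_huge (100),
              len_is_zero_or_huge ((uint64_t) 1 << 60));
      return 0;
    }
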
diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
index 1b3426d511..49eaf4cbd9 100644
--- a/sysdeps/x86_64/multiarch/strncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
@@ -1,7 +1,990 @@
-#ifndef STRNCPY
-# define STRNCPY	__strncpy_evex
-#endif
+/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+
+
+# include <sysdep.h>
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNCPY
+#  define STRNCPY	__strncpy_evex
+# endif
+
+# ifdef USE_AS_WCSCPY
+#  define VMOVU_MASK	vmovdqu32
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define CHAR_SIZE	4
+
+#  define REP_MOVS	rep movsd
+#  define REP_STOS	rep stosl
+
+#  define USE_WIDE_CHAR
+
+# else
+#  define VMOVU_MASK	vmovdqu8
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define CHAR_SIZE	1
+
+#  define REP_MOVS	rep movsb
+#  define REP_STOS	rep stosb
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+
+# define VZERO	VMM(7)
+# define VZERO_256	VMM_256(7)
+# define VZERO_128	VMM_128(7)
+
+# if VEC_SIZE == 64
+#  define VZERO_HALF	VZERO_256
+# else
+#  define VZERO_HALF	VZERO_128
+# endif
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+	/* Filter out zero length strings and very long strings.  Zero
+	   length strings just return.  Very long strings are handled by
+	   running rep stos{b|l} to zero-fill the destination (which will
+	   almost certainly segfault); if that somehow succeeds, finish
+	   by calling OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
+# ifdef USE_AS_WCSCPY
+	decq	%rdx
+	movq	%rdx, %rax
+	/* 56 is end of max supported address space.  */
+	shr	$56, %rax
+	jnz	L(zero_len)
+# else
+	decq	%rdx
+	/* If the branch below ever needs to become `jb`, replace the
+	   `dec` with `sub` (dec does not set the carry flag).  */
+	jl	L(zero_len)
+# endif
+
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
+	movl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+
+	/* If not STPCPY just save the return value (dst) ahead of time.  */
+# ifndef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+
+
+	cmpq	$(CHAR_PER_VEC), %rdx
+
+	/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
+	   <= CHAR_PER_VEC with masked instructions (which have
+	   potential for dramatically bad perf if dst splits a page and
+	   is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	/* `jae` because length rdx is now length - 1.  */
+	jae	L(more_1x_vec)
+
+	/* If there were multiple zero-CHAR matches in the first VEC,
+	   VRCX will be overset but that's fine since any oversets were
+	   at zero-positions anyway.  */
+
+#  ifdef USE_AS_STPCPY
+	tzcnt	%VRCX, %VRAX
+	cmpl	%eax, %edx
+	cmovb	%edx, %eax
+#   ifdef USE_AS_WCSCPY
+	adcl	$0, %eax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#   else
+	adcq	%rdi, %rax
+#   endif
+#  endif
+	dec	%VRCX
+
+	/* Zero out all non-zero CHARs after the first zero match.  */
+	KMOV	%VRCX, %k1
+
+	/* Use VZERO as destination so this can be reused for
+	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
+	   will have zeroed out VZERO).  */
+	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
+L(zfill_less_vec):
+	/* Get mask for what we need to set.  */
+	incl	%edx
+	mov	$-1, %VRCX
+	bzhi	%VRDX, %VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	VMOVU_MASK %VZERO, (%rdi){%k1}
+	ret
+
+	.p2align 4,, 4
+L(zero_len):
+	cmpq	$-1, %rdx
+	jne	L(best_effort_strncpy)
+	movq	%rdi, %rax
+	ret
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# else
+	/* `jb` because length rdx is now length - 1.  */
+	jb	L(less_1x_vec)
+# endif
+
+
+	/* This may overset but that's fine because we still need to
+	   zero fill.  */
+	VMOVU	%VMM(0), (%rdi)
+
+
+	/* Length must be >= CHAR_PER_VEC so a match here means we must
+	   zero-fill.  */
+	test	%VRCX, %VRCX
+	jnz	L(zfill)
+
+
+	/* We are going to align rsi here so will need to be able to re-
+	   adjust rdi/rdx afterwards. NB: We filtered out huge lengths
+	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
+	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+
+L(loop_last_4x_vec):
+	addq	%rsi, %rdi
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+
+	/* -1 because of the `dec %rdx` earlier.  */
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	/* This will need to be computed no matter what. We do it
+	   ahead of time for CHAR_PER_VEC == 64 because we can't adjust
+	   the value of `tzcnt` with a shift.  */
+# if CHAR_PER_VEC == 64
+	tzcntq	%rcx, %rcx
+# endif
+
+	cmpl	$(CHAR_PER_VEC), %edx
+	jb	L(ret_vec_x1_len)
+
+	/* Separate logic for CHAR_PER_VEC == 64 because we already did
+	   `tzcnt` on VRCX.  */
+# if CHAR_PER_VEC == 64
+	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
+	cmpb	$CHAR_PER_VEC, %cl
+	jnz	L(ret_vec_x1_no_bsf)
+# else
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+# endif
+
+
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %VZERO, %k0
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	KMOV	%k0, %VRCX
+
+# if CHAR_PER_VEC < 64
+	/* This essentially adds CHAR_PER_VEC to the computed result.  */
+	shlq	$CHAR_PER_VEC, %rcx
+# else
+	tzcntq	%rcx, %rcx
+	addl	$CHAR_PER_VEC, %ecx
+# endif
+
+	.p2align 4,, 4
+L(ret_vec_x1_len):
+	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
+	   already been done.  */
+# if CHAR_PER_VEC < 64
+	tzcntq	%rcx, %rcx
+# endif
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x1_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+	VMOVU	((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+	.p2align 4,, 10
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX
+L(ret_vec_x1_no_bsf):
+	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	subl	%ecx, %edx
+	cmpl	$CHAR_PER_VEC, %edx
+	jb	L(ret_vec_x1_len_no_zfill_mov)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
+	   $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
+	   using `movzbl`.  */
+# if CHAR_PER_VEC == 64
+	movzbl	%dl, %edx
+# else
+	andl	$(CHAR_PER_VEC * 4 - 1), %edx
+# endif
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+	subq	$-(VEC_SIZE * 4), %rsi
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2 - 1), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	test	%VRCX, %VRCX
+	/* Must fill at least 2x VEC.  */
+	jnz	L(zfill_vec1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	/* Must fill at least 1x VEC.  */
+	jnz	L(zfill_vec2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRCX
+
+	/* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rdx
+	ja	L(more_4x_vec)
+
+	subl	$(CHAR_PER_VEC * 3), %edx
+	jb	L(ret_vec_x3_len)
+
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rsi), %VZERO, %k0
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	KMOV	%k0, %VRCX
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x4_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	movl	%ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 4 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+L(ret_vec_x3_len):
+	addl	$(CHAR_PER_VEC * 1), %edx
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x3_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+	.p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 3 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsf	%VRCX, %VRCX
+	VMOVU	%VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
+	subl	%ecx, %edx
+	jl	L(ret_vec_x3_len_no_zfill_mov)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec4)
 
-#define USE_AS_STRNCPY
-#define STRCPY	STRNCPY
-#include "strcpy-evex.S"
+	/* Recheck length before aligning.  */
+	cmpq	$(CHAR_PER_VEC * 8 - 1), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
+# ifdef USE_AS_WCSCPY
+	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rsi, %rdx
+# endif
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+
+
+	/* Offset rsi by VEC_SIZE so that we can jump to
+	   L(loop_last_4x_vec).  */
+	addq	$-(VEC_SIZE), %rsi
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	/* Store loop end in r9.  */
+	leaq	-(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	/* Restore rdx (length).  */
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec1)
+
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec2)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec3)
+
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
+	KMOV	%k4, %VRCX
+	/* Fall through to L(zfill_vec4) to zero-fill the rest.  */
+
+	.p2align 4,, 4
+L(zfill_vec4):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -2), %rdx
+L(zfill_vec2):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -1), %rdx
+L(zfill):
+	/* VRCX must be non-zero.  */
+	bsf	%VRCX, %VRCX
+
+	/* Adjust length / dst for zfill.  */
+	subq	%rcx, %rdx
+# ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+	addq	%rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_from_page_cross):
+
+	/* From here on out it's just memset(rdi, 0, rdx).  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jb	L(zfill_less_vec)
+
+L(zfill_more_1x_vec):
+	VMOVU	%VZERO, (%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
+	ja	L(zfill_more_2x_vec)
+L(zfill_done0):
+	ret
+
+	/* Coming from vec1/vec2 we must be able to zfill at least 2x
+	   VEC.  */
+	.p2align 4,, 8
+L(zfill_vec3):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -2), %rdx
+	.p2align 4,, 2
+L(zfill_vec1):
+	bsfq	%rcx, %rcx
+	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
+	 */
+	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+
+
+	VMOVU	%VZERO, (%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jb	L(zfill_done0)
+L(zfill_more_2x_vec):
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VZERO, (VEC_SIZE)(%rdi)
+	subq	$(CHAR_PER_VEC * 4 - 1), %rdx
+	jbe	L(zfill_done)
+
+# ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rdi, %rdx
+# endif
+
+	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi)
+
+
+	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	jbe	L(zfill_done)
+
+	/* Align rdi and zfill loop.  */
+	andq	$-(VEC_SIZE), %rdi
+	.p2align 4,, 12
+L(zfill_loop_4x_vec):
+	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	ja	L(zfill_loop_4x_vec)
+L(zfill_done):
+	ret
+
+
+	/* Less 1x VEC case if we are not using evex masked store.  */
+# if !USE_EVEX_MASKED_STORE
+	.p2align 4,, 8
+L(copy_1x):
+	/* Special case for copy 1x. It can be handled quickly and many
+	   buffer sizes have convenient alignment.  */
+	VMOVU	%VMM(0), (%rdi)
+	/* If no zeros then we are done.  */
+	testl	%ecx, %ecx
+	jz	L(ret_1x_1x)
+
+	/* Need to zfill; note we know that length <= CHAR_PER_VEC so we
+	   only handle the small case here.  */
+	bsf	%VRCX, %VRCX
+L(zfill_less_vec_no_bsf):
+	/* Adjust length / dst then just zfill less_vec.  */
+	subq	%rcx, %rdx
+#  ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+
+L(zfill_less_vec):
+	cmpl	$((VEC_SIZE / 2) / CHAR_SIZE), %edx
+	jb	L(zfill_less_half)
+
+	VMOVU	%VZERO_HALF, (%rdi)
+	VMOVU	%VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	ret
+#  ifdef USE_AS_STPCPY
+L(ret_1x_1x):
+	leaq	CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
+	ret
+#  endif
+
+
+#  if VEC_SIZE == 64
+	.p2align 4,, 4
+L(copy_32_63):
+	/* Overfill to avoid branches.  */
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+
+	/* We are taking advantage of the fact that to be here we must
+	   be writing the null-term at (%rdi, %rcx), so we have a byte
+	   of leeway for overwriting.  */
+	cmpl	%ecx, %edx
+	ja	L(zfill_less_vec_no_bsf)
+#   ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+#   else
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  endif
+
+	.p2align 4,, 4
+L(copy_16_31):
+	/* Overfill to avoid branches.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpl	%ecx, %edx
+
+	/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
+	   we have a larger copy block for 32-63 so this just falls
+	   through to zfill 16-31. If VEC_SIZE == 32 then we check for
+	   full zfill of less than 1x VEC.  */
+#  if VEC_SIZE == 64
+	jbe	L(ret_16_31)
+	subl	%ecx, %edx
+#   ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#   else
+	addq	%rcx, %rdi
+#   endif
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+L(zfill_less_half):
+L(zfill_less_32):
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jb	L(zfill_less_16)
+	VMOVU	%VZERO_128, (%rdi)
+	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+#   ifdef USE_AS_STPCPY
+	ret
+#   endif
+L(ret_16_31):
+#   ifdef USE_AS_STPCPY
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  else
+	/* VEC_SIZE == 32 begins.  */
+	ja	L(zfill_less_vec_no_bsf)
+#   ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+#   else
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  endif
+
+
+	.p2align 4,, 4
+L(copy_8_15):
+	/* Overfill to avoid branches.  */
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpl	%ecx, %edx
+	jbe	L(ret_8_15)
+	subl	%ecx, %edx
+#  ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+	.p2align 4,, 8
+#  if VEC_SIZE == 32
+L(zfill_less_half):
+#  endif
+L(zfill_less_16):
+	xorl	%ecx, %ecx
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jb	L(zfill_less_8)
+	movq	%rcx, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+#  ifndef USE_AS_STPCPY
+L(ret_8_15):
+#  endif
+	ret
+
+	.p2align 4,, 8
+L(less_1x_vec):
+	je	L(copy_1x)
+
+	/* We will need the `tzcnt` result for all other copy sizes.  */
+	tzcnt	%VRCX, %VRCX
+#  if VEC_SIZE == 64
+	cmpl	$(32 / CHAR_SIZE), %edx
+	jae	L(copy_32_63)
+#  endif
+
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(copy_16_31)
+
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jae	L(copy_8_15)
+#  ifdef USE_AS_WCSCPY
+	testl	%ecx, %ecx
+	jz	L(zfill_less_8_set_ret)
+
+	movl	(%rsi, %rdx, CHAR_SIZE), %esi
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%esi, (%rdi, %rdx, CHAR_SIZE)
+#   ifdef USE_AS_STPCPY
+	cmpl	%ecx, %edx
+L(ret_8_15):
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#   endif
+	ret
+L(zfill_less_8_set_ret):
+	xorl	%ecx, %ecx
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+L(zfill_less_8):
+	movl	%ecx, (%rdi)
+	movl	%ecx, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  else
+	cmpl	$3, %edx
+	jb	L(copy_0_3)
+	/* Overfill to avoid branches.  */
+	movl	-3(%rsi, %rdx), %esi
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%esi, -3(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_4_7)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+	xorl	%ecx, %ecx
+	.p2align 4,, 8
+L(zfill_less_8):
+	cmpl	$3, %edx
+	jb	L(zfill_less_3)
+	movl	%ecx, (%rdi)
+	movl	%ecx, -3(%rdi, %rdx)
+#   ifdef USE_AS_STPCPY
+	ret
+#   endif
+
+L(ret_4_7):
+#   ifdef USE_AS_STPCPY
+L(ret_8_15):
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#   endif
+	ret
+
+	.p2align 4,, 4
+L(zfill_less_3):
+	testl	%edx, %edx
+	jz	L(zfill_1)
+	movw	%cx, (%rdi)
+L(zfill_1):
+	movb	%cl, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_0_3):
+	vmovd	%VMM_128(0), %r8d
+	testl	%edx, %edx
+	jz	L(copy_1)
+	movw	%r8w, (%rdi)
+	cmpl	%ecx, %edx
+	ja	L(zfill_from_1)
+	movzbl	(%rsi, %rdx), %r8d
+#   ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+	movb	%r8b, (%rdi, %rdx)
+	ret
+#   endif
+
+L(copy_1):
+#   ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	cmpl	%ecx, %edx
+	adcq	%rdi, %rax
+#   endif
+#   ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+#   else
+	movb	%r8b, (%rdi, %rdx)
+#   endif
+	ret
+#  endif
+
+
+#  ifndef USE_AS_WCSCPY
+	.p2align 4,, 8
+L(zfill_from_1):
+#   ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+#   endif
+	movw	$0, -1(%rdi, %rdx)
+	ret
+#  endif
+
+	.p2align 4,, 4
+L(zero_len):
+	incq	%rdx
+	jne	L(best_effort_strncpy)
+	movq	%rdi, %rax
+	ret
+# endif
+
+
+	.p2align 4,, 4
+	.p2align 6,, 8
+L(page_cross):
+	movq	%rsi, %rax
+	andq	$(VEC_SIZE * -1), %rax
+	VPCMPEQ	(%rax), %VZERO, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSCPY
+	movl	%esi, %r8d
+	shrl	$2, %r8d
+	andl	$(CHAR_PER_VEC - 1), %r8d
+	shrx	%VR8, %VRCX, %VRCX
+# else
+	shrx	%VRSI, %VRCX, %VRCX
+# endif
+
+	/* Compute the number of bytes we checked.  */
+	subl	%esi, %eax
+	andl	$(VEC_SIZE - 1), %eax
+# ifdef USE_AS_WCSCPY
+	shrl	$2, %eax
+# endif
+
+	/* If rax > rdx then we are finishing the copy at the end of the
+	   page.  */
+	cmpq	%rax, %rdx
+	jb	L(page_cross_small)
+
+
+	/* If rcx is non-zero then continue.  */
+	test	%VRCX, %VRCX
+	jz	L(page_cross_continue)
+
+	/* We found a zero-CHAR so we need to copy then zfill (we know
+	   we didn't cover all of the length here).  */
+	bsf	%VRCX, %VRCX
+L(movsb_and_zfill):
+	incl	%ecx
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	leaq	-CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
+# else
+	movq	%rdi, %rax
+# endif
+
+	REP_MOVS
+# ifdef USE_AS_WCSCPY
+	movl	$0, (%rdi)
+# else
+	movb	$0, (%rdi)
+# endif
+	jmp	L(zfill_from_page_cross)
+
+L(page_cross_small):
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(page_cross_copy_only)
+
+	/* Do a zfill of the tail before copying.  */
+	movq	%rdi, %r9
+	xorl	%eax, %eax
+
+	movl	%ecx, %r8d
+
+	subl	%ecx, %edx
+	leaq	CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+	movl	%edx, %ecx
+	REP_STOS
+	movq	%r9, %rdi
+	movl	%r8d, %edx
+L(page_cross_copy_only):
+	leal	1(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcl	$0, %edx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# else
+	movq	%rdi, %rax
+# endif
+	REP_MOVS
+	ret
+
+
+L(best_effort_strncpy):
+	movq	%rdx, %rcx
+	xorl	%eax, %eax
+	movq	%rdi, %r8
+	/* The length is >= 2^63. We very much expect to segfault at
+	   rep stos. If that doesn't happen then just strcpy to
+	   finish.  */
+	REP_STOS
+	movq	%r8, %rdi
+	jmp	OVERFLOW_STRCPY
+END(STRNCPY)
+#endif
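
For reference, the C-level semantics that the L(zfill*) paths above
implement: once the source string ends, strncpy pads the destination
out to exactly n characters with zeros.  A plain sketch (not the
optimized code):

    #include <stddef.h>

    char *
    strncpy_ref (char *dst, const char *src, size_t n)
    {
      size_t i = 0;
      /* Copy until the source null-terminator or n characters.  */
      for (; i < n && src[i] != '\0'; i++)
        dst[i] = src[i];
      /* The "zfill" part: pad the remainder with zeros.  */
      for (; i < n; i++)
        dst[i] = '\0';
      return dst;
    }
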
diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
new file mode 100644
index 0000000000..d5ff4cbe50
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
@@ -0,0 +1,65 @@
+#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
+#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
+
+#if defined USE_MULTIARCH && IS_IN(libc)
+#  define UNDERSCORES __
+#  ifdef USE_WITH_SSE2
+#    define ISA_EXT _sse2
+#  elif defined USE_WITH_AVX
+#    ifdef USE_WITH_RTM
+#      define ISA_EXT _avx_rtm
+#    else
+#      define ISA_EXT _avx
+#    endif
+#  elif defined USE_WITH_AVX2
+#    ifdef USE_WITH_RTM
+#      define ISA_EXT _avx2_rtm
+#    else
+#      define ISA_EXT _avx2
+#    endif
+
+#  elif defined USE_WITH_EVEX256
+#    define ISA_EXT _evex
+#  elif defined USE_WITH_EVEX512
+#    define ISA_EXT _evex512
+#  endif
+#else
+#  define UNDERSCORES
+#  define ISA_EXT
+#endif
+
+#ifdef USE_AS_WCSCPY
+#  define STRCPY_PREFIX wc
+#  define STRCAT_PREFIX wcs
+#  ifdef USE_AS_STPCPY
+#    define STRCPY_POSTFIX pcpy
+#  else
+#    define STRCPY_POSTFIX scpy
+#  endif
+#else
+#  define STRCPY_PREFIX st
+#  define STRCAT_PREFIX str
+#  ifdef USE_AS_STPCPY
+#    define STRCPY_POSTFIX pcpy
+#  else
+#    define STRCPY_POSTFIX rcpy
+#  endif
+#endif
+#define STRCAT_POSTFIX cat
+
+#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
+  underscores##prefix##postfix##ext
+
+#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
+
+#ifndef OVERFLOW_STRCPY
+#  define OVERFLOW_STRCPY                                                     \
+    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
+#endif
+
+#ifndef OVERFLOW_STRCAT
+#  define OVERFLOW_STRCAT                                                     \
+    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
+#endif
+
+#endif
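
The new header builds the fallback's name by token-pasting four
pieces.  For example, in a plain strncpy-evex build (USE_MULTIARCH,
IS_IN (libc), EVEX256, neither USE_AS_WCSCPY nor USE_AS_STPCPY),
OVERFLOW_STRCPY comes out as __strcpy_evex.  A standalone sketch of
the same two-level pasting, with the pieces fixed to that
configuration (the stringize/print driver is illustration only):

    #include <stdio.h>

    #define UNDERSCORES __
    #define ISA_EXT _evex
    #define STRCPY_PREFIX st
    #define STRCPY_POSTFIX rcpy

    #define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \
      underscores##prefix##postfix##ext
    #define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
    #define OVERFLOW_STRCPY \
      OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)

    /* Stringize only so this demo can print the pasted name.  */
    #define STR_1(x) #x
    #define STR(x) STR_1 (x)

    int
    main (void)
    {
      puts (STR (OVERFLOW_STRCPY));   /* prints "__strcpy_evex" */
      return 0;
    }
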
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v4 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions
  2022-11-04 23:04   ` [PATCH v4 1/4] " Noah Goldstein
@ 2022-11-04 23:04     ` Noah Goldstein
  2022-11-04 23:04     ` [PATCH v4 3/4] x86: Add evex optimized functions for the wchar_t strcpy family Noah Goldstein
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 23:04 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
    1. Use more overlapping stores to avoid branches (see the C
       sketch after this list).
    2. Reduce how unrolled the aligning copies are (this is more of a
       code-size save, it's a negative for some sizes in terms of
       perf).
    3. For st{r|p}n{cat|cpy} re-order the branches to minimize the
       number that are taken.
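
    A rough C equivalent of the overlapping-store idea in point 1 (the
    "Overfill to avoid branches" blocks in the evex patch earlier in
    this series show the same pattern in assembly); sketch only, not
    code from the patch:

        #include <stdint.h>
        #include <string.h>

        /* Copy n bytes for 8 <= n <= 16: one 8-byte move from the
           start and one from the end.  The two stores overlap when
           n < 16, so no per-size branch is needed.  */
        void
        copy_8_to_16 (char *dst, const char *src, size_t n)
        {
          uint64_t head, tail;
          memcpy (&head, src, sizeof head);
          memcpy (&tail, src + n - sizeof tail, sizeof tail);
          memcpy (dst, &head, sizeof head);
          memcpy (dst + n - sizeof tail, &tail, sizeof tail);
        }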

Performance Changes:

    Times are from N = 10 runs of the benchmark suite and are
    reported as the geometric mean of all ratios of
    New Implementation / Old Implementation.
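
    (Each per-function number below is the geometric mean of the
    new/old time ratios, so values under 1.0 mean the new version is
    faster.  A small C sketch of the metric, with made-up timings:)

        #include <math.h>
        #include <stdio.h>

        static double
        geomean_of_ratios (const double *new_t, const double *old_t, int n)
        {
          double log_sum = 0.0;
          for (int i = 0; i < n; i++)
            log_sum += log (new_t[i] / old_t[i]);
          return exp (log_sum / n);
        }

        int
        main (void)
        {
          /* Illustrative numbers only, not benchmark output.  */
          double new_t[] = { 9.1, 14.0, 30.5 };
          double old_t[] = { 10.0, 15.2, 33.0 };
          printf ("%.3f\n", geomean_of_ratios (new_t, old_t, 3));
          return 0;
        }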

    strcat-avx2      -> 0.998
    strcpy-avx2      -> 0.937
    stpcpy-avx2      -> 0.971

    strncpy-avx2     -> 0.793
    stpncpy-avx2     -> 0.775

    strncat-avx2     -> 0.962

Code Size Changes:
    function         -> Bytes New / Bytes Old -> Ratio

    strcat-avx2      ->  685 / 1639 -> 0.418
    strcpy-avx2      ->  560 /  903 -> 0.620
    stpcpy-avx2      ->  592 /  939 -> 0.630

    strncpy-avx2     -> 1176 / 2390 -> 0.492
    stpncpy-avx2     -> 1268 / 2438 -> 0.520

    strncat-avx2     -> 1042 / 2563 -> 0.407

Notes:
    1. Because of the significant difference between the
       implementations they are split into three files.

           strcpy-avx2.S    -> strcpy, stpcpy, strcat
           strncpy-avx2.S   -> strncpy
           strncat-avx2.S   -> strncat

       I couldn't find a way to merge them without making the
       ifdefs incredibly difficult to follow.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
 sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S   |    6 +-
 sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S  |    7 +-
 sysdeps/x86_64/multiarch/stpncpy-avx2.S      |    5 +-
 sysdeps/x86_64/multiarch/strcat-avx2-rtm.S   |   13 +-
 sysdeps/x86_64/multiarch/strcat-avx2.S       |  268 +---
 sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S   |   13 +-
 sysdeps/x86_64/multiarch/strcpy-avx2.S       | 1236 +++++-------------
 sysdeps/x86_64/multiarch/strncat-avx2-rtm.S  |    6 +-
 sysdeps/x86_64/multiarch/strncat-avx2.S      |  424 +++++-
 sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S  |    6 +-
 sysdeps/x86_64/multiarch/strncpy-avx2.S      |  740 ++++++++++-
 sysdeps/x86_64/multiarch/x86-avx-vecs.h      |    5 +-
 sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h |   26 +
 sysdeps/x86_64/multiarch/x86-avx2-vecs.h     |   27 +
 14 files changed, 1548 insertions(+), 1234 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-vecs.h

diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
index 2b9c07a59f..189a288053 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPCPY	__stpcpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "stpcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
index 60a2ccfe53..1b252985e7 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
@@ -1,4 +1,3 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPNCPY	__stpncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "stpncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
index b2f8c19143..a46a8edbe2 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
@@ -3,6 +3,5 @@
 #endif
 
 #define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY	STPNCPY
-#include "strcpy-avx2.S"
+#define STRNCPY	STPNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
index 637fb557c4..94d51d10bd 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
@@ -1,12 +1,3 @@
-#ifndef STRCAT
-# define STRCAT __strcat_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCAT	__strcat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
 #include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
index d9b7fb2a43..3f914fa342 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
@@ -16,266 +16,10 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (3)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-#  define STRCAT  __strcat_avx2
-# endif
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE	32
-
-# ifndef SECTION
-#  define SECTION(p)	p##.avx
-# endif
-
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCAT)
-	mov	%rdi, %r9
-# ifdef USE_AS_STRNCAT
-	mov	%rdx, %r8
-# endif
-
-	xor	%eax, %eax
-	mov	%edi, %ecx
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	vpxor	%xmm6, %xmm6, %xmm6
-	cmp	$(VEC_SIZE * 3), %ecx
-	ja	L(fourth_vector_boundary)
-	vpcmpeqb (%rdi), %ymm6, %ymm0
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_first_vector)
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	jmp	L(align_vec_size_start)
-L(fourth_vector_boundary):
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	vpcmpeqb	(%rax), %ymm6, %ymm0
-	mov	$-1, %r10d
-	sub	%rax, %rcx
-	shl	%cl, %r10d
-	vpmovmskb %ymm0, %edx
-	and	%r10d, %edx
-	jnz	L(exit)
-
-L(align_vec_size_start):
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 5), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	add	$VEC_SIZE, %rax
-
-	.p2align 4
-L(align_four_vec_loop):
-	vmovaps	(%rax),	%ymm4
-	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
-	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
-	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
-	add	$(VEC_SIZE * 4),	%rax
-	vpminub	%ymm4,	%ymm5, %ymm5
-	vpcmpeqb %ymm5,	%ymm6, %ymm5
-	vpmovmskb %ymm5,	%edx
-	test	%edx,	%edx
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
-	sub	$(VEC_SIZE * 5),	%rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit):
-	sub	%rdi, %rax
-L(exit_null_on_first_vector):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_second_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$VEC_SIZE, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_third_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 2), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fourth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 3), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fifth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-
-	.p2align 4
-L(StartStrcpyPart):
-	lea	(%r9, %rax), %rdi
-	mov	%rsi, %rcx
-	mov	%r9, %rax      /* save result */
-
-# ifdef USE_AS_STRNCAT
-	test	%r8, %r8
-	jz	L(ExitZero)
-#  define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-avx2.S"
+#ifndef STRCAT
+# define STRCAT	__strcat_avx2
 #endif
+
+#define USE_AS_STRCAT
+#define STRCPY	STRCAT
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
index c2c581ecf7..fe80ffd265 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
@@ -1,12 +1,3 @@
-#ifndef STRCPY
-# define STRCPY __strcpy_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCPY	__strcpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
 #include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index c725834929..048d55fad7 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -20,984 +20,378 @@
 
 #if ISA_SHOULD_BUILD (3)
 
+# include <sysdep.h>
 
-# ifndef USE_AS_STRCAT
-#  include <sysdep.h>
-
-#  ifndef STRCPY
-#   define STRCPY  __strcpy_avx2
-#  endif
-
-# endif
-
-/* Number of bytes in a vector register */
 # ifndef VEC_SIZE
-#  define VEC_SIZE	32
-# endif
-
-# ifndef VZEROUPPER
-#  define VZEROUPPER	vzeroupper
-# endif
-
-# ifndef SECTION
-#  define SECTION(p)	p##.avx
-# endif
-
-/* zero register */
-#define xmmZ	xmm0
-#define ymmZ	ymm0
-
-/* mask register */
-#define ymmM	ymm1
-
-# ifndef USE_AS_STRCAT
-
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCPY)
-#  ifdef USE_AS_STRNCPY
-	mov	%RDX_LP, %R8_LP
-	test	%R8_LP, %R8_LP
-	jz	L(ExitZero)
-#  endif
-	mov	%rsi, %rcx
-#  ifndef USE_AS_STPCPY
-	mov	%rdi, %rax      /* save result */
-#  endif
-
+#  include "x86-avx2-vecs.h"
 # endif
 
-	vpxor	%xmmZ, %xmmZ, %xmmZ
-
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	cmp	$(VEC_SIZE * 2), %ecx
-	jbe	L(SourceStringAlignmentLessTwoVecSize)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-
-	vpcmpeqb (%rsi), %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	shr	%cl, %rdx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	mov	$VEC_SIZE, %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  else
-	mov	$(VEC_SIZE + 1), %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  endif
-	jbe	L(CopyVecSizeTailCase2OrCase3)
+# ifndef STRCPY
+#  define STRCPY	__strcpy_avx2
 # endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail)
 
-	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
-	vpmovmskb %ymm2, %edx
+	/* Use movsb in page cross case to save code size.  */
+# define USE_MOVSB_IN_PAGE_CROSS	1
 
-# ifdef USE_AS_STRNCPY
-	add	$VEC_SIZE, %r10
-	cmp	%r10, %r8
-	jbe	L(CopyTwoVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize)
-
-	vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
-	vmovdqu %ymm2, (%rdi)
-
-/* If source address alignment != destination address alignment */
-	.p2align 4
-L(UnalignVecSizeBoth):
-	sub	%rcx, %rdi
-# ifdef USE_AS_STRNCPY
-	add	%rcx, %r8
-	sbb	%rcx, %rcx
-	or	%rcx, %r8
-# endif
-	mov	$VEC_SIZE, %rcx
-	vmovdqa (%rsi, %rcx), %ymm2
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 3), %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_WCSCPY
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
-	jnz	L(CopyVecSize)
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define PAGE_SIZE	4096
 
-	vmovdqu %ymm3, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
+# ifdef USE_AS_STPCPY
+#  define END_REG	rax
 # else
-	jnz	L(CopyVecSize)
+#  define END_REG	rdi, %rdx
 # endif
 
-	vmovdqu %ymm4, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STRCAT
+#  define PAGE_ALIGN_REG	ecx
 # else
-	jnz	L(CopyVecSize)
+#  define PAGE_ALIGN_REG	eax
 # endif
 
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
 
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
 
-	vmovdqu %ymm3, (%rdi, %rcx)
-	mov	%rsi, %rdx
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	and	$-(VEC_SIZE * 4), %rsi
-	sub	%rsi, %rdx
-	sub	%rdx, %rdi
-# ifdef USE_AS_STRNCPY
-	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
-	vmovdqa (%rsi), %ymm4
-	vmovdqa VEC_SIZE(%rsi), %ymm5
-	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
-	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
-	vpminub %ymm5, %ymm4, %ymm2
-	vpminub %ymm7, %ymm6, %ymm3
-	vpminub %ymm2, %ymm3, %ymm3
-	vpcmpeqb %ymmM, %ymm3, %ymm3
-	vpmovmskb %ymm3, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
-	add	$(VEC_SIZE * 4), %rdi
-	add	$(VEC_SIZE * 4), %rsi
-	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
-	vmovdqa (%rsi), %ymm4
-	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
-	vmovdqa VEC_SIZE(%rsi), %ymm5
-	vpminub %ymm5, %ymm4, %ymm2
-	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
-	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
-	vmovdqu %ymm7, -VEC_SIZE(%rdi)
-	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
-	vpminub %ymm7, %ymm6, %ymm3
-	vpminub %ymm2, %ymm3, %ymm3
-	vpcmpeqb %ymmM, %ymm3, %ymm3
-	vpmovmskb %ymm3, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jz	L(UnalignedFourVecSizeLoop_start)
-
-L(UnalignedFourVecSizeLeave):
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_0)
-
-	vpcmpeqb %ymm5, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %ecx
-	test	%ecx, %ecx
-	jnz	L(CopyVecSizeUnaligned_16)
-
-	vpcmpeqb %ymm6, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_32)
-
-	vpcmpeqb %ymm7, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %ecx
-	bsf	%ecx, %edx
-	vmovdqu %ymm4, (%rdi)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 3), %rsi
-	add	$(VEC_SIZE * 3), %rdi
-	jmp	L(CopyVecSizeExit)
+# ifdef USE_AS_STRCAT
+	movq	%rdi, %rax
+#  include "strcat-strlen-avx2.h.S"
 # endif
 
-/* If source address alignment == destination address alignment */
-
-L(SourceStringAlignmentLessTwoVecSize):
-	vmovdqu (%rsi), %ymm3
-	vmovdqu VEC_SIZE(%rsi), %ymm2
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$VEC_SIZE, %r8
-#  else
-	cmp	$(VEC_SIZE + 1), %r8
-#  endif
-	jbe	L(CopyVecSizeTail1Case2OrCase3)
+	movl	%esi, %PAGE_ALIGN_REG
+	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
+	ja	L(page_cross)
+L(page_cross_continue):
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 # endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail1)
-
-	vmovdqu %ymm3, (%rdi)
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$(VEC_SIZE * 2), %r8
-#  else
-	cmp	$((VEC_SIZE * 2) + 1), %r8
-#  endif
-	jbe	L(CopyTwoVecSize1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize1)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-	jmp	L(UnalignVecSizeBoth)
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
 
-/*------End of main part with loops---------------------*/
+	testl	%ecx, %ecx
+	jz	L(more_1x_vec)
 
-/* Case1 */
+	/* The ymm registers are no longer needed, so vzeroupper here so
+	   it doesn't need to be duplicated at each return statement.  */
+	COND_VZEROUPPER
 
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
-	.p2align 4
-L(CopyVecSize):
-	add	%rcx, %rdi
-# endif
-L(CopyVecSizeTail):
-	add	%rcx, %rsi
-L(CopyVecSizeTail1):
-	bsf	%edx, %edx
-L(CopyVecSizeExit):
-	cmp	$32, %edx
-	jae	L(Exit32_63)
-	cmp	$16, %edx
-	jae	L(Exit16_31)
-	cmp	$8, %edx
-	jae	L(Exit8_15)
-	cmp	$4, %edx
-	jae	L(Exit4_7)
-	cmp	$3, %edx
-	je	L(Exit3)
-	cmp	$1, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	movb	$0, (%rdi)
+	xorl	%edx, %edx
+	bsfl	%ecx, %edx
 # ifdef USE_AS_STPCPY
-	lea	(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$1, %r8
-	lea	1(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
-
-	.p2align 4
-L(CopyTwoVecSize1):
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$VEC_SIZE, %r8
-# endif
-	jmp	L(CopyVecSizeTail1)
-
-	.p2align 4
-L(CopyTwoVecSize):
-	bsf	%edx, %edx
-	add	%rcx, %rsi
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	jmp	L(CopyVecSizeExit)
-
-	.p2align 4
-L(CopyVecSizeUnaligned_0):
-	bsf	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm4, (%rdi)
-	add	$((VEC_SIZE * 4) - 1), %r8
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	jmp	L(CopyVecSizeExit)
-# endif
-
-	.p2align 4
-L(CopyVecSizeUnaligned_16):
-	bsf	%ecx, %edx
-	vmovdqu %ymm4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	VEC_SIZE(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	add	$((VEC_SIZE * 3) - 1), %r8
-	sub	%rdx, %r8
-	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
+	leaq	(%rdi, %rdx), %rax
+# endif
+
+	/* Use mask bits in rcx to detect which copy we need. If the low
+	   mask is zero then there must be a bit set in the upper half.
+	   I.e. if ecx != 0 and cx == 0, the match must be in the upper
+	   16 bits, so we use L(copy_16_31).  */
+	testw	%cx, %cx
+	jz	L(copy_16_31)
+
+	testb	%cl, %cl
+	jz	L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+	vmovd	%xmm0, (%rdi)
+	movl	$0, (%END_REG)
+	ret
 # else
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
-
-	.p2align 4
-L(CopyVecSizeUnaligned_32):
-	bsf	%edx, %edx
-	vmovdqu %ymm4, (%rdi)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	add	$((VEC_SIZE * 2) - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
+	testb	$0x7, %cl
+	jz	L(copy_4_7)
+
+	testl	%edx, %edx
+	jz	L(set_null_term)
+	vmovd	%xmm0, %ecx
+	movw	%cx, (%rdi)
+
+	.p2align 4,, 2
+L(set_null_term):
+	movb	$0, (%END_REG)
+	ret
+
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-3(%rsi, %rdx), %ecx
+	vmovd	%xmm0, (%rdi)
+	movl	%ecx, -3(%END_REG)
+	ret
+# endif
+
+	.p2align 4,, 10
+L(copy_16_31):
+	VMOVU	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
+	ret
+
+	.p2align 4,, 10
+L(copy_8_15):
+# ifdef USE_AS_WCSCPY
+	movl	-(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
 # else
-	add	$(VEC_SIZE * 2), %rsi
-	add	$(VEC_SIZE * 2), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
-
-# ifdef USE_AS_STRNCPY
-#  ifndef USE_AS_STRCAT
-	.p2align 4
-L(CopyVecSizeUnalignedVec6):
-	vmovdqu %ymm6, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec5):
-	vmovdqu %ymm5, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec4):
-	vmovdqu %ymm4, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec3):
-	vmovdqu %ymm3, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-#  endif
-
-/* Case2 */
-
-	.p2align 4
-L(CopyVecSizeCase2):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTailCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTail1Case2):
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-/* Case2 or Case3,  Case3 */
-
-	.p2align 4
-L(CopyVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeCase2)
-L(CopyVecSizeCase3):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyTwoVecSizeCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyVecSizeTailCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTailCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSize1Case2OrCase3):
-	add	$VEC_SIZE, %rdi
-	add	$VEC_SIZE, %rsi
-	sub	$VEC_SIZE, %r8
-L(CopyVecSizeTail1Case2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTail1Case2)
-	jmp	L(StrncpyExit)
-# endif
-
-/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
-
-	.p2align 4
-L(Exit1):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$2, %r8
-	lea	2(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit2):
-	movzwl	(%rsi), %ecx
-	mov	%cx, (%rdi)
-	movb	$0, 2(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$3, %r8
-	lea	3(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit3):
-	mov	(%rsi), %edx
-	mov	%edx, (%rdi)
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
+# endif
+	vmovq	%xmm0, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
+	ret
+
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rdi)
+# endif
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	addq	%rsi, %rdi
+	VMOVA	1(%rsi), %VMM(1)
+
+	/* Try to order stores after as many loads as is reasonable to
+	   avoid potential false dependencies.  */
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rax)
+# endif
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE + 1)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), 1(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE + 1)(%rdi)
+
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %edx
+	testl	%edx, %edx
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 3 + 1)(%rdi)
+
+	/* Subtract rsi from rdi before aligning. Adding the aligned rsi
+	   back gives the proper rdi (dst) for the new src.  */
+	subq	%rsi, %rdi
+	incq	%rsi
+	orq	$(VEC_SIZE * 4 - 1), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %edx
+	addq	%rsi, %rdi
+
+	testl	%edx, %edx
+	jnz	L(loop_4x_done)
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 1)(%rdi)
+
+
+	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %edx
+	subq	$(VEC_SIZE * -4), %rdi
+	testl	%edx, %edx
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
+L(ret_vec_x4):
+	bsfl	%edx, %edx
+	VMOVU	((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
 # ifdef USE_AS_STPCPY
-	lea	3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$4, %r8
-	lea	4(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
 # endif
+L(return_end):
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(Exit4_7):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	-3(%rsi, %rdx), %ecx
-	mov	%ecx, -3(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x1):
+	bsfl	%ecx, %ecx
+	VMOVU	(1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	1(%rcx, %rdi), %rax
 # endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit8_15):
-	mov	(%rsi), %rcx
-	mov	-7(%rsi, %rdx), %r9
-	mov	%rcx, (%rdi)
-	mov	%r9, -7(%rdi, %rdx)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(Exit16_31):
-	vmovdqu (%rsi), %xmm2
-	vmovdqu -15(%rsi, %rdx), %xmm3
-	vmovdqu %xmm2, (%rdi)
-	vmovdqu %xmm3, -15(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x2):
+	bsfl	%ecx, %ecx
+	VMOVU	((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub %rdx, %r8
-	sub $1, %r8
-	lea 1(%rdi, %rdx), %rdi
-	jnz L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
 # endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(Exit32_63):
-	vmovdqu (%rsi), %ymm2
-	vmovdqu -31(%rsi, %rdx), %ymm3
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, -31(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsfl	%ecx, %ecx
+	VMOVU	((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
 # endif
 	VZEROUPPER_RETURN
 
-# ifdef USE_AS_STRNCPY
 
-	.p2align 4
-L(StrncpyExit1):
-	movzbl	(%rsi), %edx
-	mov	%dl, (%rdi)
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %rcx
+	andq	$(VEC_SIZE * -1), %rcx
+
+	VPCMPEQ	(%rcx), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+# if USE_MOVSB_IN_PAGE_CROSS
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* This adds CHAR_SIZE to the later bsf result, which gives the
+	   correct copy bounds (the null terminator is included). NB:
+	   this can never zero out a non-zero RCX because, to be in the
+	   page cross case, rsi cannot be aligned and we have already
+	   right-shifted rcx by the misalignment.  */
+	shll	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsfl	%ecx, %ecx
+#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
+#  endif
+	rep	movsb
 #  ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 1(%rdi)
+	leaq	-CHAR_SIZE(%rdi), %rax
 #  endif
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(StrncpyExit2):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 2(%rdi)
-#  endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(StrncpyExit3_4):
-	movzwl	(%rsi), %ecx
-	movzwl	-2(%rsi, %r8), %edx
-	mov	%cx, (%rdi)
-	mov	%dx, -2(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit5_8):
-	mov	(%rsi), %ecx
-	mov	-4(%rsi, %r8), %edx
-	mov	%ecx, (%rdi)
-	mov	%edx, -4(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit9_16):
-	mov	(%rsi), %rcx
-	mov	-8(%rsi, %r8), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit17_32):
-	vmovdqu (%rsi), %xmm2
-	vmovdqu -16(%rsi, %r8), %xmm3
-	vmovdqu %xmm2, (%rdi)
-	vmovdqu %xmm3, -16(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit33_64):
-	/*  0/32, 31/16 */
-	vmovdqu (%rsi), %ymm2
-	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit65):
-	/* 0/32, 32/32, 64/1 */
-	vmovdqu (%rsi), %ymm2
-	vmovdqu 32(%rsi), %ymm3
-	mov	64(%rsi), %cl
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, 32(%rdi)
-	mov	%cl, 64(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	65(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 65(%rdi)
-#  endif
-	VZEROUPPER_RETURN
+# else
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
 
+	/* Traditional copy case, essentially the same as the
+	   non-page-cross case, but since we can't reuse VMM(0) we need
+	   twice as many loads from rsi.  */
 #  ifndef USE_AS_STRCAT
-
-	.p2align 4
-L(Fill1):
-	mov	%dl, (%rdi)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill2):
-	mov	%dx, (%rdi)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill3_4):
-	mov	%dx, (%rdi)
-	mov     %dx, -2(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill5_8):
-	mov	%edx, (%rdi)
-	mov     %edx, -4(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill9_16):
-	mov	%rdx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill17_32):
-	vmovdqu %xmmZ, (%rdi)
-	vmovdqu %xmmZ, -16(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec2):
-	vmovdqu %ymm2, (%rdi, %rcx)
-
-	.p2align 4
-L(CopyVecSizeVecExit):
-	bsf	%edx, %edx
-	add	$(VEC_SIZE - 1), %r8
-	add	%rcx, %rdi
-#   ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-#   endif
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-
-	.p2align 4
-L(StrncpyFillTailWithZero):
-	xor	%edx, %edx
-	sub	$VEC_SIZE, %r8
-	jbe	L(StrncpyFillExit)
-
-	vmovdqu %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-
-	mov	%rdi, %rsi
-	and	$(VEC_SIZE - 1), %esi
-	sub	%rsi, %rdi
-	add	%rsi, %r8
-	sub	$(VEC_SIZE * 4), %r8
-	jb	L(StrncpyFillLessFourVecSize)
-
-L(StrncpyFillLoopVmovdqa):
-	vmovdqa %ymmZ, (%rdi)
-	vmovdqa %ymmZ, VEC_SIZE(%rdi)
-	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
-	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE * 4), %rdi
-	sub	$(VEC_SIZE * 4), %r8
-	jae	L(StrncpyFillLoopVmovdqa)
-
-L(StrncpyFillLessFourVecSize):
-	add	$(VEC_SIZE * 2), %r8
-	jl	L(StrncpyFillLessTwoVecSize)
-	vmovdqa %ymmZ, (%rdi)
-	vmovdqa %ymmZ, VEC_SIZE(%rdi)
-	add	$(VEC_SIZE * 2), %rdi
-	sub	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	vmovdqa %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillLessTwoVecSize):
-	add	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	vmovdqa %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillExit):
-	add	$VEC_SIZE, %r8
-L(Fill):
-	cmp	$17, %r8d
-	jae	L(Fill17_32)
-	cmp	$9, %r8d
-	jae	L(Fill9_16)
-	cmp	$5, %r8d
-	jae	L(Fill5_8)
-	cmp	$3, %r8d
-	jae	L(Fill3_4)
-	cmp	$1, %r8d
-	ja	L(Fill2)
-	je	L(Fill1)
-	VZEROUPPER_RETURN
-
-/* end of ifndef USE_AS_STRCAT */
+	xorl	%edx, %edx
 #  endif
-
-	.p2align 4
-L(UnalignedLeaveCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(UnalignedFourVecSizeLeaveCase2)
-L(UnalignedFourVecSizeLeaveCase3):
-	lea	(VEC_SIZE * 4)(%r8), %rcx
-	and	$-VEC_SIZE, %rcx
-	add	$(VEC_SIZE * 3), %r8
-	jl	L(CopyVecSizeCase3)
-	vmovdqu %ymm4, (%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+	bsfl	%ecx, %edx
 #  ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 4)(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (VEC_SIZE * 4)(%rdi)
+	leaq	(%rdi, %rdx), %rax
+#  elif !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(UnalignedFourVecSizeLeaveCase2):
-	xor	%ecx, %ecx
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$(VEC_SIZE * 3), %r8
-	jle	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-	vpcmpeqb %ymm5, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm4, (%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec5)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
+	/* vzeroupper early to avoid duplicating at each return.  */
+	COND_VZEROUPPER
 
-	vpcmpeqb %ymm6, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec6)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
+	testw	%cx, %cx
+	jz	L(page_cross_copy_16_31)
 
-	vpcmpeqb %ymm7, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	lea	VEC_SIZE(%rdi, %rcx), %rdi
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-L(StrncpyExit):
-	cmp	$65, %r8d
-	je	L(StrncpyExit65)
-	cmp	$33, %r8d
-	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8d
-	jae	L(StrncpyExit17_32)
-	cmp	$9, %r8d
-	jae	L(StrncpyExit9_16)
-	cmp	$5, %r8d
-	jae	L(StrncpyExit5_8)
-	cmp	$3, %r8d
-	jae	L(StrncpyExit3_4)
-	cmp	$1, %r8d
-	ja	L(StrncpyExit2)
-	je	L(StrncpyExit1)
-#  ifdef USE_AS_STPCPY
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(ExitZero):
-#  ifndef USE_AS_STRCAT
-	mov	%rdi, %rax
-#  endif
-	VZEROUPPER_RETURN
+	testb	%cl, %cl
+	jz	L(page_cross_copy_8_15)
 
-# endif
+	testl	$0x7, %cl
+	jz	L(page_cross_copy_4_7)
 
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
-# endif
+	testl	%edx, %edx
+	jz	L(page_cross_set_null_term)
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+L(page_cross_set_null_term):
+	movb	$0, (%END_REG)
+	ret
+
+	.p2align 4,, 4
+L(page_cross_copy_4_7):
+	movl	(%rsi), %ecx
+	movl	-3(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, -3(%END_REG)
+	ret
+
+	.p2align 4,, 4
+L(page_cross_copy_8_15):
+	movq	(%rsi), %rcx
+	movq	-7(%rsi, %rdx), %rsi
+	movq	%rcx, (%rdi)
+	movq	%rsi, -7(%END_REG)
+	ret
+
+
+	.p2align 4,, 3
+L(page_cross_copy_16_31):
+	VMOVU	(%rsi), %xmm0
+	VMOVU	-15(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -15(%END_REG)
+	ret
+# endif
+
+END(STRCPY)
 #endif
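
Two of the tricks above are easier to see in scalar form: the entry checks
whether a full VEC_SIZE load at src could touch the next page, and the
short-string exits (e.g. L(copy_4_7)) cover a whole size range with two
possibly-overlapping fixed-width accesses instead of a byte loop.  A C sketch
of both, assuming 4 KiB pages (function names are illustrative, not from the
patch):

    #include <stdint.h>
    #include <string.h>

    #define MY_PAGE_SIZE 4096
    #define MY_VEC_SIZE  32

    /* Nonzero if a VEC_SIZE-byte load starting at p might cross into the
       next page, matching the `andl $(PAGE_SIZE - 1); cmpl; ja` check.  */
    static int
    load_may_cross_page (const void *p)
    {
      return ((uintptr_t) p & (MY_PAGE_SIZE - 1)) > MY_PAGE_SIZE - MY_VEC_SIZE;
    }

    /* Branchless copy for strings whose total length including the null
       terminator is known to be in [4, 8]: one 4-byte access at the start
       and one ending exactly at the terminator, overlapping in the
       middle.  */
    static void
    copy_4_7_sketch (char *dst, const char *src, size_t len_with_null)
    {
      uint32_t head, tail;
      memcpy (&head, src, 4);
      memcpy (&tail, src + len_with_null - 4, 4);
      memcpy (dst, &head, 4);
      memcpy (dst + len_with_null - 4, &tail, 4);
    }
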
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
index 0dcea18dbb..2bbdbb91ab 100644
--- a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_avx2_rtm
-#include "strcat-avx2-rtm.S"
+#define STRNCAT	__strncat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "strncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
index 52ecbca943..6af9f9c86f 100644
--- a/sysdeps/x86_64/multiarch/strncat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
@@ -1,7 +1,419 @@
-#ifndef STRNCAT
-# define STRNCAT	__strncat_avx2
-#endif
+/* strncat with AVX2
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-avx2-vecs.h"
+# endif
+
+# ifndef STRNCAT
+#  define STRNCAT	__strncat_avx2
+# endif
+
+# ifdef USE_AS_WCSCPY
+#  define movNULL	movl
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define movNULL	movb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+	/* Filter out zero-length strings and very long strings.  Zero-
+	   length strings just return; very long strings are handled by
+	   using the non-length variant {wcs|str}cat.  */
+	movq	%rdi, %rax
+# ifdef USE_AS_WCSCPY
+	leaq	-1(%rdx), %rcx
+	shr	$56, %rcx
+	jnz	L(zero_len)
+	salq	$2, %rdx
+# else
+	test	%rdx, %rdx
+	jl	L(zero_len)
+# endif
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
+
+# include "strcat-strlen-avx2.h.S"
+
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	tzcnt	%ecx, %r8d
+	cmpq	%r8, %rdx
+	jbe	L(less_1x_vec)
+
+	testl	%ecx, %ecx
+	jz	L(more_1x_vec)
+
+	/* Hoist this to save code size.  */
+
+	movl	%r8d, %edx
+
+L(less_1x_vec):
+	COND_VZEROUPPER
+
+	cmpl	$16, %edx
+	jae	L(copy_16_31)
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
+
+
+# ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+# else
+	cmpl	$4, %edx
+	jae	L(copy_4_7)
+
+	movzbl	(%rsi), %ecx
+	cmpl	$1, %edx
+	jbe	L(set_null_term)
+
+	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
+	 */
+	movzwl	1(%rsi), %esi
+	movw	%si, 1(%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	movb	%cl, (%rdi)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 11
+L(copy_4_7):
+	movl	-(4)(%rsi, %rdx), %ecx
+	vmovd	%xmm0, (%rdi)
+	movl	%ecx, -(4)(%rdi, %rdx)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+# endif
+
+
+	.p2align 4,, 10
+L(copy_16_31):
+	VMOVU	-(16)(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -(16)(%rdi, %rdx)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 10
+L(copy_8_15):
+	movq	-(8)(%rsi, %rdx), %rcx
+	vmovq	%xmm0, (%rdi)
+	movq	%rcx, -(8)(%rdi, %rdx)
+	movNULL	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+	.p2align 6,, 14
+L(more_1x_vec):
+	VMOVU	%VMM(0), (%rdi)
+
+	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	incq	%rsi
+	addq	%rsi, %rdi
+L(loop_last_4x_vec):
+	subq	%rsi, %rdx
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	tzcnt	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len)
+
+	cmpl	$VEC_SIZE, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	addl	$-VEC_SIZE, %edx
+	bzhil	%edx, %ecx, %r8d
+	jz	L(ret_vec_x2_len)
+L(ret_vec_x2):
+	bsfl	%ecx, %edx
+L(ret_vec_x2_len):
+	VMOVU	(%rsi, %rdx), %VMM(0)
+	movNULL	$0, (VEC_SIZE)(%rdi, %rdx)
+	VMOVU	%VMM(0), (%rdi, %rdx)
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+
+	.p2align 4,, 12
+L(ret_vec_x1_len):
+	movl	%edx, %ecx
+L(ret_vec_x1):
+	VMOVU	-(VEC_SIZE)(%rsi, %rcx), %VMM(1)
+	movNULL	$0, (%rdi, %rcx)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rcx)
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	subq	$-(VEC_SIZE * 4), %rsi
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	addl	$-(VEC_SIZE * 4), %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+	/* L(ret_vec_x1) expects ecx to hold the position of the first
+	   match, so test with bsf.  */
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
 
-#define USE_AS_STRNCAT
-#define STRCAT	STRNCAT
-#include "strcat-avx2.S"
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 1)(%rdi)
+
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* Check if length is greater than 4x VEC.  */
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	addl	$(VEC_SIZE * -2), %edx
+
+	tzcnt	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len)
+
+	cmpl	$VEC_SIZE, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	addl	$-VEC_SIZE, %edx
+	bzhil	%edx, %ecx, %r8d
+	jz	L(ret_vec_x4_len)
+L(ret_vec_x4):
+	bsfl	%ecx, %edx
+L(ret_vec_x4_len):
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 4
+L(ret_vec_x3_len):
+	movl	%edx, %ecx
+L(ret_vec_x3):
+	VMOVU	(VEC_SIZE)(%rsi, %rcx), %VMM(0)
+	movNULL	$0, (VEC_SIZE * 2)(%rdi, %rcx)
+	VMOVU	%VMM(0), (VEC_SIZE)(%rdi, %rcx)
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
+
+
+	/* Recheck length before aligning.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %r8d
+	addq	%rsi, %rdi
+	testl	%r8d, %r8d
+	jnz	L(loop_4x_done)
+
+	/* Use r9 for end of region before handling last 4x VEC
+	   specially.  */
+	leaq	-(VEC_SIZE * 4)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %r8d
+
+	testl	%r8d, %r8d
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	/* L(ret_vec_x1) expects ecx to hold the position of the first
+	   match, so test with bsf.  */
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	bsfl	%r8d, %r8d
+	VMOVU	(VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
+	VZEROUPPER_RETURN
+
+
+
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+
+	VPCMPEQ	(%r8), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+
+	subl	%esi, %r8d
+	andl	$(VEC_SIZE - 1), %r8d
+	cmpq	%r8, %rdx
+	jb	L(page_cross_small)
+
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* This adds CHAR_SIZE to the later bsf result, which gives the
+	   correct copy bounds (the null terminator is included). NB:
+	   this can never zero out a non-zero RCX because, to be in the
+	   page cross case, rsi cannot be aligned and we have already
+	   right-shifted rcx by the misalignment.  */
+	shll	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsfl	%ecx, %ecx
+	rep	movsb
+	VZEROUPPER_RETURN
+
+L(page_cross_small):
+	tzcntl	%ecx, %ecx
+	jz	L(page_cross_setz)
+	cmpl	%edx, %ecx
+	cmova	%edx, %ecx
+	rep	movsb
+L(page_cross_setz):
+	movNULL	$0, (%rdi)
+	VZEROUPPER_RETURN
+L(zero_len):
+# ifdef USE_AS_WCSCPY
+	test	%rdx, %rdx
+# endif
+	jnz	OVERFLOW_STRCAT
+	ret
+
+
+END(STRNCAT)
+#endif
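
The zero-length/overflow filter at the top is compact enough to miss: for the
wide-character variant, `lea -1(%rdx); shr $56` rejects both n == 0 (the
subtraction wraps) and any n so large that scaling it to bytes is no longer
meaningful, before the `salq $2`.  A scalar sketch of the same test (the
function name is illustrative, not from the patch):

    #include <stddef.h>

    /* Nonzero when the wide-character count n should take the zero-length /
       overflow path: n == 0 (n - 1 wraps to SIZE_MAX) or n large enough that
       bits 56 and above of n - 1 are set, in which case n * 4 bytes is not a
       meaningful buffer size.  */
    static int
    wcsncat_len_filter (size_t n)
    {
      return ((n - 1) >> 56) != 0;
    }
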
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
index 79e7083299..b582a4a7a1 100644
--- a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STRNCPY	__strncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
index ce634e94fa..d1b25b7a42 100644
--- a/sysdeps/x86_64/multiarch/strncpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
@@ -1,7 +1,735 @@
-#ifndef STRNCPY
-# define STRNCPY	__strncpy_avx2
-#endif
+/* strncpy with AVX2
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+
+# ifndef VEC_SIZE
+#  include "x86-avx2-vecs.h"
+# endif
+
+# ifndef STRNCPY
+#  define STRNCPY	__strncpy_avx2
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+	/* Filter out zero-length strings and very long strings.  Zero-
+	   length strings just return; very long strings are handled by
+	   running rep stos{b|l} to zero out the destination (which will
+	   almost certainly segfault) and, if that somehow succeeds, then
+	   calling OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
+# ifdef USE_AS_WCSCPY
+	decq	%rdx
+	movq	%rdx, %rax
+	/* 56 is end of max supported address space.  */
+	shr	$56, %rax
+	jnz	L(zero_len)
+	salq	$2, %rdx
+# else
+	decq	%rdx
+	/* `dec` can macro-fuse with `jl`. If the branch ever needs to
+	   become `jb`, replace `dec` with `sub`.  */
+	jl	L(zero_len)
+# endif
+
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
+	movl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* If not STPCPY, the return value is just dst, so save it ahead
+	   of time.  */
+# ifndef USE_AS_STPCPY
+	movq	%rdi, %rax
+# elif defined USE_AS_WCSCPY
+	/* Clear the dependency on rax, as nearly all return paths for
+	   wcpncpy use `setc %al`.  */
+	xorl	%eax, %eax
+# endif
+
+	cmpq	$(VEC_SIZE - CHAR_SIZE), %rdx
+	/* `jb` because length rdx is now length - CHAR_SIZE.  */
+	jbe	L(less_1x_vec)
+
+	/* This may store past the end of the string, but that's fine
+	   because we still need to zero fill.  */
+	VMOVU	%VMM(0), (%rdi)
+
+	testl	%ecx, %ecx
+	jnz	L(zfill)
+
+	/* Align.  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	incq	%rsi
+L(last_4x_vec):
+	addq	%rsi, %rdi
+L(loop_last_4x_vec):
+	subq	%rsi, %rdx
+
+
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jae	L(more_2x_vec)
+
+	cmpl	$(VEC_SIZE), %edx
+	jb	L(ret_vec_x1_len)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VPCMPEQ	VEC_SIZE(%rsi), %VZERO, %VMM(6)
+	VMOVU	%VMM(1), (%rdi)
+	vpmovmskb %VMM(6), %ecx
+	shlq	$VEC_SIZE, %rcx
+L(ret_vec_x1_len):
+	tzcntq	%rcx, %rcx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len_no_zfill)
+	/* The expected fall-through case is copy len < buffer len.  */
+	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x1_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+	VMOVU	((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	.p2align 4,, 6
+L(ret_vec_x1):
+	bsfl	%ecx, %ecx
+	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+	subl	%ecx, %edx
+	/* Check if we need to reload/store.  */
+	cmpl	$VEC_SIZE, %edx
+	jb	L(ret_vec_x1_len_no_zfill_mov)
+	/* Otherwise safe to just store directly.  */
+	VMOVU	%VMM(1), (%rdi)
+	VMOVU	%VZERO, (%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 12
+L(more_2x_vec):
+	VMOVU	%VMM(1), (%rdi)
+	testl	%ecx, %ecx
+	/* Must fill at least 2x VEC.  */
+	jnz	L(zfill_vec1)
+
+	VMOVA	VEC_SIZE(%rsi), %VMM(2)
+	VMOVU	%VMM(2), VEC_SIZE(%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	/* Must fill at least 1x VEC.  */
+	jnz	L(zfill_vec2)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* Check if len is more than 4x VEC. -CHAR_SIZE because rdx is
+	   len - CHAR_SIZE.  */
+	cmpq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+	ja	L(more_4x_vec)
+
+	subl	$(VEC_SIZE * 3), %edx
+	jb	L(ret_vec_x3_len)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	vpmovmskb %VMM(6), %ecx
+	tzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x4_len_no_zfill)
+	/* The expected fall-through case is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+	movl	%ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 3 + 0)(%edx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+
+L(ret_vec_x3_len):
+	addl	$(VEC_SIZE * 1), %edx
+	tzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len_no_zfill)
+	/* The expected fall-through case is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x3_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+	.p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 2 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsfl	%ecx, %ecx
+	VMOVU	%VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
+	subl	%ecx, %edx
+	jl	L(ret_vec_x3_len_no_zfill_mov)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(more_4x_vec):
+
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec3)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec4)
+
+	movq	%rdx, %rcx
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	/* Recheck length before aligning.  */
+	cmpq	$(VEC_SIZE * 8 - CHAR_SIZE), %rcx
+	jbe	L(last_4x_vec)
+
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %r8d
+	addq	%rsi, %rdi
+	testl	%r8d, %r8d
+	jnz	L(loop_4x_done)
+
+	/* Use r9 as end register.  */
+	leaq	-(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
 
-#define USE_AS_STRNCPY
-#define STRCPY	STRNCPY
-#include "strcpy-avx2.S"
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %r8d
+
+	testl	%r8d, %r8d
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	subq	%rsi, %rdx
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec1)
+
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec2)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec3)
+
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+	movl	%r8d, %ecx
+
+	/* Fall through into the zero-fill paths below.  */
+
+	.p2align 4,, 4
+L(zfill_vec4):
+	addq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2), %rdx
+L(zfill_vec2):
+	shlq	$VEC_SIZE, %rcx
+L(zfill):
+	bsfq	%rcx, %rcx
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_from_page_cross):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(zfill_less_vec_vzeroupper)
+
+L(zfill_more_1x_vec):
+	VMOVU	%VZERO, CHAR_SIZE(%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jae	L(zfill_more_2x_vec)
+L(zfill_done0):
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(zfill_vec3):
+	addq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2), %rdx
+	.p2align 4,, 2
+L(zfill_vec1):
+	bsfl	%ecx, %ecx
+	addq	%rcx, %rdi
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+	/* zfill from vec1/vec3 must zero-fill at least 2x VEC.  */
+
+	VMOVU	%VZERO, CHAR_SIZE(%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jb	L(zfill_done0)
+L(zfill_more_2x_vec):
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
+	subq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+	jbe	L(zfill_done)
+
+	addq	%rdi, %rdx
+	VMOVU	%VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
+
+
+	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+	subq	$-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
+	cmpq	%rdi, %rdx
+	jbe	L(zfill_done)
+
+	andq	$-(VEC_SIZE), %rdi
+	.p2align 4,, 12
+L(zfill_loop_4x_vec):
+	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	ja	L(zfill_loop_4x_vec)
+L(zfill_done):
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(copy_1x):
+	VMOVU	%VMM(0), (%rdi)
+	testl	%ecx, %ecx
+	jz	L(ret_32_32)
+L(zfill_less_vec):
+	bsfl	%ecx, %ecx
+L(zfill_less_vec_no_bsf):
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_less_vec_vzeroupper):
+	COND_VZEROUPPER
+	/* We are taking advantage of the fact that, to get here, we must
+	   be writing the null terminator at (%rdi, %rcx), so we have a
+	   byte of leeway for overwriting.  */
+	cmpl	$16, %edx
+	jb	L(zfill_less_16)
+	VMOVU	%VZERO_128, (%rdi)
+	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
+	ret
+# ifdef USE_AS_STPCPY
+L(ret_32_32):
+	leaq	CHAR_SIZE(%rdi, %rdx), %rax
+	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 4
+L(copy_16_31):
+	/* Overfill to avoid branches.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	ja	L(zfill_less_vec_no_bsf)
+# ifndef USE_AS_STPCPY
+L(ret_32_32):
+# else
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 4
+L(copy_8_15):
+	/* Overfill to avoid branches.  */
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
+	vmovq	%xmm0, (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_8_15)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+	.p2align 4,, 8
+L(zfill_less_16):
+	xorl	%ecx, %ecx
+	cmpl	$8, %edx
+	jb	L(zfill_less_8)
+	movq	%rcx, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
+# ifndef USE_AS_STPCPY
+L(ret_8_15):
+# endif
+	ret
+
+
+	.p2align 4,, 8
+L(less_1x_vec):
+	/* Reuse the flags from `cmp $VEC_SIZE, %rdx`. The idea is that
+	   many buffer sizes are conventionally aligned.  */
+	je	L(copy_1x)
+
+	tzcntl	%ecx, %ecx
+	cmpl	$16, %edx
+	jae	L(copy_16_31)
+
+	COND_VZEROUPPER
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+	testl	%ecx, %ecx
+	jz	L(zfill_less_8_set_ret)
+
+	movl	(%rsi, %rdx), %esi
+	vmovd	%xmm0, (%rdi)
+	movl	%esi, (%rdi, %rdx)
+
+#  ifdef USE_AS_STPCPY
+	cmpl	%ecx, %edx
+L(ret_8_15):
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  endif
+	ret
+L(zfill_less_8_set_ret):
+	xorl	%ecx, %ecx
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+L(zfill_less_8):
+	movl	%ecx, (%rdi)
+	movl	%ecx, (%rdi, %rdx)
+	ret
+
+# else
+	cmpl	$3, %edx
+	jb	L(copy_0_3)
+	/* Overfill to avoid branches.  */
+	movl	-3(%rsi, %rdx), %esi
+	vmovd	%xmm0, (%rdi)
+	movl	%esi, -3(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_4_7)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+	xorl	%ecx, %ecx
+	.p2align 4,, 8
+L(zfill_less_8):
+	cmpl	$3, %edx
+	jb	L(zfill_less_3)
+	movl	%ecx, (%rdi)
+	movl	%ecx, -3(%rdi, %rdx)
+#  ifdef USE_AS_STPCPY
+	ret
+#  endif
+
+L(ret_4_7):
+#  ifdef USE_AS_STPCPY
+L(ret_8_15):
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+	ret
+
+	.p2align 4,, 4
+L(zfill_less_3):
+	testl	%edx, %edx
+	jz	L(zfill_1)
+	movw	%cx, (%rdi)
+L(zfill_1):
+	movb	%cl, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_0_3):
+	vmovd	%xmm0, %r8d
+	testl	%edx, %edx
+	jz	L(copy_1)
+	movw	%r8w, (%rdi)
+	cmpl	%ecx, %edx
+	ja	L(zfill_from_1)
+	movzbl	(%rsi, %rdx), %r8d
+#  ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+	movb	%r8b, (%rdi, %rdx)
+	ret
+#  endif
+
+L(copy_1):
+#  ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	cmpl	%ecx, %edx
+	adcq	%rdi, %rax
+#  endif
+#  ifdef USE_AS_WCSCPY
+	vmovd	%xmm0, (%rdi)
+#  else
+	movb	%r8b, (%rdi, %rdx)
+#  endif
+	ret
+# endif
+
+	.p2align 4,, 2
+L(zero_len):
+	movq	%rdi, %rax
+	ret
+# ifndef USE_AS_WCSCPY
+	.p2align 4,, 8
+L(zfill_from_1):
+#  ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+#  endif
+	movw	$0, -1(%rdi, %rdx)
+	ret
+# endif
+
+	.p2align 4,, 4
+	.p2align 6,, 8
+L(page_cross):
+	movq	%rsi, %rax
+	andq	$(VEC_SIZE * -1), %rax
+
+	VPCMPEQ	(%rax), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+
+	subl	%esi, %eax
+	andl	$(VEC_SIZE - 1), %eax
+	cmpq	%rax, %rdx
+	jb	L(page_cross_small)
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* Continue in the main path if no null was found (rcx is zero
+	   after the shift).  */
+	shl	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsf	%ecx, %ecx
+
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	leaq	-CHAR_SIZE(%rdi, %rcx), %rax
+# else
+	movq	%rdi, %rax
+# endif
+
+	rep	movsb
+# ifdef USE_AS_WCSCPY
+	movl	$0, (%rdi)
+# else
+	movb	$0, (%rdi)
+# endif
+	jmp	L(zfill_from_page_cross)
+
+L(page_cross_small):
+	tzcntl	%ecx, %ecx
+	xorl	%eax, %eax
+	cmpl	%ecx, %edx
+	jbe	L(page_cross_copy_only)
+
+	/* Do a zfill of the tail before copying.  */
+	movq	%rdi, %r9
+	movl	%ecx, %r8d
+
+	subl	%ecx, %edx
+	leaq	CHAR_SIZE(%rdi, %rcx), %rdi
+	movl	%edx, %ecx
+	rep	stosb
+	movq	%r9, %rdi
+	movl	%r8d, %edx
+L(page_cross_copy_only):
+	leal	CHAR_SIZE(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdi, %rdx
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# else
+	movq	%rdi, %rax
+# endif
+	rep	movsb
+	ret
+
+
+L(best_effort_strncpy):
+	movq	%rdx, %rcx
+	xorl	%eax, %eax
+	movq	%rdi, %r8
+	/* The length is >= 2^63. We fully expect rep stos to segfault.
+	   If it somehow doesn't, just strcpy to finish.  */
+# ifdef USE_AS_WCSCPY
+	rep	stosl
+# else
+	rep	stosb
+# endif
+	movq	%r8, %rdi
+	jmp	OVERFLOW_STRCPY
+END(STRNCPY)
+#endif
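
All of the L(zfill*) labels above implement the tail of the standard strncpy
contract: copy at most n characters up to and including the null terminator,
then pad the remainder of the n-character destination with zeros.  As a
semantic reference only (a plain-C restatement, not an excerpt from the
patch):

    #include <stddef.h>

    /* Plain C reference for the copy-then-zero-fill behaviour that the
       vectorized zfill paths implement with vector stores.  */
    static char *
    strncpy_ref (char *dst, const char *src, size_t n)
    {
      size_t i = 0;
      for (; i < n && src[i] != '\0'; i++)
        dst[i] = src[i];
      for (; i < n; i++)   /* the "zfill" part */
        dst[i] = '\0';
      return dst;
    }
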
diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
index dca1089060..01bead1435 100644
--- a/sysdeps/x86_64/multiarch/x86-avx-vecs.h
+++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
@@ -27,7 +27,10 @@
 #define VEC_SIZE			32
 #include "x86-vec-macros.h"
 
-#define USE_WITH_AVX		1
+#ifndef USE_WITH_AVX2
+# define USE_WITH_AVX		1
+#endif
+
 #define SECTION(p)			p##.avx
 
 /* 4-byte mov instructions with AVX2.  */
diff --git a/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
new file mode 100644
index 0000000000..a5966701ec
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
@@ -0,0 +1,26 @@
+/* Common config for AVX2-RTM VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX2_RTM_VECS_H
+#define _X86_AVX2_RTM_VECS_H			1
+
+#define USE_WITH_AVX2		1
+#include "x86-avx-rtm-vecs.h"
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-avx2-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
new file mode 100644
index 0000000000..16d7ae5147
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
@@ -0,0 +1,27 @@
+/* Common config for AVX2 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX2_VECS_H
+#define _X86_AVX2_VECS_H			1
+
+#define USE_WITH_AVX2		1
+
+#include "x86-avx-vecs.h"
+
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v4 3/4] x86: Add evex optimized functions for the wchar_t strcpy family
  2022-11-04 23:04   ` [PATCH v4 1/4] " Noah Goldstein
  2022-11-04 23:04     ` [PATCH v4 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
@ 2022-11-04 23:04     ` Noah Goldstein
  2022-11-04 23:04     ` [PATCH v4 4/4] x86: Add avx2 " Noah Goldstein
  2022-11-04 23:34     ` [PATCH v4 1/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions H.J. Lu
  3 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 23:04 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Implemented:
    wcscat-evex  (+ 905 bytes)
    wcscpy-evex  (+ 674 bytes)
    wcpcpy-evex  (+ 709 bytes)
    wcsncpy-evex (+1358 bytes)
    wcpncpy-evex (+1467 bytes)
    wcsncat-evex (+1213 bytes)

Performance Changes:
    Times are from N = 10 runs of the benchmark suite and are reported
    as the geometric mean of all ratios of New Implementation / Best
    Old Implementation.  Best Old Implementation is the previously
    existing implementation for the highest usable ISA level.

    wcscat-evex     -> 0.991
    wcscpy-evex     -> 0.587
    wcpcpy-evex     -> 0.695
    wcsncpy-evex    -> 0.719
    wcpncpy-evex    -> 0.694
    wcsncat-evex    -> 0.979
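
    As an illustration only (not part of this patch), a minimal C
    sketch of how such a summary number can be computed from
    per-benchmark timings; geomean_ratio and its arrays are
    hypothetical, not part of the glibc benchmark suite:

        #include <math.h>
        #include <stddef.h>

        /* Geometric mean of new/old timing ratios.  Hypothetical
           helper used only for this illustration.  */
        static double
        geomean_ratio (const double *new_times, const double *old_times,
                       size_t n)
        {
          double log_sum = 0.0;
          for (size_t i = 0; i < n; i++)
            log_sum += log (new_times[i] / old_times[i]);
          return exp (log_sum / (double) n);
        }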

Code Size Changes:
    This change increases the size of libc.so by ~6.3kb. For
    reference, the patch optimizing the normal strcpy family functions
    decreases libc.so by ~5.7kb.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
 sysdeps/x86_64/Makefile                    |  5 ++
 sysdeps/x86_64/multiarch/Makefile          | 14 ++++-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 69 +++++++++++++++++++++-
 sysdeps/x86_64/multiarch/ifunc-wcs.h       | 49 +++++++++++++++
 sysdeps/x86_64/multiarch/wcpcpy-evex.S     |  8 +++
 sysdeps/x86_64/multiarch/wcpcpy-generic.c  | 27 +++++++++
 sysdeps/x86_64/multiarch/wcpcpy.c          | 37 ++++++++++++
 sysdeps/x86_64/multiarch/wcpncpy-evex.S    |  8 +++
 sysdeps/x86_64/multiarch/wcpncpy-generic.c | 27 +++++++++
 sysdeps/x86_64/multiarch/wcpncpy.c         | 37 ++++++++++++
 sysdeps/x86_64/multiarch/wcscat-evex.S     |  9 +++
 sysdeps/x86_64/multiarch/wcscat-generic.c  | 27 +++++++++
 sysdeps/x86_64/multiarch/wcscat.c          | 37 ++++++++++++
 sysdeps/x86_64/multiarch/wcscpy-evex.S     |  7 +++
 sysdeps/x86_64/multiarch/wcscpy-generic.c  |  3 +-
 sysdeps/x86_64/multiarch/wcscpy.c          | 12 ++++
 sysdeps/x86_64/multiarch/wcsncat-evex.S    |  9 +++
 sysdeps/x86_64/multiarch/wcsncat-generic.c | 27 +++++++++
 sysdeps/x86_64/multiarch/wcsncat.c         | 34 +++++++++++
 sysdeps/x86_64/multiarch/wcsncpy-evex.S    |  7 +++
 sysdeps/x86_64/multiarch/wcsncpy-generic.c | 27 +++++++++
 sysdeps/x86_64/multiarch/wcsncpy.c         | 37 ++++++++++++
 sysdeps/x86_64/wcpcpy-generic.c            | 31 ++++++++++
 sysdeps/x86_64/wcpcpy.S                    | 40 +++++++++++++
 sysdeps/x86_64/wcpncpy-generic.c           | 31 ++++++++++
 sysdeps/x86_64/wcpncpy.S                   | 40 +++++++++++++
 sysdeps/x86_64/wcscat-generic.c            | 31 ++++++++++
 sysdeps/x86_64/wcscat.S                    | 40 +++++++++++++
 sysdeps/x86_64/wcscpy.S                    |  1 +
 sysdeps/x86_64/wcsncat-generic.c           | 31 ++++++++++
 sysdeps/x86_64/wcsncat.S                   | 38 ++++++++++++
 sysdeps/x86_64/wcsncpy-generic.c           | 31 ++++++++++
 sysdeps/x86_64/wcsncpy.S                   | 40 +++++++++++++
 33 files changed, 865 insertions(+), 6 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcs.h
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscat.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy.c
 create mode 100644 sysdeps/x86_64/wcpcpy-generic.c
 create mode 100644 sysdeps/x86_64/wcpcpy.S
 create mode 100644 sysdeps/x86_64/wcpncpy-generic.c
 create mode 100644 sysdeps/x86_64/wcpncpy.S
 create mode 100644 sysdeps/x86_64/wcscat-generic.c
 create mode 100644 sysdeps/x86_64/wcscat.S
 create mode 100644 sysdeps/x86_64/wcsncat-generic.c
 create mode 100644 sysdeps/x86_64/wcsncat.S
 create mode 100644 sysdeps/x86_64/wcsncpy-generic.c
 create mode 100644 sysdeps/x86_64/wcsncpy.S

diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 3627c5659f..688eb2d7c4 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -188,8 +188,13 @@ endif
 ifeq ($(subdir),wcsmbs)
 
 sysdep_routines += \
+  wcpcpy-generic \
+  wcpncpy-generic \
+  wcscat-generic \
   wcscpy-generic \
+  wcsncat-generic \
   wcsncmp-generic \
+  wcsncpy-generic \
   wcsnlen-generic \
 # sysdep_routines
 
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 066bfa48d9..d6e01940c3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -131,6 +131,12 @@ endif
 
 ifeq ($(subdir),wcsmbs)
 sysdep_routines += \
+  wcpcpy-evex \
+  wcpcpy-generic \
+  wcpncpy-evex \
+  wcpncpy-generic \
+  wcscat-evex \
+  wcscat-generic \
   wcschr-avx2 \
   wcschr-avx2-rtm \
   wcschr-evex \
@@ -140,6 +146,8 @@ sysdep_routines += \
   wcscmp-avx2-rtm \
   wcscmp-evex \
   wcscmp-sse2 \
+  wcscpy-evex \
+  wcscpy-generic \
   wcscpy-ssse3 \
   wcslen-avx2 \
   wcslen-avx2-rtm \
@@ -147,9 +155,13 @@ sysdep_routines += \
   wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
+  wcsncat-evex \
+  wcsncat-generic \
   wcsncmp-avx2 \
   wcsncmp-avx2-rtm \
   wcsncmp-evex \
+  wcsncpy-evex \
+  wcsncpy-generic \
   wcsnlen-avx2 \
   wcsnlen-avx2-rtm \
   wcsnlen-evex \
@@ -163,8 +175,8 @@ sysdep_routines += \
   wmemchr-avx2 \
   wmemchr-avx2-rtm \
   wmemchr-evex \
-  wmemchr-evex512 \
   wmemchr-evex-rtm \
+  wmemchr-evex512 \
   wmemchr-sse2 \
   wmemcmp-avx2-movbe \
   wmemcmp-avx2-movbe-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7cebee7ec7..959cb0b420 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -901,16 +901,79 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/wcscpy.c.  */
   IFUNC_IMPL (i, name, wcscpy,
-	      /* ISA V4 wrapper for SSSE3 implementation because
-	         the SSSE3 implementation is also used at ISA
-	         level 3/4.  */
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __wcscpy_ssse3)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
 				     1,
 				     __wcscpy_generic))
 
+  /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
+  IFUNC_IMPL (i, name, wcsncpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcpncpy,
+				     1,
+				     __wcsncpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcpcpy.c.  */
+  IFUNC_IMPL (i, name, wcpcpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpcpy_evex)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcpcpy,
+				     1,
+				     __wcpcpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcpncpy.c.  */
+  IFUNC_IMPL (i, name, wcpncpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncpy,
+				     1,
+				     __wcpncpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcscat.c.  */
+  IFUNC_IMPL (i, name, wcscat,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscat_evex)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscat,
+				     1,
+				     __wcscat_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcsncat.c.  */
+  IFUNC_IMPL (i, name, wcsncat,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncat_evex)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncat,
+				     1,
+				     __wcsncat_generic))
+
   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
   IFUNC_IMPL (i, name, wcslen,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcslen,
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
new file mode 100644
index 0000000000..da6e1b03d0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
@@ -0,0 +1,49 @@
+/* Common definition for ifunc selections optimized wide-character
+   string copy functions.
+
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+#ifndef GENERIC
+# define GENERIC generic
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				      AVX_Fast_Unaligned_Load, ))
+    {
+      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+    }
+
+  return OPTIMIZE (GENERIC);
+}
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-evex.S b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
new file mode 100644
index 0000000000..ac6429cc07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
@@ -0,0 +1,8 @@
+#ifndef WCPCPY
+# define WCPCPY	__wcpcpy_evex
+#endif
+
+#define USE_AS_STPCPY
+#define USE_AS_WCSCPY
+#define STRCPY	WCPCPY
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
new file mode 100644
index 0000000000..6039196a3e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
@@ -0,0 +1,27 @@
+/* wcpcpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as the ifunc may need
+   to fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCPCPY __wcpcpy_generic
+# include <wcsmbs/wcpcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpcpy.c b/sysdeps/x86_64/multiarch/wcpcpy.c
new file mode 100644
index 0000000000..8f96ddbc99
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcpcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcpcpy __redirect_wcpcpy
+# include <wchar.h>
+# undef __wcpcpy
+
+# define SYMBOL_NAME wcpcpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcpcpy, __wcpcpy, IFUNC_SELECTOR ());
+weak_alias (__wcpcpy, wcpcpy)
+# ifdef SHARED
+__hidden_ver1 (__wcpcpy, __GI___wcpcpy, __redirect_wcpcpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpcpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-evex.S b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
new file mode 100644
index 0000000000..62ddb694fe
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
@@ -0,0 +1,8 @@
+#ifndef WCPNCPY
+# define WCPNCPY	__wcpncpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STPCPY
+#define STRNCPY	WCPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
new file mode 100644
index 0000000000..de8d34320e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
@@ -0,0 +1,27 @@
+/* wcpncpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as the ifunc may need
+   to fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCPNCPY __wcpncpy_generic
+# include <wcsmbs/wcpncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpncpy.c b/sysdeps/x86_64/multiarch/wcpncpy.c
new file mode 100644
index 0000000000..ed8f307e07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcpncpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcpncpy __redirect_wcpncpy
+# include <wchar.h>
+# undef __wcpncpy
+
+# define SYMBOL_NAME wcpncpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcpncpy, __wcpncpy, IFUNC_SELECTOR ());
+weak_alias (__wcpncpy, wcpncpy)
+# ifdef SHARED
+__hidden_ver1 (__wcpncpy, __GI___wcpncpy, __redirect_wcpncpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpncpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscat-evex.S b/sysdeps/x86_64/multiarch/wcscat-evex.S
new file mode 100644
index 0000000000..1d017e4899
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-evex.S
@@ -0,0 +1,9 @@
+#ifndef WCSCAT
+# define WCSCAT	__wcscat_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRCPY	WCSCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
new file mode 100644
index 0000000000..d86b4d5c00
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
@@ -0,0 +1,27 @@
+/* wcscat.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as the ifunc may need
+   to fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCSCAT __wcscat_generic
+# include <wcsmbs/wcscat.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscat.c b/sysdeps/x86_64/multiarch/wcscat.c
new file mode 100644
index 0000000000..3277c44561
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcscat.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcscat __redirect_wcscat
+# include <wchar.h>
+# undef __wcscat
+
+# define SYMBOL_NAME wcscat
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcscat, __wcscat, IFUNC_SELECTOR ());
+weak_alias (__wcscat, wcscat)
+# ifdef SHARED
+__hidden_ver1 (__wcscat, __GI___wcscat, __redirect_wcscat)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcscat);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscpy-evex.S b/sysdeps/x86_64/multiarch/wcscpy-evex.S
new file mode 100644
index 0000000000..1069a8e224
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-evex.S
@@ -0,0 +1,7 @@
+#ifndef WCSCPY
+# define WCSCPY	__wcscpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define STRCPY	WCSCPY
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
index 93d314aaad..4a1fffae4b 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
@@ -18,8 +18,7 @@
 
 
 #include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (1)
+#if ISA_SHOULD_BUILD (3)
 
 # define WCSCPY  __wcscpy_generic
 # include <wcsmbs/wcscpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
index 92c917b6b4..efe32e505f 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.c
+++ b/sysdeps/x86_64/multiarch/wcscpy.c
@@ -26,6 +26,8 @@
 # define SYMBOL_NAME wcscpy
 # include <init-arch.h>
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
@@ -35,6 +37,16 @@ IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load, ))
+    {
+      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+    }
+
   if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
     return OPTIMIZE (ssse3);
 
diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S
new file mode 100644
index 0000000000..392215950a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S
@@ -0,0 +1,9 @@
+#ifndef WCSCAT
+# define WCSCAT	__wcsncat_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRNCAT	WCSCAT
+#include "strncat-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
new file mode 100644
index 0000000000..4b55cb40bc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
@@ -0,0 +1,27 @@
+/* wcsncat.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as the ifunc may need
+   to fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCSNCAT __wcsncat_generic
+# include <wcsmbs/wcsncat.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncat.c b/sysdeps/x86_64/multiarch/wcsncat.c
new file mode 100644
index 0000000000..49c46aef08
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat.c
@@ -0,0 +1,34 @@
+/* Multiple versions of wcsncat.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define wcsncat __redirect_wcsncat
+# include <wchar.h>
+# undef wcsncat
+
+# define SYMBOL_NAME wcsncat
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcsncat, wcsncat, IFUNC_SELECTOR ());
+# ifdef SHARED
+__hidden_ver1 (wcsncat, __GI_wcsncat, __redirect_wcsncat)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncat);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-evex.S b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
new file mode 100644
index 0000000000..2debb8fd6b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
@@ -0,0 +1,7 @@
+#ifndef WCSNCPY
+# define WCSNCPY	__wcsncpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define STRNCPY	WCSNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
new file mode 100644
index 0000000000..d0e8a86605
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
@@ -0,0 +1,27 @@
+/* wcsncpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as the ifunc may need
+   to fall back to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCSNCPY __wcsncpy_generic
+# include <wcsmbs/wcsncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncpy.c b/sysdeps/x86_64/multiarch/wcsncpy.c
new file mode 100644
index 0000000000..5b89dd4d27
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcsncpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcsncpy __redirect_wcsncpy
+# include <wchar.h>
+# undef __wcsncpy
+
+# define SYMBOL_NAME wcsncpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcsncpy, __wcsncpy, IFUNC_SELECTOR ());
+weak_alias (__wcsncpy, wcsncpy)
+# ifdef SHARED
+__hidden_ver1 (__wcsncpy, __GI___wcsncpy, __redirect_wcsncpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
new file mode 100644
index 0000000000..d52525f288
--- /dev/null
+++ b/sysdeps/x86_64/wcpcpy-generic.c
@@ -0,0 +1,31 @@
+/* wcpcpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpcpy non-multiarch build is split into two files,
+   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcpcpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcpcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
new file mode 100644
index 0000000000..97e9207c16
--- /dev/null
+++ b/sysdeps/x86_64/wcpcpy.S
@@ -0,0 +1,40 @@
+/* wcpcpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpcpy non-multiarch build is split into two files,
+   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcpcpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCPCPY	__wcpcpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcpcpy-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcpcpy, wcpcpy)
+libc_hidden_def (__wcpcpy)
+#endif
diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
new file mode 100644
index 0000000000..871219a445
--- /dev/null
+++ b/sysdeps/x86_64/wcpncpy-generic.c
@@ -0,0 +1,31 @@
+/* wcpncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpncpy non-multiarch build is split into two files,
+   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcpncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcpncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
new file mode 100644
index 0000000000..2169ed5545
--- /dev/null
+++ b/sysdeps/x86_64/wcpncpy.S
@@ -0,0 +1,40 @@
+/* wcpncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpncpy non-multiarch build is split into two files,
+   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcpncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCPNCPY	__wcpncpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcpncpy-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcpncpy, wcpncpy)
+libc_hidden_def (__wcpncpy)
+#endif
diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
new file mode 100644
index 0000000000..85f981a81f
--- /dev/null
+++ b/sysdeps/x86_64/wcscat-generic.c
@@ -0,0 +1,31 @@
+/* wcscat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcscat non-multiarch build is split into two files,
+   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcscat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcscat.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
new file mode 100644
index 0000000000..8432087c7c
--- /dev/null
+++ b/sysdeps/x86_64/wcscat.S
@@ -0,0 +1,40 @@
+/* wcscat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcscat non-multiarch build is split into two files,
+   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcscat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSCAT	__wcscat
+
+# define DEFAULT_IMPL_V4	"multiarch/wcscat-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcscat, wcscat)
+libc_hidden_def (__wcscat)
+#endif
diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
index 11d0bb4bab..ff8bdd3aea 100644
--- a/sysdeps/x86_64/wcscpy.S
+++ b/sysdeps/x86_64/wcscpy.S
@@ -28,6 +28,7 @@
 
 # define WCSCPY	__wcscpy
 
+# define DEFAULT_IMPL_V4	"multiarch/wcscpy-evex.S"
 # define DEFAULT_IMPL_V2	"multiarch/wcscpy-ssse3.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
new file mode 100644
index 0000000000..2cc0f7b11a
--- /dev/null
+++ b/sysdeps/x86_64/wcsncat-generic.c
@@ -0,0 +1,31 @@
+/* wcsncat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncat non-multiarch build is split into two files,
+   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcsncat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcsncat.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
new file mode 100644
index 0000000000..64e144a9c7
--- /dev/null
+++ b/sysdeps/x86_64/wcsncat.S
@@ -0,0 +1,38 @@
+/* wcsncat dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncat non-multiarch build is split into two files,
+   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcsncat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSNCAT	wcsncat
+
+# define DEFAULT_IMPL_V4	"multiarch/wcsncat-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+#endif
diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
new file mode 100644
index 0000000000..49d06b8ae8
--- /dev/null
+++ b/sysdeps/x86_64/wcsncpy-generic.c
@@ -0,0 +1,31 @@
+/* wcsncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncpy non-multiarch build is split into two files,
+   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcsncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcsncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
new file mode 100644
index 0000000000..1450c1aa28
--- /dev/null
+++ b/sysdeps/x86_64/wcsncpy.S
@@ -0,0 +1,40 @@
+/* wcsncpy dispatch for RTLD and non-multiarch .c ISA level 1 build.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncpy non-multiarch build is split into two files,
+   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcsncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSNCPY	__wcsncpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcsncpy-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcsncpy, wcsncpy)
+libc_hidden_def (__wcsncpy)
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v4 4/4] x86: Add avx2 optimized functions for the wchar_t strcpy family
  2022-11-04 23:04   ` [PATCH v4 1/4] " Noah Goldstein
  2022-11-04 23:04     ` [PATCH v4 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
  2022-11-04 23:04     ` [PATCH v4 3/4] x86: Add evex optimized functions for the wchar_t strcpy family Noah Goldstein
@ 2022-11-04 23:04     ` Noah Goldstein
  2022-11-04 23:34     ` [PATCH v4 1/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions H.J. Lu
  3 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 23:04 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Implemented:
    wcscat-avx2{+rtm}  (+ 744 * 2 bytes)
    wcscpy-avx2{+rtm}  (+ 539 * 2 bytes)
    wcpcpy-avx2{+rtm}  (+ 577 * 2 bytes)
    wcsncpy-avx2{+rtm} (+1108 * 2 bytes)
    wcpncpy-avx2{+rtm} (+1214 * 2 bytes)
    wcsncat-avx2{+rtm} (+1085 * 2 bytes)

Performance Changes:
    Times are from N = 10 runs of the benchmark suite and are reported
    as the geometric mean of all ratios of New Implementation / Best
    Old Implementation.  Best Old Implementation is the previously
    existing implementation for the highest usable ISA level.

    wcscat-avx2     -> 0.975
    wcscpy-avx2     -> 0.591
    wcpcpy-avx2     -> 0.698
    wcsncpy-avx2    -> 0.730
    wcpncpy-avx2    -> 0.711
    wcsncat-avx2    -> 0.954

Code Size Changes:
    This change increases the size of libc.so by ~11kb. For
    reference, the patch optimizing the normal strcpy family functions
    decreases libc.so by ~5.2kb.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
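
For illustration only (not part of this patch), a simplified C sketch
of the selection order that ifunc-wcs.h implements after this series;
select_wcs_impl and the cpu_* parameters are hypothetical stand-ins
for glibc's CPU_FEATURE_USABLE / X86_ISA_CPU_FEATURES_ARCH_P checks:

    enum wcs_impl { WCS_GENERIC, WCS_AVX2, WCS_AVX2_RTM, WCS_EVEX };

    /* Mirrors the dispatch order only: EVEX first, then AVX2+RTM,
       then plain AVX2 unless Prefer_No_VZEROUPPER, else generic.  */
    static enum wcs_impl
    select_wcs_impl (int cpu_avx2, int cpu_bmi1, int cpu_bmi2,
                     int cpu_avx_fast_unaligned, int cpu_avx512vl,
                     int cpu_avx512bw, int cpu_rtm,
                     int cpu_prefer_no_vzeroupper)
    {
      if (cpu_avx2 && cpu_bmi1 && cpu_bmi2 && cpu_avx_fast_unaligned)
        {
          if (cpu_avx512vl && cpu_avx512bw)
            return WCS_EVEX;
          if (cpu_rtm)
            return WCS_AVX2_RTM;
          if (!cpu_prefer_no_vzeroupper)
            return WCS_AVX2;
        }
      return WCS_GENERIC;
    }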
---
 sysdeps/x86_64/multiarch/Makefile           | 12 ++++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c  | 66 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/ifunc-wcs.h        | 11 ++++
 sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S  |  3 +
 sysdeps/x86_64/multiarch/wcpcpy-avx2.S      |  8 +++
 sysdeps/x86_64/multiarch/wcpcpy-generic.c   |  2 +-
 sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S |  3 +
 sysdeps/x86_64/multiarch/wcpncpy-avx2.S     |  8 +++
 sysdeps/x86_64/multiarch/wcpncpy-generic.c  |  2 +-
 sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S  |  3 +
 sysdeps/x86_64/multiarch/wcscat-avx2.S      | 10 ++++
 sysdeps/x86_64/multiarch/wcscat-generic.c   |  2 +-
 sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S  |  3 +
 sysdeps/x86_64/multiarch/wcscpy-avx2.S      |  7 +++
 sysdeps/x86_64/multiarch/wcscpy-generic.c   |  2 +-
 sysdeps/x86_64/multiarch/wcscpy.c           |  9 +++
 sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S |  3 +
 sysdeps/x86_64/multiarch/wcsncat-avx2.S     |  9 +++
 sysdeps/x86_64/multiarch/wcsncat-generic.c  |  2 +-
 sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S |  3 +
 sysdeps/x86_64/multiarch/wcsncpy-avx2.S     |  7 +++
 sysdeps/x86_64/multiarch/wcsncpy-generic.c  |  2 +-
 sysdeps/x86_64/wcpcpy-generic.c             |  2 +-
 sysdeps/x86_64/wcpcpy.S                     |  3 +-
 sysdeps/x86_64/wcpncpy-generic.c            |  2 +-
 sysdeps/x86_64/wcpncpy.S                    |  3 +-
 sysdeps/x86_64/wcscat-generic.c             |  2 +-
 sysdeps/x86_64/wcscat.S                     |  3 +-
 sysdeps/x86_64/wcscpy.S                     |  1 +
 sysdeps/x86_64/wcsncat-generic.c            |  2 +-
 sysdeps/x86_64/wcsncat.S                    |  3 +-
 sysdeps/x86_64/wcsncpy-generic.c            |  2 +-
 sysdeps/x86_64/wcsncpy.S                    |  3 +-
 33 files changed, 187 insertions(+), 16 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d6e01940c3..f848fc0e28 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -131,10 +131,16 @@ endif
 
 ifeq ($(subdir),wcsmbs)
 sysdep_routines += \
+  wcpcpy-avx2 \
+  wcpcpy-avx2-rtm \
   wcpcpy-evex \
   wcpcpy-generic \
+  wcpncpy-avx2 \
+  wcpncpy-avx2-rtm \
   wcpncpy-evex \
   wcpncpy-generic \
+  wcscat-avx2 \
+  wcscat-avx2-rtm \
   wcscat-evex \
   wcscat-generic \
   wcschr-avx2 \
@@ -146,6 +152,8 @@ sysdep_routines += \
   wcscmp-avx2-rtm \
   wcscmp-evex \
   wcscmp-sse2 \
+  wcscpy-avx2 \
+  wcscpy-avx2-rtm \
   wcscpy-evex \
   wcscpy-generic \
   wcscpy-ssse3 \
@@ -155,11 +163,15 @@ sysdep_routines += \
   wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
+  wcsncat-avx2 \
+  wcsncat-avx2-rtm \
   wcsncat-evex \
   wcsncat-generic \
   wcsncmp-avx2 \
   wcsncmp-avx2-rtm \
   wcsncmp-evex \
+  wcsncpy-avx2 \
+  wcsncpy-avx2-rtm \
   wcsncpy-evex \
   wcsncpy-generic \
   wcsnlen-avx2 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 959cb0b420..71e8953e91 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -908,6 +908,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcscpy_evex)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcscpy_avx2_rtm)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcscpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __wcscpy_ssse3)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
@@ -922,6 +933,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI1)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcsncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcsncpy_avx2_rtm)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcpncpy,
 				     1,
 				     __wcsncpy_generic))
@@ -934,6 +956,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI1)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcpcpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpcpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcpcpy_avx2_rtm)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcpcpy,
 				     1,
 				     __wcpcpy_generic))
@@ -946,6 +979,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI1)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcpncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpncpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcpncpy_avx2_rtm)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncpy,
 				     1,
 				     __wcpncpy_generic))
@@ -958,6 +1002,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI1)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcscat_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscat_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcscat_avx2_rtm)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscat,
 				     1,
 				     __wcscat_generic))
@@ -970,6 +1025,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI1)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcsncat_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncat_avx2)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __wcsncat_avx2_rtm)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcsncat,
 				     1,
 				     __wcsncat_generic))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
index da6e1b03d0..cda633d8fb 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wcs.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
@@ -27,6 +27,9 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+
 extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
 
 static inline void *
@@ -43,6 +46,14 @@ IFUNC_SELECTOR (void)
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				       Prefer_No_VZEROUPPER, !))
+	return OPTIMIZE (avx2);
+
     }
 
   return OPTIMIZE (GENERIC);
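
The fallthrough order above (EVEX first, then the RTM-safe AVX2 variant,
then plain AVX2, then generic) can be read as the following C sketch.
This is an illustration only, not glibc code; the outer AVX2 guard is
assumed from the surrounding context of the hunk, and the names are
made up.

    /* Hypothetical sketch of the selection priority only.  */
    static const char *
    pick_wcs_variant (int avx2, int avx512vl, int avx512bw,
                      int rtm, int prefer_no_vzeroupper)
    {
      if (avx2)                  /* assumed outer guard (not shown above) */
        {
          if (avx512vl && avx512bw)
            return "evex";
          if (rtm)
            return "avx2_rtm";   /* variant safe to use in RTM transactions */
          if (!prefer_no_vzeroupper)
            return "avx2";
        }
      return "generic";
    }
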
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
new file mode 100644
index 0000000000..756280a3ab
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCPCPY	__wcpcpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcpcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
new file mode 100644
index 0000000000..0fffd912d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
@@ -0,0 +1,8 @@
+#ifndef WCPCPY
+# define WCPCPY	__wcpcpy_avx2
+#endif
+
+#define USE_AS_STPCPY
+#define USE_AS_WCSCPY
+#define STRCPY	WCPCPY
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
index 6039196a3e..0ba29b081f 100644
--- a/sysdeps/x86_64/multiarch/wcpcpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCPCPY __wcpcpy_generic
 # include <wcsmbs/wcpcpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
new file mode 100644
index 0000000000..80600d6b01
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCPNCPY	__wcpncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcpncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
new file mode 100644
index 0000000000..b7e594f7b7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
@@ -0,0 +1,8 @@
+#ifndef WCPNCPY
+# define WCPNCPY	__wcpncpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STPCPY
+#define STRNCPY	WCPNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
index de8d34320e..4aab4ecdd2 100644
--- a/sysdeps/x86_64/multiarch/wcpncpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCPNCPY __wcpncpy_generic
 # include <wcsmbs/wcpncpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
new file mode 100644
index 0000000000..e99449a2dc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSCAT	__wcscat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcscat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2.S b/sysdeps/x86_64/multiarch/wcscat-avx2.S
new file mode 100644
index 0000000000..a20f23c09d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-avx2.S
@@ -0,0 +1,10 @@
+#ifndef WCSCAT
+# define WCSCAT	__wcscat_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRCPY	WCSCAT
+
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
index d86b4d5c00..6476f85bbb 100644
--- a/sysdeps/x86_64/multiarch/wcscat-generic.c
+++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSCAT __wcscat_generic
 # include <wcsmbs/wcscat.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
new file mode 100644
index 0000000000..2f800c8d3e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSCPY	__wcscpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcscpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2.S b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
new file mode 100644
index 0000000000..6bc509da07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
@@ -0,0 +1,7 @@
+#ifndef WCSCPY
+# define WCSCPY	__wcscpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define STRCPY	WCSCPY
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
index 4a1fffae4b..600d606c45 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
@@ -18,7 +18,7 @@
 
 
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSCPY  __wcscpy_generic
 # include <wcsmbs/wcscpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
index efe32e505f..7f6387817b 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.c
+++ b/sysdeps/x86_64/multiarch/wcscpy.c
@@ -28,6 +28,9 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
@@ -45,6 +48,12 @@ IFUNC_SELECTOR (void)
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !))
+	return OPTIMIZE (avx2);
     }
 
   if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
new file mode 100644
index 0000000000..609d6e69c0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSNCAT	__wcsncat_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcsncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2.S b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
new file mode 100644
index 0000000000..a72105b7e9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
@@ -0,0 +1,9 @@
+#ifndef WCSNCAT
+# define WCSNCAT	__wcsncat_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRNCAT	WCSNCAT
+#include "strncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
index 4b55cb40bc..9ced02b35e 100644
--- a/sysdeps/x86_64/multiarch/wcsncat-generic.c
+++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSNCAT __wcsncat_generic
 # include <wcsmbs/wcsncat.c>
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
new file mode 100644
index 0000000000..cab5a6b820
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define WCSNCPY	__wcsncpy_avx2_rtm
+#include "x86-avx2-rtm-vecs.h"
+#include "wcsncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
new file mode 100644
index 0000000000..3a1a8a372c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
@@ -0,0 +1,7 @@
+#ifndef WCSNCPY
+# define WCSNCPY	__wcsncpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define STRNCPY	WCSNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
index d0e8a86605..693521713b 100644
--- a/sysdeps/x86_64/multiarch/wcsncpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSNCPY __wcsncpy_generic
 # include <wcsmbs/wcsncpy.c>
diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
index d52525f288..2e4d69a500 100644
--- a/sysdeps/x86_64/wcpcpy-generic.c
+++ b/sysdeps/x86_64/wcpcpy-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcpcpy.c>
 
diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
index 97e9207c16..cfde4309fe 100644
--- a/sysdeps/x86_64/wcpcpy.S
+++ b/sysdeps/x86_64/wcpcpy.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCPCPY	__wcpcpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcpcpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcpcpy-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
index 871219a445..1f12a0e4c6 100644
--- a/sysdeps/x86_64/wcpncpy-generic.c
+++ b/sysdeps/x86_64/wcpncpy-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcpncpy.c>
 
diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
index 2169ed5545..2f89482d30 100644
--- a/sysdeps/x86_64/wcpncpy.S
+++ b/sysdeps/x86_64/wcpncpy.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCPNCPY	__wcpncpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcpncpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcpncpy-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
index 85f981a81f..3552167ebe 100644
--- a/sysdeps/x86_64/wcscat-generic.c
+++ b/sysdeps/x86_64/wcscat-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcscat.c>
 
diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
index 8432087c7c..2e59987e76 100644
--- a/sysdeps/x86_64/wcscat.S
+++ b/sysdeps/x86_64/wcscat.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCSCAT	__wcscat
 
 # define DEFAULT_IMPL_V4	"multiarch/wcscat-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcscat-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
index ff8bdd3aea..ab9288ed74 100644
--- a/sysdeps/x86_64/wcscpy.S
+++ b/sysdeps/x86_64/wcscpy.S
@@ -29,6 +29,7 @@
 # define WCSCPY	__wcscpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcscpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcscpy-avx2.S"
 # define DEFAULT_IMPL_V2	"multiarch/wcscpy-ssse3.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
index 2cc0f7b11a..47f6a8ad56 100644
--- a/sysdeps/x86_64/wcsncat-generic.c
+++ b/sysdeps/x86_64/wcsncat-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcsncat.c>
 
diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
index 64e144a9c7..9a55499131 100644
--- a/sysdeps/x86_64/wcsncat.S
+++ b/sysdeps/x86_64/wcsncat.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCSNCAT	wcsncat
 
 # define DEFAULT_IMPL_V4	"multiarch/wcsncat-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcsncat-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
index 49d06b8ae8..7f19fcaddc 100644
--- a/sysdeps/x86_64/wcsncpy-generic.c
+++ b/sysdeps/x86_64/wcsncpy-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcsncpy.c>
 
diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
index 1450c1aa28..dc44b32395 100644
--- a/sysdeps/x86_64/wcsncpy.S
+++ b/sysdeps/x86_64/wcsncpy.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCSNCPY	__wcsncpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcsncpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcsncpy-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v3 2/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-04 22:47         ` H.J. Lu
@ 2022-11-04 23:06           ` Noah Goldstein
  0 siblings, 0 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-04 23:06 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Nov 4, 2022 at 3:47 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Nov 4, 2022 at 3:28 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Nov 4, 2022 at 2:46 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Nov 04, 2022 at 01:13:11PM -0700, Noah Goldstein wrote:
> > > > Optimizations are:
> > > >     1. Use more overlapping stores to avoid branches (a short
> > > >        scalar sketch of the idea follows this list).
> > > >     2. Reduce how unrolled the aligning copies are (this is more of a
> > > >        code-size save; it's a negative for some sizes in terms of
> > > >        perf).
> > > >     3. Improve the loop a bit (similar to what we do in strlen with
> > > >        2x vpminu + kortest instead of 3x vpminu + kmov + test).
> > > >     4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
> > > >        number that are taken.
> > > >
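A minimal C sketch of the "overlapping stores" idea in point (1), purely
for illustration (not code from the patch; the function name is made up):

    #include <string.h>
    #include <stdint.h>

    /* Copy n bytes, 4 <= n <= 8, with two possibly-overlapping 4-byte
       stores instead of branching on the exact size.  The patch plays
       the same trick with vector-sized loads and stores.  */
    void
    copy_4_to_8 (char *dst, const char *src, size_t n)
    {
      uint32_t head, tail;
      memcpy (&head, src, 4);
      memcpy (&tail, src + n - 4, 4);
      memcpy (dst, &head, 4);
      memcpy (dst + n - 4, &tail, 4);  /* overlaps head when n < 8 */
    }
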
> > > > Performance Changes:
> > > >
> > > >     Times are from N = 10 runs of the benchmark suite and are
> > > >     reported as geometric mean of all ratios of
> > > >     New Implementation / Old Implementation.
> > > >
> > > >     stpcpy-evex      -> 0.922
> > > >     strcat-evex      -> 0.985
> > > >     strcpy-evex      -> 0.880
> > > >
> > > >     strncpy-evex     -> 0.831
> > > >     stpncpy-evex     -> 0.780
> > > >
> > > >     strncat-evex     -> 0.958
> > > >
> > > > Code Size Changes:
> > > >     function         -> Bytes New / Bytes Old -> Ratio
> > > >
> > > >     strcat-evex      ->  819 / 1874 -> 0.437
> > > >     strcpy-evex      ->  700 / 1074 -> 0.652
> > > >     stpcpy-evex      ->  735 / 1094 -> 0.672
> > > >
> > > >     strncpy-evex     -> 1397 / 2611 -> 0.535
> > > >     stpncpy-evex     -> 1489 / 2691 -> 0.553
> > > >
> > > >     strncat-evex     -> 1184 / 2832 -> 0.418
> > > >
> > > > Notes:
> > > >     1. Because of the significant difference between the
> > > >        implementations they are split into three files.
> > > >
> > > >            strcpy-evex.S    -> strcpy, stpcpy, strcat
> > > >            strncpy-evex.S   -> strncpy
> > > >            strncat-evex.S   -> strncat
> > > >
> > > >        I couldn't find a way to merge them without making the
> > > >        ifdefs incredibly difficult to follow.
> > > >
> > > >     2. All implementations can be made evex512 by including
> > > >        "x86-evex512-vecs.h" at the top.
> > > >
> > > >     3. All implementations have an optional define:
> > > >         `USE_EVEX_MASKED_STORE`
> > > >        Setting it to one uses evex-masked stores for handling short
> > > >        strings.  This saves code size and branches (a hedged
> > > >        intrinsics sketch of the idea follows below).  It's disabled
> > > >        for all implementations at the moment as there are some
> > > >        serious drawbacks to masked stores in certain cases, but
> > > >        that may be fixed on future architectures.
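
A hedged intrinsics sketch of what USE_EVEX_MASKED_STORE amounts to for
a short copy, assuming AVX512VL/AVX512BW and a source already known not
to cross a page; illustrative only, not the patch's code:

    #include <immintrin.h>

    /* Write exactly len bytes (len <= 32) with one masked store, so no
       per-size branching is needed.  The drawback noted above: a masked
       store whose destination splits a page that is not in the TLB can
       be very slow.  */
    void
    copy_short_masked (char *dst, const char *src, unsigned int len)
    {
      /* Full-width load; assumes the source page-cross case was already
         handled, as in the patch.  */
      __m256i v = _mm256_loadu_si256 ((const __m256i *) src);
      __mmask32 k = (len >= 32) ? ~(__mmask32) 0
                                : (((__mmask32) 1 << len) - 1);
      _mm256_mask_storeu_epi8 (dst, k, v);
    }
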
> > > >
> > > > Full check passes on x86-64 and build succeeds for all ISA levels w/
> > > > and w/o multiarch.
> > > > ---
> > > >  sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
> > > >  sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
> > > >  sysdeps/x86_64/multiarch/strcat-strlen-evex.S |  110 ++
> > > >  sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
> > > >  sysdeps/x86_64/multiarch/strncat-evex.S       |  525 ++++++-
> > > >  sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
> > > >  .../multiarch/strncpy-or-cat-overflow-def.h   |   65 +
> > > >  7 files changed, 2100 insertions(+), 1173 deletions(-)
> > > >  create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> > > >  create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> > > > index 99ea76a372..3693491baa 100644
> > > > --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
> > > > +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> > > > @@ -3,6 +3,5 @@
> > > >  #endif
> > > >
> > > >  #define USE_AS_STPCPY
> > > > -#define USE_AS_STRNCPY
> > > > -#define STRCPY       STPNCPY
> > > > -#include "strcpy-evex.S"
> > > > +#define STRNCPY      STPNCPY
> > > > +#include "strncpy-evex.S"
> > > > diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
> > > > index 0e2df947e9..b4207b7889 100644
> > > > --- a/sysdeps/x86_64/multiarch/strcat-evex.S
> > > > +++ b/sysdeps/x86_64/multiarch/strcat-evex.S
> > > > @@ -1,286 +1,7 @@
> > > > -/* strcat with 256-bit EVEX instructions.
> > > > -   Copyright (C) 2021-2022 Free Software Foundation, Inc.
> > > > -   This file is part of the GNU C Library.
> > > > -
> > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > -   modify it under the terms of the GNU Lesser General Public
> > > > -   License as published by the Free Software Foundation; either
> > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > -
> > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > -   Lesser General Public License for more details.
> > > > -
> > > > -   You should have received a copy of the GNU Lesser General Public
> > > > -   License along with the GNU C Library; if not, see
> > > > -   <https://www.gnu.org/licenses/>.  */
> > > > -
> > > > -#include <isa-level.h>
> > > > -
> > > > -#if ISA_SHOULD_BUILD (4)
> > > > -
> > > > -
> > > > -# include <sysdep.h>
> > > > -
> > > > -# ifndef STRCAT
> > > > -#  define STRCAT  __strcat_evex
> > > > -# endif
> > > > -
> > > > -# define VMOVU               vmovdqu64
> > > > -# define VMOVA               vmovdqa64
> > > > -
> > > > -/* zero register */
> > > > -# define XMMZERO     xmm16
> > > > -# define YMMZERO     ymm16
> > > > -# define YMM0                ymm17
> > > > -# define YMM1                ymm18
> > > > -
> > > > -# define USE_AS_STRCAT
> > > > -
> > > > -/* Number of bytes in a vector register */
> > > > -# define VEC_SIZE    32
> > > > -
> > > > -     .section .text.evex,"ax",@progbits
> > > > -ENTRY (STRCAT)
> > > > -     mov     %rdi, %r9
> > > > -# ifdef USE_AS_STRNCAT
> > > > -     mov     %rdx, %r8
> > > > -# endif
> > > > -
> > > > -     xor     %eax, %eax
> > > > -     mov     %edi, %ecx
> > > > -     and     $((VEC_SIZE * 4) - 1), %ecx
> > > > -     vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> > > > -     cmp     $(VEC_SIZE * 3), %ecx
> > > > -     ja      L(fourth_vector_boundary)
> > > > -     vpcmpb  $0, (%rdi), %YMMZERO, %k0
> > > > -     kmovd   %k0, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_first_vector)
> > > > -     mov     %rdi, %rax
> > > > -     and     $-VEC_SIZE, %rax
> > > > -     jmp     L(align_vec_size_start)
> > > > -L(fourth_vector_boundary):
> > > > -     mov     %rdi, %rax
> > > > -     and     $-VEC_SIZE, %rax
> > > > -     vpcmpb  $0, (%rax), %YMMZERO, %k0
> > > > -     mov     $-1, %r10d
> > > > -     sub     %rax, %rcx
> > > > -     shl     %cl, %r10d
> > > > -     kmovd   %k0, %edx
> > > > -     and     %r10d, %edx
> > > > -     jnz     L(exit)
> > > > -
> > > > -L(align_vec_size_start):
> > > > -     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> > > > -     kmovd   %k0, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_second_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > > > -     kmovd   %k1, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_third_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > > > -     kmovd   %k2, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_fourth_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > > > -     kmovd   %k3, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_fifth_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > > > -     add     $(VEC_SIZE * 4), %rax
> > > > -     kmovd   %k4, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_second_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > > > -     kmovd   %k1, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_third_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > > > -     kmovd   %k2, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_fourth_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > > > -     kmovd   %k3, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_fifth_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > > > -     kmovd   %k4, %edx
> > > > -     add     $(VEC_SIZE * 4), %rax
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_second_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > > > -     kmovd   %k1, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_third_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > > > -     kmovd   %k2, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_fourth_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > > > -     kmovd   %k3, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_fifth_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > > > -     add     $(VEC_SIZE * 4), %rax
> > > > -     kmovd   %k4, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_second_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > > > -     kmovd   %k1, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_third_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > > > -     kmovd   %k2, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_fourth_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > > > -     kmovd   %k3, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_fifth_vector)
> > > > -
> > > > -     test    $((VEC_SIZE * 4) - 1), %rax
> > > > -     jz      L(align_four_vec_loop)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> > > > -     add     $(VEC_SIZE * 5), %rax
> > > > -     kmovd   %k4, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit)
> > > > -
> > > > -     test    $((VEC_SIZE * 4) - 1), %rax
> > > > -     jz      L(align_four_vec_loop)
> > > > -
> > > > -     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> > > > -     add     $VEC_SIZE, %rax
> > > > -     kmovd   %k0, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit)
> > > > -
> > > > -     test    $((VEC_SIZE * 4) - 1), %rax
> > > > -     jz      L(align_four_vec_loop)
> > > > -
> > > > -     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
> > > > -     add     $VEC_SIZE, %rax
> > > > -     kmovd   %k0, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit)
> > > > -
> > > > -     test    $((VEC_SIZE * 4) - 1), %rax
> > > > -     jz      L(align_four_vec_loop)
> > > > -
> > > > -     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k1
> > > > -     add     $VEC_SIZE, %rax
> > > > -     kmovd   %k1, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit)
> > > > -
> > > > -     add     $VEC_SIZE, %rax
> > > > -
> > > > -     .p2align 4
> > > > -L(align_four_vec_loop):
> > > > -     VMOVA   (%rax), %YMM0
> > > > -     VMOVA   (VEC_SIZE * 2)(%rax), %YMM1
> > > > -     vpminub VEC_SIZE(%rax), %YMM0, %YMM0
> > > > -     vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
> > > > -     vpminub %YMM0, %YMM1, %YMM0
> > > > -     /* If K0 != 0, there is a null byte.  */
> > > > -     vpcmpb  $0, %YMM0, %YMMZERO, %k0
> > > > -     add     $(VEC_SIZE * 4), %rax
> > > > -     ktestd  %k0, %k0
> > > > -     jz      L(align_four_vec_loop)
> > > > -
> > > > -     vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
> > > > -     sub     $(VEC_SIZE * 5), %rax
> > > > -     kmovd   %k0, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_second_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> > > > -     kmovd   %k1, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_third_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> > > > -     kmovd   %k2, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(exit_null_on_fourth_vector)
> > > > -
> > > > -     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> > > > -     kmovd   %k3, %edx
> > > > -     sub     %rdi, %rax
> > > > -     bsf     %rdx, %rdx
> > > > -     add     %rdx, %rax
> > > > -     add     $(VEC_SIZE * 4), %rax
> > > > -     jmp     L(StartStrcpyPart)
> > > > -
> > > > -     .p2align 4
> > > > -L(exit):
> > > > -     sub     %rdi, %rax
> > > > -L(exit_null_on_first_vector):
> > > > -     bsf     %rdx, %rdx
> > > > -     add     %rdx, %rax
> > > > -     jmp     L(StartStrcpyPart)
> > > > -
> > > > -     .p2align 4
> > > > -L(exit_null_on_second_vector):
> > > > -     sub     %rdi, %rax
> > > > -     bsf     %rdx, %rdx
> > > > -     add     %rdx, %rax
> > > > -     add     $VEC_SIZE, %rax
> > > > -     jmp     L(StartStrcpyPart)
> > > > -
> > > > -     .p2align 4
> > > > -L(exit_null_on_third_vector):
> > > > -     sub     %rdi, %rax
> > > > -     bsf     %rdx, %rdx
> > > > -     add     %rdx, %rax
> > > > -     add     $(VEC_SIZE * 2), %rax
> > > > -     jmp     L(StartStrcpyPart)
> > > > -
> > > > -     .p2align 4
> > > > -L(exit_null_on_fourth_vector):
> > > > -     sub     %rdi, %rax
> > > > -     bsf     %rdx, %rdx
> > > > -     add     %rdx, %rax
> > > > -     add     $(VEC_SIZE * 3), %rax
> > > > -     jmp     L(StartStrcpyPart)
> > > > -
> > > > -     .p2align 4
> > > > -L(exit_null_on_fifth_vector):
> > > > -     sub     %rdi, %rax
> > > > -     bsf     %rdx, %rdx
> > > > -     add     %rdx, %rax
> > > > -     add     $(VEC_SIZE * 4), %rax
> > > > -
> > > > -     .p2align 4
> > > > -L(StartStrcpyPart):
> > > > -     lea     (%r9, %rax), %rdi
> > > > -     mov     %rsi, %rcx
> > > > -     mov     %r9, %rax      /* save result */
> > > > -
> > > > -# ifdef USE_AS_STRNCAT
> > > > -     test    %r8, %r8
> > > > -     jz      L(ExitZero)
> > > > -#  define USE_AS_STRNCPY
> > > > -# endif
> > > > -
> > > > -# include "strcpy-evex.S"
> > > > +#ifndef STRCAT
> > > > +# define STRCAT      __strcat_evex
> > > >  #endif
> > > > +
> > > > +#define USE_AS_STRCAT
> > > > +#define STRCPY       STRCAT
> > > > +#include "strcpy-evex.S"
> > > > diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> > > > new file mode 100644
> > > > index 0000000000..9530d7b683
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S
> > > > @@ -0,0 +1,110 @@
> > > > +/* strlen used for the beginning of str{n}cat using EVEX 256/512.
> > > > +   Copyright (C) 2011-2022 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +
> > > > +/* NOTE: This file is meant to be included by strcat-evex or
> > > > +   strncat-evex and is not standalone.  Before including it, %rdi
> > > > +   must be saved in %rax.  */
> > >
> > > Since this file isn't standalone, please rename it to .h.
> >
> > Can it be .h.S so it plays well with IDE modes?
>
> It sounds reasonable.
Fixed in V4.
>
> > >
> > > > +
> > > > +
> > > > +/* Simple strlen implementation that ends at
> > > > +   L(strcat_strlen_done).  */
> > > > +     vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> > > > +     movq    %rdi, %r8
> > > > +     andq    $(VEC_SIZE * -1), %r8
> > > > +     VPCMPEQ (%r8), %VZERO, %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +#ifdef USE_AS_WCSCPY
> > > > +     subl    %r8d, %edi
> > > > +     shrl    $2, %edi
> > > > +#endif
> > > > +     shrx    %VRDI, %VRCX, %VRCX
> > > > +#ifdef USE_AS_WCSCPY
> > > > +     movq    %rax, %rdi
> > > > +#endif
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(bsf_and_done_v0)
> > > > +
> > > > +
> > > > +     VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     leaq    (VEC_SIZE)(%r8), %rdi
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(bsf_and_done_v0)
> > > > +
> > > > +     VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(bsf_and_done_v1)
> > > > +
> > > > +     VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(bsf_and_done_v2)
> > > > +
> > > > +     VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(bsf_and_done_v3)
> > > > +
> > > > +     andq    $-(VEC_SIZE * 4), %rdi
> > > > +     .p2align 4,, 8
> > > > +L(loop_2x_vec):
> > > > +     VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(0)
> > > > +     VPMIN   (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
> > > > +     VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(2)
> > > > +     VPMIN   (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
> > > > +     VPTESTN %VMM(1), %VMM(1), %k1
> > > > +     VPTESTN %VMM(3), %VMM(3), %k3
> > > > +     subq    $(VEC_SIZE * -4), %rdi
> > > > +     KORTEST %k1, %k3
> > > > +     jz      L(loop_2x_vec)
> > > > +
> > > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(bsf_and_done_v0)
> > > > +
> > > > +     KMOV    %k1, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(bsf_and_done_v1)
> > > > +
> > > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(bsf_and_done_v2)
> > > > +
> > > > +     KMOV    %k3, %VRCX
> > > > +L(bsf_and_done_v3):
> > > > +     addq    $VEC_SIZE, %rdi
> > > > +L(bsf_and_done_v2):
> > > > +     bsf     %VRCX, %VRCX
> > > > +     leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
> > > > +     jmp     L(strcat_strlen_done)
> > > > +
> > > > +     .p2align 4,, 4
> > > > +L(bsf_and_done_v1):
> > > > +     addq    $VEC_SIZE, %rdi
> > > > +L(bsf_and_done_v0):
> > > > +     bsf     %VRCX, %VRCX
> > > > +#ifdef USE_AS_WCSCPY
> > > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > > +#else
> > > > +     addq    %rcx, %rdi
> > > > +#endif
> > > > +L(strcat_strlen_done):
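
The loop in the file above is the "2x vpminu + kortest" shape mentioned
in the commit message.  A scalar C analogue of its exit test, purely for
illustration (8-byte words stand in for vector registers; the helper
names are made up):

    #include <stdint.h>
    #include <string.h>

    /* Unsigned byte-wise min: a zero byte in either input survives.  */
    static uint64_t
    min_bytes (uint64_t a, uint64_t b)
    {
      uint64_t r = 0;
      for (int i = 0; i < 8; i++)
        {
          uint8_t x = a >> (8 * i), y = b >> (8 * i);
          r |= (uint64_t) (x < y ? x : y) << (8 * i);
        }
      return r;
    }

    static int
    has_zero_byte (uint64_t v)
    {
      return ((v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL) != 0;
    }

    /* Fold four chunks pairwise with min, then do one combined test: a
       single branch (the KORTEST) covers all four chunks.  */
    int
    any_null_in_32 (const unsigned char *p)
    {
      uint64_t c[4];
      memcpy (c, p, sizeof c);
      return has_zero_byte (min_bytes (c[0], c[1]))
             | has_zero_byte (min_bytes (c[2], c[3]));
    }
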
> > > > diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
> > > > index 82e45ac675..1ba0195ed2 100644
> > > > --- a/sysdeps/x86_64/multiarch/strcpy-evex.S
> > > > +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
> > > > @@ -1,4 +1,4 @@
> > > > -/* strcpy with 256-bit EVEX instructions.
> > > > +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
> > > >     Copyright (C) 2021-2022 Free Software Foundation, Inc.
> > > >     This file is part of the GNU C Library.
> > > >
> > > > @@ -17,990 +17,526 @@
> > > >     <https://www.gnu.org/licenses/>.  */
> > > >
> > > >  #include <isa-level.h>
> > > > -
> > > >  #if ISA_SHOULD_BUILD (4)
> > > >
> > > >
> > > > -# ifndef USE_AS_STRCAT
> > > > -#  include <sysdep.h>
> > > > +     /* Use evex-masked stores for small sizes. Turned off at the
> > > > +        moment.  */
> > > > +# define USE_EVEX_MASKED_STORE       0
> > > > +     /* Use movsb in page cross case to save code size.  */
> > > > +# define USE_MOVSB_IN_PAGE_CROSS     1
> > > >
> > > > -#  ifndef STRCPY
> > > > -#   define STRCPY  __strcpy_evex
> > > > -#  endif
> > > > +# include <sysdep.h>
> > > >
> > > > +# ifndef VEC_SIZE
> > > > +#  include "x86-evex256-vecs.h"
> > > >  # endif
> > > >
> > > > -# define VMOVU               vmovdqu64
> > > > -# define VMOVA               vmovdqa64
> > > > -
> > > > -/* Number of bytes in a vector register */
> > > > -# ifndef VEC_SIZE
> > > > -#  define VEC_SIZE   32
> > > > +# ifndef STRCPY
> > > > +#  define STRCPY     __strcpy_evex
> > > >  # endif
> > > >
> > > > -# define XMM2                xmm18
> > > > -# define XMM3                xmm19
> > > >
> > > > -# define YMM2                ymm18
> > > > -# define YMM3                ymm19
> > > > -# define YMM4                ymm20
> > > > -# define YMM5                ymm21
> > > > -# define YMM6                ymm22
> > > > -# define YMM7                ymm23
> > > > +# ifdef USE_AS_WCSCPY
> > > > +#  define VMOVU_MASK vmovdqu32
> > > > +#  define VPMIN      vpminud
> > > > +#  define VPTESTN    vptestnmd
> > > > +#  define VPTEST     vptestmd
> > > > +#  define VPCMPEQ    vpcmpeqd
> > > > +#  define CHAR_SIZE  4
> > > >
> > > > -# ifndef USE_AS_STRCAT
> > > > +#  define REP_MOVS   rep movsd
> > > >
> > > > -/* zero register */
> > > > -#  define XMMZERO    xmm16
> > > > -#  define YMMZERO    ymm16
> > > > -#  define YMM1               ymm17
> > > > -
> > > > -     .section .text.evex,"ax",@progbits
> > > > -ENTRY (STRCPY)
> > > > -#  ifdef USE_AS_STRNCPY
> > > > -     mov     %RDX_LP, %R8_LP
> > > > -     test    %R8_LP, %R8_LP
> > > > -     jz      L(ExitZero)
> > > > -#  endif
> > > > -     mov     %rsi, %rcx
> > > > -#  ifndef USE_AS_STPCPY
> > > > -     mov     %rdi, %rax      /* save result */
> > > > -#  endif
> > > > +#  define USE_WIDE_CHAR
> > > > +# else
> > > > +#  define VMOVU_MASK vmovdqu8
> > > > +#  define VPMIN      vpminub
> > > > +#  define VPTESTN    vptestnmb
> > > > +#  define VPTEST     vptestmb
> > > > +#  define VPCMPEQ    vpcmpeqb
> > > > +#  define CHAR_SIZE  1
> > > >
> > > > -     vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> > > > +#  define REP_MOVS   rep movsb
> > > >  # endif
> > > >
> > > > -     and     $((VEC_SIZE * 4) - 1), %ecx
> > > > -     cmp     $(VEC_SIZE * 2), %ecx
> > > > -     jbe     L(SourceStringAlignmentLessTwoVecSize)
> > > > -
> > > > -     and     $-VEC_SIZE, %rsi
> > > > -     and     $(VEC_SIZE - 1), %ecx
> > > > -
> > > > -     vpcmpb  $0, (%rsi), %YMMZERO, %k0
> > > > -     kmovd   %k0, %edx
> > > > -     shr     %cl, %rdx
> > > > +# include "reg-macros.h"
> > > >
> > > > -# ifdef USE_AS_STRNCPY
> > > > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > > > -     mov     $VEC_SIZE, %r10
> > > > -     sub     %rcx, %r10
> > > > -     cmp     %r10, %r8
> > > > -#  else
> > > > -     mov     $(VEC_SIZE + 1), %r10
> > > > -     sub     %rcx, %r10
> > > > -     cmp     %r10, %r8
> > > > -#  endif
> > > > -     jbe     L(CopyVecSizeTailCase2OrCase3)
> > > > -# endif
> > > > -     test    %edx, %edx
> > > > -     jnz     L(CopyVecSizeTail)
> > > > -
> > > > -     vpcmpb  $0, VEC_SIZE(%rsi), %YMMZERO, %k1
> > > > -     kmovd   %k1, %edx
> > > >
> > > > -# ifdef USE_AS_STRNCPY
> > > > -     add     $VEC_SIZE, %r10
> > > > -     cmp     %r10, %r8
> > > > -     jbe     L(CopyTwoVecSizeCase2OrCase3)
> > > > -# endif
> > > > -     test    %edx, %edx
> > > > -     jnz     L(CopyTwoVecSize)
> > > > -
> > > > -     VMOVU   (%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
> > > > -     VMOVU   %YMM2, (%rdi)
> > > > -
> > > > -/* If source address alignment != destination address alignment */
> > > > -     .p2align 4
> > > > -L(UnalignVecSizeBoth):
> > > > -     sub     %rcx, %rdi
> > > > -# ifdef USE_AS_STRNCPY
> > > > -     add     %rcx, %r8
> > > > -     sbb     %rcx, %rcx
> > > > -     or      %rcx, %r8
> > > > -# endif
> > > > -     mov     $VEC_SIZE, %rcx
> > > > -     VMOVA   (%rsi, %rcx), %YMM2
> > > > -     VMOVU   %YMM2, (%rdi, %rcx)
> > > > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> > > > -     vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > > > -     kmovd   %k0, %edx
> > > > -     add     $VEC_SIZE, %rcx
> > > > -# ifdef USE_AS_STRNCPY
> > > > -     sub     $(VEC_SIZE * 3), %r8
> > > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > > -# endif
> > > > -     test    %edx, %edx
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     jnz     L(CopyVecSizeUnalignedVec2)
> > > > +# ifdef USE_AS_STPCPY
> > > > +#  define END_REG    rax
> > > >  # else
> > > > -     jnz     L(CopyVecSize)
> > > > +#  define END_REG    rdi, %rdx, CHAR_SIZE
> > > >  # endif
> > > >
> > > > -     VMOVU   %YMM2, (%rdi, %rcx)
> > > > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM3
> > > > -     vpcmpb  $0, %YMM3, %YMMZERO, %k0
> > > > -     kmovd   %k0, %edx
> > > > -     add     $VEC_SIZE, %rcx
> > > > -# ifdef USE_AS_STRNCPY
> > > > -     sub     $VEC_SIZE, %r8
> > > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > > -# endif
> > > > -     test    %edx, %edx
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     jnz     L(CopyVecSizeUnalignedVec3)
> > > > +# ifdef USE_AS_STRCAT
> > > > +#  define PAGE_ALIGN_REG     edx
> > > > +#  define PAGE_ALIGN_REG_64  rdx
> > > >  # else
> > > > -     jnz     L(CopyVecSize)
> > > > +#  define PAGE_ALIGN_REG     eax
> > > > +#  define PAGE_ALIGN_REG_64  rax
> > > >  # endif
> > > >
> > > > -     VMOVU   %YMM3, (%rdi, %rcx)
> > > > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM4
> > > > -     vpcmpb  $0, %YMM4, %YMMZERO, %k0
> > > > -     kmovd   %k0, %edx
> > > > -     add     $VEC_SIZE, %rcx
> > > > -# ifdef USE_AS_STRNCPY
> > > > -     sub     $VEC_SIZE, %r8
> > > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > > -# endif
> > > > -     test    %edx, %edx
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     jnz     L(CopyVecSizeUnalignedVec4)
> > > > -# else
> > > > -     jnz     L(CopyVecSize)
> > > > -# endif
> > > > +# define VZERO       VMM(7)
> > > > +# define VZERO_128   VMM_128(7)
> > > >
> > > > -     VMOVU   %YMM4, (%rdi, %rcx)
> > > > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> > > > -     vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > > > -     kmovd   %k0, %edx
> > > > -     add     $VEC_SIZE, %rcx
> > > > -# ifdef USE_AS_STRNCPY
> > > > -     sub     $VEC_SIZE, %r8
> > > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > > -# endif
> > > > -     test    %edx, %edx
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     jnz     L(CopyVecSizeUnalignedVec2)
> > > > -# else
> > > > -     jnz     L(CopyVecSize)
> > > > -# endif
> > > >
> > > > -     VMOVU   %YMM2, (%rdi, %rcx)
> > > > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM2
> > > > -     vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > > > -     kmovd   %k0, %edx
> > > > -     add     $VEC_SIZE, %rcx
> > > > -# ifdef USE_AS_STRNCPY
> > > > -     sub     $VEC_SIZE, %r8
> > > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > > -# endif
> > > > -     test    %edx, %edx
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     jnz     L(CopyVecSizeUnalignedVec2)
> > > > -# else
> > > > -     jnz     L(CopyVecSize)
> > > > -# endif
> > > > +# define PAGE_SIZE   4096
> > > > +# define CHAR_PER_VEC        (VEC_SIZE / CHAR_SIZE)
> > > >
> > > > -     VMOVA   VEC_SIZE(%rsi, %rcx), %YMM3
> > > > -     VMOVU   %YMM2, (%rdi, %rcx)
> > > > -     vpcmpb  $0, %YMM3, %YMMZERO, %k0
> > > > -     kmovd   %k0, %edx
> > > > -     add     $VEC_SIZE, %rcx
> > > > -# ifdef USE_AS_STRNCPY
> > > > -     sub     $VEC_SIZE, %r8
> > > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > > -# endif
> > > > -     test    %edx, %edx
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     jnz     L(CopyVecSizeUnalignedVec3)
> > > > -# else
> > > > -     jnz     L(CopyVecSize)
> > > > -# endif
> > > >
> > > > -     VMOVU   %YMM3, (%rdi, %rcx)
> > > > -     mov     %rsi, %rdx
> > > > -     lea     VEC_SIZE(%rsi, %rcx), %rsi
> > > > -     and     $-(VEC_SIZE * 4), %rsi
> > > > -     sub     %rsi, %rdx
> > > > -     sub     %rdx, %rdi
> > > > -# ifdef USE_AS_STRNCPY
> > > > -     lea     (VEC_SIZE * 8)(%r8, %rdx), %r8
> > > > -# endif
> > > > -L(UnalignedFourVecSizeLoop):
> > > > -     VMOVA   (%rsi), %YMM4
> > > > -     VMOVA   VEC_SIZE(%rsi), %YMM5
> > > > -     VMOVA   (VEC_SIZE * 2)(%rsi), %YMM6
> > > > -     VMOVA   (VEC_SIZE * 3)(%rsi), %YMM7
> > > > -     vpminub %YMM5, %YMM4, %YMM2
> > > > -     vpminub %YMM7, %YMM6, %YMM3
> > > > -     vpminub %YMM2, %YMM3, %YMM2
> > > > -     /* If K7 != 0, there is a null byte.  */
> > > > -     vpcmpb  $0, %YMM2, %YMMZERO, %k7
> > > > -     kmovd   %k7, %edx
> > > > -# ifdef USE_AS_STRNCPY
> > > > -     sub     $(VEC_SIZE * 4), %r8
> > > > -     jbe     L(UnalignedLeaveCase2OrCase3)
> > > > +     .section SECTION(.text), "ax", @progbits
> > > > +ENTRY(STRCPY)
> > > > +# ifdef USE_AS_STRCAT
> > > > +     movq    %rdi, %rax
> > > > +#  include "strcat-strlen-evex.S"
> > > >  # endif
> > > > -     test    %edx, %edx
> > > > -     jnz     L(UnalignedFourVecSizeLeave)
> > > > -
> > > > -L(UnalignedFourVecSizeLoop_start):
> > > > -     add     $(VEC_SIZE * 4), %rdi
> > > > -     add     $(VEC_SIZE * 4), %rsi
> > > > -     VMOVU   %YMM4, -(VEC_SIZE * 4)(%rdi)
> > > > -     VMOVA   (%rsi), %YMM4
> > > > -     VMOVU   %YMM5, -(VEC_SIZE * 3)(%rdi)
> > > > -     VMOVA   VEC_SIZE(%rsi), %YMM5
> > > > -     vpminub %YMM5, %YMM4, %YMM2
> > > > -     VMOVU   %YMM6, -(VEC_SIZE * 2)(%rdi)
> > > > -     VMOVA   (VEC_SIZE * 2)(%rsi), %YMM6
> > > > -     VMOVU   %YMM7, -VEC_SIZE(%rdi)
> > > > -     VMOVA   (VEC_SIZE * 3)(%rsi), %YMM7
> > > > -     vpminub %YMM7, %YMM6, %YMM3
> > > > -     vpminub %YMM2, %YMM3, %YMM2
> > > > -     /* If K7 != 0, there is a null byte.  */
> > > > -     vpcmpb  $0, %YMM2, %YMMZERO, %k7
> > > > -     kmovd   %k7, %edx
> > > > -# ifdef USE_AS_STRNCPY
> > > > -     sub     $(VEC_SIZE * 4), %r8
> > > > -     jbe     L(UnalignedLeaveCase2OrCase3)
> > > > +
> > > > +     movl    %esi, %PAGE_ALIGN_REG
> > > > +     andl    $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> > > > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> > > > +     ja      L(page_cross)
> > > > +L(page_cross_continue):
> > > > +     VMOVU   (%rsi), %VMM(0)
> > > > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > > > +     movq    %rdi, %rax
> > > >  # endif
> > > > -     test    %edx, %edx
> > > > -     jz      L(UnalignedFourVecSizeLoop_start)
> > > >
> > > > -L(UnalignedFourVecSizeLeave):
> > > > -     vpcmpb  $0, %YMM4, %YMMZERO, %k1
> > > > -     kmovd   %k1, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(CopyVecSizeUnaligned_0)
> > > >
> > > > -     vpcmpb  $0, %YMM5, %YMMZERO, %k2
> > > > -     kmovd   %k2, %ecx
> > > > -     test    %ecx, %ecx
> > > > -     jnz     L(CopyVecSizeUnaligned_16)
> > > > +     /* Two short-string implementations: one with a traditional
> > > > +        branching approach and one with masked instructions (which
> > > > +        have the potential for dramatically bad perf if dst splits
> > > > +        a page and is not in the TLB).  */
> > > > +# if USE_EVEX_MASKED_STORE
> > > > +     VPTEST  %VMM(0), %VMM(0), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     subl    $((1 << CHAR_PER_VEC)- 1), %VRCX
> > > > +#  else
> > > > +     inc     %VRCX
> > > > +#  endif
> > > > +     jz      L(more_1x_vec)
> > > > +     KMOV    %VRCX, %k1
> > > > +     KXOR    %k0, %k1, %k1
> > > >
> > > > -     vpcmpb  $0, %YMM6, %YMMZERO, %k3
> > > > -     kmovd   %k3, %edx
> > > > -     test    %edx, %edx
> > > > -     jnz     L(CopyVecSizeUnaligned_32)
> > > > -
> > > > -     vpcmpb  $0, %YMM7, %YMMZERO, %k4
> > > > -     kmovd   %k4, %ecx
> > > > -     bsf     %ecx, %edx
> > > > -     VMOVU   %YMM4, (%rdi)
> > > > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > > > -     VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -# ifdef USE_AS_STPCPY
> > > > -     lea     (VEC_SIZE * 3)(%rdi, %rdx), %rax
> > > > -# endif
> > > > -     VMOVU   %YMM7, (VEC_SIZE * 3)(%rdi)
> > > > -     add     $(VEC_SIZE - 1), %r8
> > > > -     sub     %rdx, %r8
> > > > -     lea     ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> > > > -     jmp     L(StrncpyFillTailWithZero)
> > > > -# else
> > > > -     add     $(VEC_SIZE * 3), %rsi
> > > > -     add     $(VEC_SIZE * 3), %rdi
> > > > -     jmp     L(CopyVecSizeExit)
> > > > -# endif
> > > > +     VMOVU_MASK %VMM(0), (%rdi){%k1}
> > > >
> > > > -/* If source address alignment == destination address alignment */
> > > > +#  ifdef USE_AS_STPCPY
> > > > +     bsf     %VRCX, %VRCX
> > > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rax
> > > > +#  endif
> > > > +     ret
> > > >
> > > > -L(SourceStringAlignmentLessTwoVecSize):
> > > > -     VMOVU   (%rsi), %YMM3
> > > > -     VMOVU   VEC_SIZE(%rsi), %YMM2
> > > > -     vpcmpb  $0, %YMM3, %YMMZERO, %k0
> > > > -     kmovd   %k0, %edx
> > > > +# else
> > > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jz      L(more_1x_vec)
> > > >
> > > > -# ifdef USE_AS_STRNCPY
> > > > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > > > -     cmp     $VEC_SIZE, %r8
> > > > +     xorl    %edx, %edx
> > > > +     bsf     %VRCX, %VRDX
> > > > +#  ifdef USE_AS_STPCPY
> > > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > > +#  endif
> > > > +
> > > > +     /* Use mask bits in rcx to detect which copy we need.  If the
> > > > +        low mask is zero then there must be a bit set in the upper
> > > > +        half.  I.e. if rcx != 0 and ecx == 0, then the match must be
> > > > +        in the upper 32 bits, so we use L(copy_32_63).  */
> > > > +#  if VEC_SIZE == 64
> > > > +#   ifdef USE_AS_WCSCPY
> > > > +     testb   %cl, %cl
> > > > +#   else
> > > > +     testl   %ecx, %ecx
> > > > +#   endif
> > > > +     jz      L(copy_32_63)
> > > > +#  endif
> > > > +
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     testb   $0xf, %cl
> > > >  #  else
> > > > -     cmp     $(VEC_SIZE + 1), %r8
> > > > +     testw   %cx, %cx
> > > >  #  endif
> > > > -     jbe     L(CopyVecSizeTail1Case2OrCase3)
> > > > -# endif
> > > > -     test    %edx, %edx
> > > > -     jnz     L(CopyVecSizeTail1)
> > > > +     jz      L(copy_16_31)
> > > >
> > > > -     VMOVU   %YMM3, (%rdi)
> > > > -     vpcmpb  $0, %YMM2, %YMMZERO, %k0
> > > > -     kmovd   %k0, %edx
> > > >
> > > > -# ifdef USE_AS_STRNCPY
> > > > -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > > > -     cmp     $(VEC_SIZE * 2), %r8
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     testb   $0x3, %cl
> > > >  #  else
> > > > -     cmp     $((VEC_SIZE * 2) + 1), %r8
> > > > +     testb   %cl, %cl
> > > >  #  endif
> > > > -     jbe     L(CopyTwoVecSize1Case2OrCase3)
> > > > -# endif
> > > > -     test    %edx, %edx
> > > > -     jnz     L(CopyTwoVecSize1)
> > > > -
> > > > -     and     $-VEC_SIZE, %rsi
> > > > -     and     $(VEC_SIZE - 1), %ecx
> > > > -     jmp     L(UnalignVecSizeBoth)
> > > > +     jz      L(copy_8_15)
> > > >
> > > > -/*------End of main part with loops---------------------*/
> > > >
> > > > -/* Case1 */
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     vmovd   %VMM_128(0), (%rdi)
> > > > +     /* No need to copy, we know it's zero.  */
> > > > +     movl    $0, (%END_REG)
> > > >
> > > > -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> > > > -     .p2align 4
> > > > -L(CopyVecSize):
> > > > -     add     %rcx, %rdi
> > > > -# endif
> > > > -L(CopyVecSizeTail):
> > > > -     add     %rcx, %rsi
> > > > -L(CopyVecSizeTail1):
> > > > -     bsf     %edx, %edx
> > > > -L(CopyVecSizeExit):
> > > > -     cmp     $32, %edx
> > > > -     jae     L(Exit32_63)
> > > > -     cmp     $16, %edx
> > > > -     jae     L(Exit16_31)
> > > > -     cmp     $8, %edx
> > > > -     jae     L(Exit8_15)
> > > > -     cmp     $4, %edx
> > > > -     jae     L(Exit4_7)
> > > > -     cmp     $3, %edx
> > > > -     je      L(Exit3)
> > > > -     cmp     $1, %edx
> > > > -     ja      L(Exit2)
> > > > -     je      L(Exit1)
> > > > -     movb    $0, (%rdi)
> > > > -# ifdef USE_AS_STPCPY
> > > > -     lea     (%rdi), %rax
> > > > -# endif
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     sub     $1, %r8
> > > > -     lea     1(%rdi), %rdi
> > > > -     jnz     L(StrncpyFillTailWithZero)
> > > > -# endif
> > > >       ret
> > > > +#  else
> > > >
> > > > -     .p2align 4
> > > > -L(CopyTwoVecSize1):
> > > > -     add     $VEC_SIZE, %rsi
> > > > -     add     $VEC_SIZE, %rdi
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     sub     $VEC_SIZE, %r8
> > > > -# endif
> > > > -     jmp     L(CopyVecSizeTail1)
> > > > -
> > > > -     .p2align 4
> > > > -L(CopyTwoVecSize):
> > > > -     bsf     %edx, %edx
> > > > -     add     %rcx, %rsi
> > > > -     add     $VEC_SIZE, %edx
> > > > -     sub     %ecx, %edx
> > > > -     jmp     L(CopyVecSizeExit)
> > > > -
> > > > -     .p2align 4
> > > > -L(CopyVecSizeUnaligned_0):
> > > > -     bsf     %edx, %edx
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -# ifdef USE_AS_STPCPY
> > > > -     lea     (%rdi, %rdx), %rax
> > > > -# endif
> > > > -     VMOVU   %YMM4, (%rdi)
> > > > -     add     $((VEC_SIZE * 4) - 1), %r8
> > > > -     sub     %rdx, %r8
> > > > -     lea     1(%rdi, %rdx), %rdi
> > > > -     jmp     L(StrncpyFillTailWithZero)
> > > > -# else
> > > > -     jmp     L(CopyVecSizeExit)
> > > > -# endif
> > > > +     testb   $0x7, %cl
> > > > +     jz      L(copy_4_7)
> > > >
> > > > -     .p2align 4
> > > > -L(CopyVecSizeUnaligned_16):
> > > > -     bsf     %ecx, %edx
> > > > -     VMOVU   %YMM4, (%rdi)
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -# ifdef USE_AS_STPCPY
> > > > -     lea     VEC_SIZE(%rdi, %rdx), %rax
> > > > -# endif
> > > > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > > > -     add     $((VEC_SIZE * 3) - 1), %r8
> > > > -     sub     %rdx, %r8
> > > > -     lea     (VEC_SIZE + 1)(%rdi, %rdx), %rdi
> > > > -     jmp     L(StrncpyFillTailWithZero)
> > > > -# else
> > > > -     add     $VEC_SIZE, %rsi
> > > > -     add     $VEC_SIZE, %rdi
> > > > -     jmp     L(CopyVecSizeExit)
> > > > -# endif
> > > >
> > > > -     .p2align 4
> > > > -L(CopyVecSizeUnaligned_32):
> > > > -     bsf     %edx, %edx
> > > > -     VMOVU   %YMM4, (%rdi)
> > > > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -# ifdef USE_AS_STPCPY
> > > > -     lea     (VEC_SIZE * 2)(%rdi, %rdx), %rax
> > > > -# endif
> > > > -     VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > > > -     add     $((VEC_SIZE * 2) - 1), %r8
> > > > -     sub     %rdx, %r8
> > > > -     lea     ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> > > > -     jmp     L(StrncpyFillTailWithZero)
> > > > -# else
> > > > -     add     $(VEC_SIZE * 2), %rsi
> > > > -     add     $(VEC_SIZE * 2), %rdi
> > > > -     jmp     L(CopyVecSizeExit)
> > > > -# endif
> > > > +     test    %edx, %edx
> > > > +     jz      L(set_null_term)
> > > >
> > > > -# ifdef USE_AS_STRNCPY
> > > > -#  ifndef USE_AS_STRCAT
> > > > -     .p2align 4
> > > > -L(CopyVecSizeUnalignedVec6):
> > > > -     VMOVU   %YMM6, (%rdi, %rcx)
> > > > -     jmp     L(CopyVecSizeVecExit)
> > > > -
> > > > -     .p2align 4
> > > > -L(CopyVecSizeUnalignedVec5):
> > > > -     VMOVU   %YMM5, (%rdi, %rcx)
> > > > -     jmp     L(CopyVecSizeVecExit)
> > > > -
> > > > -     .p2align 4
> > > > -L(CopyVecSizeUnalignedVec4):
> > > > -     VMOVU   %YMM4, (%rdi, %rcx)
> > > > -     jmp     L(CopyVecSizeVecExit)
> > > > -
> > > > -     .p2align 4
> > > > -L(CopyVecSizeUnalignedVec3):
> > > > -     VMOVU   %YMM3, (%rdi, %rcx)
> > > > -     jmp     L(CopyVecSizeVecExit)
> > > > +     /* NB: make this `vmovw` if support for AVX512-FP16 is added.
> > > > +      */
> > > > +     vmovd   %VMM_128(0), %esi
> > > > +     movw    %si, (%rdi)
> > > > +
> > > > +     .p2align 4,, 1
> > > > +L(set_null_term):
> > > > +     /* No need to copy, we know it's zero.  */
> > > > +     movb    $0, (%END_REG)
> > > > +     ret
> > > >  #  endif
> > > >
> > > > -/* Case2 */
> > > > -
> > > > -     .p2align 4
> > > > -L(CopyVecSizeCase2):
> > > > -     add     $VEC_SIZE, %r8
> > > > -     add     %rcx, %rdi
> > > > -     add     %rcx, %rsi
> > > > -     bsf     %edx, %edx
> > > > -     cmp     %r8d, %edx
> > > > -     jb      L(CopyVecSizeExit)
> > > > -     jmp     L(StrncpyExit)
> > > > -
> > > > -     .p2align 4
> > > > -L(CopyTwoVecSizeCase2):
> > > > -     add     %rcx, %rsi
> > > > -     bsf     %edx, %edx
> > > > -     add     $VEC_SIZE, %edx
> > > > -     sub     %ecx, %edx
> > > > -     cmp     %r8d, %edx
> > > > -     jb      L(CopyVecSizeExit)
> > > > -     jmp     L(StrncpyExit)
> > > > -
> > > > -L(CopyVecSizeTailCase2):
> > > > -     add     %rcx, %rsi
> > > > -     bsf     %edx, %edx
> > > > -     cmp     %r8d, %edx
> > > > -     jb      L(CopyVecSizeExit)
> > > > -     jmp     L(StrncpyExit)
> > > > -
> > > > -L(CopyVecSizeTail1Case2):
> > > > -     bsf     %edx, %edx
> > > > -     cmp     %r8d, %edx
> > > > -     jb      L(CopyVecSizeExit)
> > > > -     jmp     L(StrncpyExit)
> > > > -
> > > > -/* Case2 or Case3,  Case3 */
> > > > -
> > > > -     .p2align 4
> > > > -L(CopyVecSizeCase2OrCase3):
> > > > -     test    %rdx, %rdx
> > > > -     jnz     L(CopyVecSizeCase2)
> > > > -L(CopyVecSizeCase3):
> > > > -     add     $VEC_SIZE, %r8
> > > > -     add     %rcx, %rdi
> > > > -     add     %rcx, %rsi
> > > > -     jmp     L(StrncpyExit)
> > > > -
> > > > -     .p2align 4
> > > > -L(CopyTwoVecSizeCase2OrCase3):
> > > > -     test    %rdx, %rdx
> > > > -     jnz     L(CopyTwoVecSizeCase2)
> > > > -     add     %rcx, %rsi
> > > > -     jmp     L(StrncpyExit)
> > > > -
> > > > -     .p2align 4
> > > > -L(CopyVecSizeTailCase2OrCase3):
> > > > -     test    %rdx, %rdx
> > > > -     jnz     L(CopyVecSizeTailCase2)
> > > > -     add     %rcx, %rsi
> > > > -     jmp     L(StrncpyExit)
> > > > -
> > > > -     .p2align 4
> > > > -L(CopyTwoVecSize1Case2OrCase3):
> > > > -     add     $VEC_SIZE, %rdi
> > > > -     add     $VEC_SIZE, %rsi
> > > > -     sub     $VEC_SIZE, %r8
> > > > -L(CopyVecSizeTail1Case2OrCase3):
> > > > -     test    %rdx, %rdx
> > > > -     jnz     L(CopyVecSizeTail1Case2)
> > > > -     jmp     L(StrncpyExit)
> > > > +#  if VEC_SIZE == 64
> > > > +     .p2align 4,, 6
> > > > +L(copy_32_63):
> > > > +     VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > > > +     VMOVU   %VMM_256(0), (%rdi)
> > > > +     VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
> > > > +     ret
> > > > +#  endif
> > > > +
> > > > +
> > > > +     .p2align 4,, 6
> > > > +L(copy_16_31):
> > > > +     /* Use xmm1 explicitly here as it won't require a `vzeroupper`
> > > > +        and will save code size.  */
> > > > +     vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > > > +     VMOVU   %VMM_128(0), (%rdi)
> > > > +     vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> > > > +     ret
> > > > +
> > > > +     .p2align 4,, 8
> > > > +L(copy_8_15):
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     movl    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> > > > +#  else
> > > > +     movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
> > > > +#  endif
> > > > +     vmovq   %VMM_128(0), (%rdi)
> > > > +     movq    %rcx, -(8 - CHAR_SIZE)(%END_REG)
> > > > +     ret
> > > >  # endif
> > > >
> > > > -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
> > > >
> > > > -     .p2align 4
> > > > -L(Exit1):
> > > > -     movzwl  (%rsi), %edx
> > > > -     mov     %dx, (%rdi)
> > > > -# ifdef USE_AS_STPCPY
> > > > -     lea     1(%rdi), %rax
> > > > +# ifndef USE_AS_WCSCPY
> > > > +     .p2align 4,, 12
> > > > +L(copy_4_7):
> > > > +     movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> > > > +     vmovd   %VMM_128(0), (%rdi)
> > > > +     movl    %ecx, -(4 - CHAR_SIZE)(%END_REG)
> > > > +     ret
> > > >  # endif
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     sub     $2, %r8
> > > > -     lea     2(%rdi), %rdi
> > > > -     jnz     L(StrncpyFillTailWithZero)
> > > > +
> > > > +
> > > > +     .p2align 4,, 8
> > > > +L(more_1x_vec):
> > > > +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > > > +     VMOVU   %VMM(0), (%rdi)
> > > >  # endif
> > > > -     ret
> > > > +     subq    %rsi, %rdi
> > > > +     andq    $-(VEC_SIZE), %rsi
> > > > +     addq    %rsi, %rdi
> > > > +     VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> > > >
> > > > -     .p2align 4
> > > > -L(Exit2):
> > > > -     movzwl  (%rsi), %ecx
> > > > -     mov     %cx, (%rdi)
> > > > -     movb    $0, 2(%rdi)
> > > > +     /* Ideally we store after moves to minimize impact of potential
> > > > +        false-dependencies.  */
> > > > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > > > +     VMOVU   %VMM(0), (%rax)
> > > > +# endif
> > > > +
> > > > +     VPTESTN %VMM(1), %VMM(1), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(ret_vec_x1)
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > > > +     VMOVU   %VMM(1), VEC_SIZE(%rdi)
> > > > +
> > > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(ret_vec_x2)
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> > > > +     VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > > > +
> > > > +     VPTESTN %VMM(3), %VMM(3), %k0
> > > > +     KMOV    %k0, %VRDX
> > > > +     test    %VRDX, %VRDX
> > > > +     jnz     L(ret_vec_x3)
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > > > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > > > +     VPTESTN %VMM(4), %VMM(4), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(ret_vec_x4)
> > > > +
> > > > +     VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> > > > +
> > > > +
> > > > +     /* Align for 4x loop.  */
> > > > +     subq    %rsi, %rdi
> > > > +
> > > > +     /* + VEC_SIZE * 5 because we never added the original VEC_SIZE
> > > > +        we covered before aligning.  */
> > > > +     subq    $-(VEC_SIZE * 5), %rsi
> > > > +     andq    $-(VEC_SIZE * 4), %rsi
> > > > +
> > > > +
> > > > +     /* Load first half of the loop before entry.  */
> > > > +     VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > > > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > > > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > > > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > > > +
> > > > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > > > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > > > +     VPTESTN %VMM(4), %VMM(4), %k2
> > > > +     VPTESTN %VMM(6), %VMM(6), %k4
> > > > +     KORTEST %k2, %k4
> > > > +     jnz     L(loop_4x_done)
> > > > +
> > > > +     .p2align 4,, 11
> > > > +L(loop_4x_vec):
> > > > +
> > > > +     VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
> > > > +     VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > > > +     VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> > > > +     VMOVU   %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> > > > +
> > > > +     subq    $(VEC_SIZE * -4), %rsi
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > > > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > > > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > > > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > > > +
> > > > +
> > > > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > > > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > > > +     VPTESTN %VMM(4), %VMM(4), %k2
> > > > +     VPTESTN %VMM(6), %VMM(6), %k4
> > > > +     KORTEST %k2, %k4
> > > > +     jz      L(loop_4x_vec)
> > > > +
> > > > +L(loop_4x_done):
> > > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     /* Restore rdi (dst).  */
> > > > +     addq    %rsi, %rdi
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(ret_vec_x0_end)
> > > > +     VMOVU   %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> > > > +
> > > > +     KMOV    %k2, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(ret_vec_x1)
> > > > +     VMOVU   %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> > > > +
> > > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(ret_vec_x2)
> > > > +     VMOVU   %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> > > > +     /* Place L(ret_vec_x4) here to save code size.  We get a
> > > > +        meaningful benefit doing this for stpcpy.  */
> > > > +     KMOV    %k4, %VRDX
> > > > +L(ret_vec_x3):
> > > > +     bsf     %VRDX, %VRDX
> > > > +     VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > > > +     VMOVU   %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > >  # ifdef USE_AS_STPCPY
> > > > -     lea     2(%rdi), %rax
> > > > -# endif
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     sub     $3, %r8
> > > > -     lea     3(%rdi), %rdi
> > > > -     jnz     L(StrncpyFillTailWithZero)
> > > > +     leaq    (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
> > > >  # endif
> > > > +L(return_end):
> > > >       ret
> > > >
> > > > -     .p2align 4
> > > > -L(Exit3):
> > > > -     mov     (%rsi), %edx
> > > > -     mov     %edx, (%rdi)
> > > > +     .p2align 4,, 6
> > > > +L(ret_vec_x0_end):
> > > > +     bsf     %VRCX, %VRCX
> > > >  # ifdef USE_AS_STPCPY
> > > > -     lea     3(%rdi), %rax
> > > > -# endif
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     sub     $4, %r8
> > > > -     lea     4(%rdi), %rdi
> > > > -     jnz     L(StrncpyFillTailWithZero)
> > > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rax
> > > >  # endif
> > > > +     inc     %VRCX
> > > > +     VMOVU   (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > > +     VMOVU   %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > > >       ret
> > > >
> > > > -     .p2align 4
> > > > -L(Exit4_7):
> > > > -     mov     (%rsi), %ecx
> > > > -     mov     %ecx, (%rdi)
> > > > -     mov     -3(%rsi, %rdx), %ecx
> > > > -     mov     %ecx, -3(%rdi, %rdx)
> > > > +     .p2align 4,, 8
> > > > +L(ret_vec_x1):
> > > > +     bsf     %VRCX, %VRCX
> > > > +     VMOVU   (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > > +     VMOVU   %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > > >  # ifdef USE_AS_STPCPY
> > > > -     lea     (%rdi, %rdx), %rax
> > > > -# endif
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     sub     %rdx, %r8
> > > > -     sub     $1, %r8
> > > > -     lea     1(%rdi, %rdx), %rdi
> > > > -     jnz     L(StrncpyFillTailWithZero)
> > > > +     leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
> > > >  # endif
> > > >       ret
> > > >
> > > > -     .p2align 4
> > > > -L(Exit8_15):
> > > > -     mov     (%rsi), %rcx
> > > > -     mov     -7(%rsi, %rdx), %r9
> > > > -     mov     %rcx, (%rdi)
> > > > -     mov     %r9, -7(%rdi, %rdx)
> > > > +     .p2align 4,, 4
> > > > +L(ret_vec_x2):
> > > > +     bsf     %VRCX, %VRCX
> > > > +     VMOVU   ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > > +     VMOVU   %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > > >  # ifdef USE_AS_STPCPY
> > > > -     lea     (%rdi, %rdx), %rax
> > > > -# endif
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     sub     %rdx, %r8
> > > > -     sub     $1, %r8
> > > > -     lea     1(%rdi, %rdx), %rdi
> > > > -     jnz     L(StrncpyFillTailWithZero)
> > > > +     leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
> > > >  # endif
> > > >       ret
> > > >
> > > > -     .p2align 4
> > > > -L(Exit16_31):
> > > > -     VMOVU   (%rsi), %XMM2
> > > > -     VMOVU   -15(%rsi, %rdx), %XMM3
> > > > -     VMOVU   %XMM2, (%rdi)
> > > > -     VMOVU   %XMM3, -15(%rdi, %rdx)
> > > > +     /* ret_vec_x3 reuses return code after the loop.  */
> > > > +     .p2align 4,, 6
> > > > +L(ret_vec_x4):
> > > > +     bsf     %VRCX, %VRCX
> > > > +     VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > > +     VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > > >  # ifdef USE_AS_STPCPY
> > > > -     lea     (%rdi, %rdx), %rax
> > > > -# endif
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     sub %rdx, %r8
> > > > -     sub $1, %r8
> > > > -     lea 1(%rdi, %rdx), %rdi
> > > > -     jnz L(StrncpyFillTailWithZero)
> > > > +     leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
> > > >  # endif
> > > >       ret
> > > >
> > > > -     .p2align 4
> > > > -L(Exit32_63):
> > > > -     VMOVU   (%rsi), %YMM2
> > > > -     VMOVU   -31(%rsi, %rdx), %YMM3
> > > > -     VMOVU   %YMM2, (%rdi)
> > > > -     VMOVU   %YMM3, -31(%rdi, %rdx)
> > > > -# ifdef USE_AS_STPCPY
> > > > -     lea     (%rdi, %rdx), %rax
> > > > +
> > > > +     .p2align 4,, 4
> > > > +L(page_cross):
> > > > +# ifndef USE_AS_STRCAT
> > > > +     vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> > > >  # endif
> > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > > > -     sub     %rdx, %r8
> > > > -     sub     $1, %r8
> > > > -     lea     1(%rdi, %rdx), %rdi
> > > > -     jnz     L(StrncpyFillTailWithZero)
> > > > +     movq    %rsi, %rcx
> > > > +     andq    $(VEC_SIZE * -1), %rcx
> > > > +
> > > > +     VPCMPEQ (%rcx), %VZERO, %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     andl    $(VEC_SIZE - 1), %PAGE_ALIGN_REG
> > > > +     shrl    $2, %PAGE_ALIGN_REG
> > > >  # endif
> > > > -     ret
> > > > +     shrx    %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
> > > >
> > > > -# ifdef USE_AS_STRNCPY
> > > > +# if USE_MOVSB_IN_PAGE_CROSS
> > > > +     /* Optimizing more aggressively for space as this is very cold
> > > > +        code. This saves 2x cache lines.  */
> > > >
> > > > -     .p2align 4
> > > > -L(StrncpyExit1):
> > > > -     movzbl  (%rsi), %edx
> > > > -     mov     %dl, (%rdi)
> > > > -#  ifdef USE_AS_STPCPY
> > > > -     lea     1(%rdi), %rax
> > > > -#  endif
> > > > -#  ifdef USE_AS_STRCAT
> > > > -     movb    $0, 1(%rdi)
> > > > +     /* This adds one to the later result, which gives the correct
> > > > +        copy bounds. NB: this can never zero out a non-zero RCX
> > > > +        because, to be in the page-cross case, rsi cannot be aligned
> > > > +        and we have already right-shifted rcx by the misalignment.  */
> > > > +     shl     %VRCX
> > > > +     jz      L(page_cross_continue)
> > > > +#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > > > +     movq    %rdi, %rax
> > > >  #  endif
> > > > -     ret
> > > > +     bsf     %VRCX, %VRCX
> > > > +     REP_MOVS
> > > >
> > > > -     .p2align 4
> > > > -L(StrncpyExit2):
> > > > -     movzwl  (%rsi), %edx
> > > > -     mov     %dx, (%rdi)
> > > >  #  ifdef USE_AS_STPCPY
> > > > -     lea     2(%rdi), %rax
> > > > -#  endif
> > > > -#  ifdef USE_AS_STRCAT
> > > > -     movb    $0, 2(%rdi)
> > > > +     leaq    -CHAR_SIZE(%rdi), %rax
> > > >  #  endif
> > > >       ret
> > > >
> > > > -     .p2align 4
> > > > -L(StrncpyExit3_4):
> > > > -     movzwl  (%rsi), %ecx
> > > > -     movzwl  -2(%rsi, %r8), %edx
> > > > -     mov     %cx, (%rdi)
> > > > -     mov     %dx, -2(%rdi, %r8)
> > > > -#  ifdef USE_AS_STPCPY
> > > > -     lea     (%rdi, %r8), %rax
> > > > -#  endif
> > > > -#  ifdef USE_AS_STRCAT
> > > > -     movb    $0, (%rdi, %r8)
> > > > -#  endif
> > > > -     ret
> > > >
> > > > -     .p2align 4
> > > > -L(StrncpyExit5_8):
> > > > -     mov     (%rsi), %ecx
> > > > -     mov     -4(%rsi, %r8), %edx
> > > > -     mov     %ecx, (%rdi)
> > > > -     mov     %edx, -4(%rdi, %r8)
> > > > -#  ifdef USE_AS_STPCPY
> > > > -     lea     (%rdi, %r8), %rax
> > > > -#  endif
> > > > -#  ifdef USE_AS_STRCAT
> > > > -     movb    $0, (%rdi, %r8)
> > > > -#  endif
> > > > -     ret
> > > > +# else
> > > > +     /* Check if we found zero-char before end of page.  */
> > > > +     test    %VRCX, %VRCX
> > > > +     jz      L(page_cross_continue)
> > > >
> > > > -     .p2align 4
> > > > -L(StrncpyExit9_16):
> > > > -     mov     (%rsi), %rcx
> > > > -     mov     -8(%rsi, %r8), %rdx
> > > > -     mov     %rcx, (%rdi)
> > > > -     mov     %rdx, -8(%rdi, %r8)
> > > > -#  ifdef USE_AS_STPCPY
> > > > -     lea     (%rdi, %r8), %rax
> > > > -#  endif
> > > > -#  ifdef USE_AS_STRCAT
> > > > -     movb    $0, (%rdi, %r8)
> > > > -#  endif
> > > > -     ret
> > > > +     /* Traditional copy case, essentially the same as the non-page-
> > > > +        cross case, but since we can't reuse VMM(0) we need twice as
> > > > +        many loads from rsi.  */
> > > >
> > > > -     .p2align 4
> > > > -L(StrncpyExit17_32):
> > > > -     VMOVU   (%rsi), %XMM2
> > > > -     VMOVU   -16(%rsi, %r8), %XMM3
> > > > -     VMOVU   %XMM2, (%rdi)
> > > > -     VMOVU   %XMM3, -16(%rdi, %r8)
> > > > -#  ifdef USE_AS_STPCPY
> > > > -     lea     (%rdi, %r8), %rax
> > > > -#  endif
> > > > -#  ifdef USE_AS_STRCAT
> > > > -     movb    $0, (%rdi, %r8)
> > > > +#  ifndef USE_AS_STRCAT
> > > > +     xorl    %edx, %edx
> > > >  #  endif
> > > > -     ret
> > > > -
> > > > -     .p2align 4
> > > > -L(StrncpyExit33_64):
> > > > -     /*  0/32, 31/16 */
> > > > -     VMOVU   (%rsi), %YMM2
> > > > -     VMOVU   -VEC_SIZE(%rsi, %r8), %YMM3
> > > > -     VMOVU   %YMM2, (%rdi)
> > > > -     VMOVU   %YMM3, -VEC_SIZE(%rdi, %r8)
> > > > +     /* Dependency on rdi must already have been satisfied.  */
> > > > +     bsf     %VRCX, %VRDX
> > > >  #  ifdef USE_AS_STPCPY
> > > > -     lea     (%rdi, %r8), %rax
> > > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > > +#  elif !defined USE_AS_STRCAT
> > > > +     movq    %rdi, %rax
> > > >  #  endif
> > > > -#  ifdef USE_AS_STRCAT
> > > > -     movb    $0, (%rdi, %r8)
> > > > -#  endif
> > > > -     ret
> > > >
> > > > -     .p2align 4
> > > > -L(StrncpyExit65):
> > > > -     /* 0/32, 32/32, 64/1 */
> > > > -     VMOVU   (%rsi), %YMM2
> > > > -     VMOVU   32(%rsi), %YMM3
> > > > -     mov     64(%rsi), %cl
> > > > -     VMOVU   %YMM2, (%rdi)
> > > > -     VMOVU   %YMM3, 32(%rdi)
> > > > -     mov     %cl, 64(%rdi)
> > > > -#  ifdef USE_AS_STPCPY
> > > > -     lea     65(%rdi), %rax
> > > > -#  endif
> > > > -#  ifdef USE_AS_STRCAT
> > > > -     movb    $0, 65(%rdi)
> > > > +#  if VEC_SIZE == 64
> > > > +#   ifdef USE_AS_WCSCPY
> > > > +     testb   %cl, %cl
> > > > +#   else
> > > > +     test    %ecx, %ecx
> > > > +#   endif
> > > > +     jz      L(page_cross_copy_32_63)
> > > >  #  endif
> > > > -     ret
> > > > -
> > > > -#  ifndef USE_AS_STRCAT
> > > >
> > > > -     .p2align 4
> > > > -L(Fill1):
> > > > -     mov     %dl, (%rdi)
> > > > -     ret
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     testb   $0xf, %cl
> > > > +#  else
> > > > +     testw   %cx, %cx
> > > > +#  endif
> > > > +     jz      L(page_cross_copy_16_31)
> > > >
> > > > -     .p2align 4
> > > > -L(Fill2):
> > > > -     mov     %dx, (%rdi)
> > > > -     ret
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     testb   $0x3, %cl
> > > > +#  else
> > > > +     testb   %cl, %cl
> > > > +#  endif
> > > > +     jz      L(page_cross_copy_8_15)
> > > >
> > > > -     .p2align 4
> > > > -L(Fill3_4):
> > > > -     mov     %dx, (%rdi)
> > > > -     mov     %dx, -2(%rdi, %r8)
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     movl    (%rsi), %esi
> > > > +     movl    %esi, (%rdi)
> > > > +     movl    $0, (%END_REG)
> > > >       ret
> > > > +#  else
> > > >
> > > > -     .p2align 4
> > > > -L(Fill5_8):
> > > > -     mov     %edx, (%rdi)
> > > > -     mov     %edx, -4(%rdi, %r8)
> > > > -     ret
> > > > +     testb   $0x7, %cl
> > > > +     jz      L(page_cross_copy_4_7)
> > > >
> > > > -     .p2align 4
> > > > -L(Fill9_16):
> > > > -     mov     %rdx, (%rdi)
> > > > -     mov     %rdx, -8(%rdi, %r8)
> > > > +     test    %edx, %edx
> > > > +     jz      L(page_cross_set_null_term)
> > > > +     movzwl  (%rsi), %ecx
> > > > +     movw    %cx, (%rdi)
> > > > +L(page_cross_set_null_term):
> > > > +     movb    $0, (%END_REG)
> > > >       ret
> > > >
> > > > -     .p2align 4
> > > > -L(Fill17_32):
> > > > -     VMOVU   %XMMZERO, (%rdi)
> > > > -     VMOVU   %XMMZERO, -16(%rdi, %r8)
> > > > -     ret
> > > >
> > > > -     .p2align 4
> > > > -L(CopyVecSizeUnalignedVec2):
> > > > -     VMOVU   %YMM2, (%rdi, %rcx)
> > > > -
> > > > -     .p2align 4
> > > > -L(CopyVecSizeVecExit):
> > > > -     bsf     %edx, %edx
> > > > -     add     $(VEC_SIZE - 1), %r8
> > > > -     add     %rcx, %rdi
> > > > -#   ifdef USE_AS_STPCPY
> > > > -     lea     (%rdi, %rdx), %rax
> > > > -#   endif
> > > > -     sub     %rdx, %r8
> > > > -     lea     1(%rdi, %rdx), %rdi
> > > > -
> > > > -     .p2align 4
> > > > -L(StrncpyFillTailWithZero):
> > > > -     xor     %edx, %edx
> > > > -     sub     $VEC_SIZE, %r8
> > > > -     jbe     L(StrncpyFillExit)
> > > > -
> > > > -     VMOVU   %YMMZERO, (%rdi)
> > > > -     add     $VEC_SIZE, %rdi
> > > > -
> > > > -     mov     %rdi, %rsi
> > > > -     and     $(VEC_SIZE - 1), %esi
> > > > -     sub     %rsi, %rdi
> > > > -     add     %rsi, %r8
> > > > -     sub     $(VEC_SIZE * 4), %r8
> > > > -     jb      L(StrncpyFillLessFourVecSize)
> > > > -
> > > > -L(StrncpyFillLoopVmovdqa):
> > > > -     VMOVA   %YMMZERO, (%rdi)
> > > > -     VMOVA   %YMMZERO, VEC_SIZE(%rdi)
> > > > -     VMOVA   %YMMZERO, (VEC_SIZE * 2)(%rdi)
> > > > -     VMOVA   %YMMZERO, (VEC_SIZE * 3)(%rdi)
> > > > -     add     $(VEC_SIZE * 4), %rdi
> > > > -     sub     $(VEC_SIZE * 4), %r8
> > > > -     jae     L(StrncpyFillLoopVmovdqa)
> > > > -
> > > > -L(StrncpyFillLessFourVecSize):
> > > > -     add     $(VEC_SIZE * 2), %r8
> > > > -     jl      L(StrncpyFillLessTwoVecSize)
> > > > -     VMOVA   %YMMZERO, (%rdi)
> > > > -     VMOVA   %YMMZERO, VEC_SIZE(%rdi)
> > > > -     add     $(VEC_SIZE * 2), %rdi
> > > > -     sub     $VEC_SIZE, %r8
> > > > -     jl      L(StrncpyFillExit)
> > > > -     VMOVA   %YMMZERO, (%rdi)
> > > > -     add     $VEC_SIZE, %rdi
> > > > -     jmp     L(Fill)
> > > > -
> > > > -     .p2align 4
> > > > -L(StrncpyFillLessTwoVecSize):
> > > > -     add     $VEC_SIZE, %r8
> > > > -     jl      L(StrncpyFillExit)
> > > > -     VMOVA   %YMMZERO, (%rdi)
> > > > -     add     $VEC_SIZE, %rdi
> > > > -     jmp     L(Fill)
> > > > -
> > > > -     .p2align 4
> > > > -L(StrncpyFillExit):
> > > > -     add     $VEC_SIZE, %r8
> > > > -L(Fill):
> > > > -     cmp     $17, %r8d
> > > > -     jae     L(Fill17_32)
> > > > -     cmp     $9, %r8d
> > > > -     jae     L(Fill9_16)
> > > > -     cmp     $5, %r8d
> > > > -     jae     L(Fill5_8)
> > > > -     cmp     $3, %r8d
> > > > -     jae     L(Fill3_4)
> > > > -     cmp     $1, %r8d
> > > > -     ja      L(Fill2)
> > > > -     je      L(Fill1)
> > > > +     .p2align 4,, 4
> > > > +L(page_cross_copy_4_7):
> > > > +     movl    (%rsi), %ecx
> > > > +     movl    -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
> > > > +     movl    %ecx, (%rdi)
> > > > +     movl    %esi, -(4 - CHAR_SIZE)(%END_REG)
> > > >       ret
> > > > -
> > > > -/* end of ifndef USE_AS_STRCAT */
> > > >  #  endif
> > > >
> > > > -     .p2align 4
> > > > -L(UnalignedLeaveCase2OrCase3):
> > > > -     test    %rdx, %rdx
> > > > -     jnz     L(UnalignedFourVecSizeLeaveCase2)
> > > > -L(UnalignedFourVecSizeLeaveCase3):
> > > > -     lea     (VEC_SIZE * 4)(%r8), %rcx
> > > > -     and     $-VEC_SIZE, %rcx
> > > > -     add     $(VEC_SIZE * 3), %r8
> > > > -     jl      L(CopyVecSizeCase3)
> > > > -     VMOVU   %YMM4, (%rdi)
> > > > -     sub     $VEC_SIZE, %r8
> > > > -     jb      L(CopyVecSizeCase3)
> > > > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > > > -     sub     $VEC_SIZE, %r8
> > > > -     jb      L(CopyVecSizeCase3)
> > > > -     VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > > > -     sub     $VEC_SIZE, %r8
> > > > -     jb      L(CopyVecSizeCase3)
> > > > -     VMOVU   %YMM7, (VEC_SIZE * 3)(%rdi)
> > > > -#  ifdef USE_AS_STPCPY
> > > > -     lea     (VEC_SIZE * 4)(%rdi), %rax
> > > > -#  endif
> > > > -#  ifdef USE_AS_STRCAT
> > > > -     movb    $0, (VEC_SIZE * 4)(%rdi)
> > > > -#  endif
> > > > +#  if VEC_SIZE == 64
> > > > +     .p2align 4,, 4
> > > > +L(page_cross_copy_32_63):
> > > > +     VMOVU   (%rsi), %VMM_256(0)
> > > > +     VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > > > +     VMOVU   %VMM_256(0), (%rdi)
> > > > +     VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
> > > >       ret
> > > > -
> > > > -     .p2align 4
> > > > -L(UnalignedFourVecSizeLeaveCase2):
> > > > -     xor     %ecx, %ecx
> > > > -     vpcmpb  $0, %YMM4, %YMMZERO, %k1
> > > > -     kmovd   %k1, %edx
> > > > -     add     $(VEC_SIZE * 3), %r8
> > > > -     jle     L(CopyVecSizeCase2OrCase3)
> > > > -     test    %edx, %edx
> > > > -#  ifndef USE_AS_STRCAT
> > > > -     jnz     L(CopyVecSizeUnalignedVec4)
> > > > -#  else
> > > > -     jnz     L(CopyVecSize)
> > > > -#  endif
> > > > -     vpcmpb  $0, %YMM5, %YMMZERO, %k2
> > > > -     kmovd   %k2, %edx
> > > > -     VMOVU   %YMM4, (%rdi)
> > > > -     add     $VEC_SIZE, %rcx
> > > > -     sub     $VEC_SIZE, %r8
> > > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > > -     test    %edx, %edx
> > > > -#  ifndef USE_AS_STRCAT
> > > > -     jnz     L(CopyVecSizeUnalignedVec5)
> > > > -#  else
> > > > -     jnz     L(CopyVecSize)
> > > >  #  endif
> > > >
> > > > -     vpcmpb  $0, %YMM6, %YMMZERO, %k3
> > > > -     kmovd   %k3, %edx
> > > > -     VMOVU   %YMM5, VEC_SIZE(%rdi)
> > > > -     add     $VEC_SIZE, %rcx
> > > > -     sub     $VEC_SIZE, %r8
> > > > -     jbe     L(CopyVecSizeCase2OrCase3)
> > > > -     test    %edx, %edx
> > > > -#  ifndef USE_AS_STRCAT
> > > > -     jnz     L(CopyVecSizeUnalignedVec6)
> > > > -#  else
> > > > -     jnz     L(CopyVecSize)
> > > > -#  endif
> > > > -
> > > > -     vpcmpb  $0, %YMM7, %YMMZERO, %k4
> > > > -     kmovd   %k4, %edx
> > > > -     VMOVU   %YMM6, (VEC_SIZE * 2)(%rdi)
> > > > -     lea     VEC_SIZE(%rdi, %rcx), %rdi
> > > > -     lea     VEC_SIZE(%rsi, %rcx), %rsi
> > > > -     bsf     %edx, %edx
> > > > -     cmp     %r8d, %edx
> > > > -     jb      L(CopyVecSizeExit)
> > > > -L(StrncpyExit):
> > > > -     cmp     $65, %r8d
> > > > -     je      L(StrncpyExit65)
> > > > -     cmp     $33, %r8d
> > > > -     jae     L(StrncpyExit33_64)
> > > > -     cmp     $17, %r8d
> > > > -     jae     L(StrncpyExit17_32)
> > > > -     cmp     $9, %r8d
> > > > -     jae     L(StrncpyExit9_16)
> > > > -     cmp     $5, %r8d
> > > > -     jae     L(StrncpyExit5_8)
> > > > -     cmp     $3, %r8d
> > > > -     jae     L(StrncpyExit3_4)
> > > > -     cmp     $1, %r8d
> > > > -     ja      L(StrncpyExit2)
> > > > -     je      L(StrncpyExit1)
> > > > -#  ifdef USE_AS_STPCPY
> > > > -     mov     %rdi, %rax
> > > > -#  endif
> > > > -#  ifdef USE_AS_STRCAT
> > > > -     movb    $0, (%rdi)
> > > > -#  endif
> > > > +     .p2align 4,, 4
> > > > +L(page_cross_copy_16_31):
> > > > +     vmovdqu (%rsi), %xmm0
> > > > +     vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > > > +     vmovdqu %xmm0, (%rdi)
> > > > +     vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> > > >       ret
> > > >
> > > > -     .p2align 4
> > > > -L(ExitZero):
> > > > -#  ifndef USE_AS_STRCAT
> > > > -     mov     %rdi, %rax
> > > > -#  endif
> > > > +     .p2align 4,, 4
> > > > +L(page_cross_copy_8_15):
> > > > +     movq    (%rsi), %rcx
> > > > +     movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> > > > +     movq    %rcx, (%rdi)
> > > > +     movq    %rsi, -(8 - CHAR_SIZE)(%END_REG)
> > > >       ret
> > > > -
> > > > -# endif
> > > > -
> > > > -# ifndef USE_AS_STRCAT
> > > > -END (STRCPY)
> > > > -# else
> > > > -END (STRCAT)
> > > >  # endif
> > > > +END(STRCPY)
> > > >  #endif
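A note on the L(page_cross) paths above: the fast path loads a full VEC_SIZE
from the source, so a source pointer close to the end of a page could make
that load fault.  The trick used here is to load instead from the
VEC_SIZE-aligned address at or below the source (an aligned vector load can
never cross a page boundary) and then shift the zero-CHAR mask right by the
misalignment so that bit 0 again corresponds to the first real source byte.
A rough scalar sketch of that masking step, assuming a 32-byte vector; the C
names below are illustrative only and not from the patch:

#include <stddef.h>
#include <stdint.h>

#define VEC_SIZE 32   /* Illustrative; the patch uses 32 or 64.  */

/* After the shift, bit i is set iff src[i] is '\0', for the bytes of src
   that share its first aligned vector.  The aligned over-read is only safe
   in the asm sense (it cannot cross a page); as plain C this is
   illustration only.  */
static inline uint32_t
zero_mask_from_aligned_load (const char *src)
{
  const char *base
    = (const char *) ((uintptr_t) src & ~(uintptr_t) (VEC_SIZE - 1));
  uint32_t mask = 0;
  for (size_t i = 0; i < VEC_SIZE; i++)
    if (base[i] == '\0')
      mask |= (uint32_t) 1 << i;
  /* Drop the bits for bytes that sit before src, mirroring the
     shrx-by-misalignment in the asm above.  */
  return mask >> ((uintptr_t) src & (VEC_SIZE - 1));
}
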
> > > > diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
> > > > index 203a19bf21..d648ba5cfe 100644
> > > > --- a/sysdeps/x86_64/multiarch/strncat-evex.S
> > > > +++ b/sysdeps/x86_64/multiarch/strncat-evex.S
> > > > @@ -1,7 +1,520 @@
> > > > -#ifndef STRNCAT
> > > > -# define STRNCAT     __strncat_evex
> > > > -#endif
> > > > +/* {wcs|str}ncat  with 256/512-bit EVEX.
> > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#include <isa-level.h>
> > > > +
> > > > +#if ISA_SHOULD_BUILD (4)
> > > > +
> > > > +     /* Use evex-masked stores for small sizes. Turned off at the
> > > > +        moment.  */
> > > > +# define USE_EVEX_MASKED_STORE       0
> > > > +
> > > > +# include <sysdep.h>
> > > > +
> > > > +# ifndef VEC_SIZE
> > > > +#  include "x86-evex256-vecs.h"
> > > > +# endif
> > > > +
> > > > +# ifndef STRNCAT
> > > > +#  define STRNCAT    __strncat_evex
> > > > +# endif
> > > > +
> > > > +
> > > > +# ifdef USE_AS_WCSCPY
> > > > +#  define movNULL    movl
> > > > +#  define VMOVU_MASK vmovdqu32
> > > > +#  define VPMIN      vpminud
> > > > +#  define VPTESTN    vptestnmd
> > > > +#  define VPTEST     vptestmd
> > > > +#  define VPCMPEQ    vpcmpeqd
> > > > +#  define CHAR_SIZE  4
> > > > +
> > > > +#  define REP_MOVS   rep movsd
> > > > +
> > > > +#  define VMASK_REG  VR10
> > > > +#  define FIND_FIRST_ONE(src, dst)   movl $CHAR_PER_VEC, %dst; bsf %src, %dst
> > > > +
> > > > +#  define USE_WIDE_CHAR
> > > > +# else
> > > > +#  define movNULL    movb
> > > > +#  define VMOVU_MASK vmovdqu8
> > > > +#  define VPMIN      vpminub
> > > > +#  define VPTESTN    vptestnmb
> > > > +#  define VPTEST     vptestmb
> > > > +#  define VPCMPEQ    vpcmpeqb
> > > > +#  define CHAR_SIZE  1
> > > > +
> > > > +#  define REP_MOVS   rep movsb
> > > > +
> > > > +#  define VMASK_REG  VRCX
> > > > +#  define FIND_FIRST_ONE(src, dst)   tzcnt %src, %dst
> > > > +
> > > > +# endif
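In C terms, FIND_FIRST_ONE above is meant to yield the index of the first
zero-CHAR in the mask, or CHAR_PER_VEC when the mask is empty.  The byte
variant gets that for free, since tzcnt of an all-zero N-bit register is
N == CHAR_PER_VEC; the wide-char mask has fewer significant bits than the
register, so that variant pre-loads CHAR_PER_VEC and relies on bsf leaving
its destination unchanged for a zero source.  A minimal model of the
computed value (illustrative only, with __builtin_ctz standing in for
tzcnt/bsf):

/* What FIND_FIRST_ONE computes, not the macro itself.  */
static inline unsigned int
find_first_one (unsigned int mask, unsigned int char_per_vec)
{
  if (mask == 0)
    return char_per_vec;   /* No zero-CHAR in this vector.  */
  return (unsigned int) __builtin_ctz (mask);
}
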
> > > > +
> > > > +# include "strncpy-or-cat-overflow-def.h"
> > > > +
> > > > +# include "reg-macros.h"
> > > > +
> > > > +
> > > > +# define VZERO       VMM(7)
> > > > +# define VZERO_128   VMM_128(7)
> > > > +
> > > > +# define PAGE_SIZE   4096
> > > > +# define CHAR_PER_VEC        (VEC_SIZE / CHAR_SIZE)
> > > > +
> > > > +     .section SECTION(.text), "ax", @progbits
> > > > +ENTRY(STRNCAT)
> > > > +     movq    %rdi, %rax
> > > > +
> > > > +     /* NB: It's safe to filter out zero-length strings WITHOUT
> > > > +        setting null-term. Destination MUST be a null-terminated
> > > > +        string so essentially the work is already done.  */
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     leaq    -1(%rdx), %rcx
> > > > +     shrq    $56, %rcx
> > > > +     jnz     L(zero_len)
> > > > +# else
> > > > +     test    %rdx, %rdx
> > > > +     jle     L(zero_len)
> > > > +# endif
> > > > +
> > > > +# include "strcat-strlen-evex.S"
> > > > +
> > > > +     movl    %esi, %ecx
> > > > +     andl    $(PAGE_SIZE - 1), %ecx
> > > > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > > > +     ja      L(page_cross)
> > > > +L(page_cross_continue):
> > > > +     VMOVU   (%rsi), %VMM(0)
> > > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > > +
> > > > +     /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> > > > +        <= CHAR_PER_VEC with masked instructions (which have
> > > > +        potential for dramatically bad perf if dst splits a page and
> > > > +        is not in the TLB).  */
> > > > +# if USE_EVEX_MASKED_STORE
> > > > +     KMOV    %k0, %VRCX
> > > > +     FIND_FIRST_ONE (VRCX, VR8)
> > > > +     cmpq    %r8, %rdx
> > > > +     jbe     L(less_1x_vec)
> > > > +
> > > > +     test    %VRCX, %VRCX
> > > > +     jz      L(more_1x_vec)
> > > > +
> > > > +     blsmsk  %VRCX, %VRCX
> > > > +     KMOV    %VRCX, %k1
> > > > +     VMOVU_MASK %VMM(0), (%rdi){%k1}
> > > > +     ret
> > > > +
> > > > +L(less_1x_vec):
> > > > +     mov     $-1, %VRCX
> > > > +     bzhi    %VRDX, %VRCX, %VRCX
> > > > +     KMOV    %VRCX, %k1
> > > > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > > > +     VMOVU_MASK %VMM(0), (%rdi){%k1}
> > > > +
> > > > +     ret
> > > > +# else
> > > > +     KMOV    %k0, %VMASK_REG
> > > > +     /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
> > > > +        %VMASK_REG, %VRCX` for wcsncat.  */
> > > > +     FIND_FIRST_ONE (VMASK_REG, VRCX)
> > > > +     cmpq    %rcx, %rdx
> > > > +     jbe     L(less_1x_vec)
> > > > +
> > > > +     /* If there were no zero-CHARs (rcx was zero before
> > > > +        FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> > > > +     cmpl    $CHAR_PER_VEC, %ecx
> > > > +     je      L(more_1x_vec)
> > > > +
> > > > +     movl    %ecx, %edx
> > > > +
> > > > +L(less_1x_vec):
> > > > +#  if VEC_SIZE == 64
> > > > +     cmpl    $(32 / CHAR_SIZE), %edx
> > > > +     jae     L(copy_32_63)
> > > > +#  endif
> > > > +
> > > > +     cmpl    $(16 / CHAR_SIZE), %edx
> > > > +     jae     L(copy_16_31)
> > > > +
> > > > +
> > > > +     cmpl    $(8 / CHAR_SIZE), %edx
> > > > +     jae     L(copy_8_15)
> > > > +
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     vmovd   %VMM_128(0), (%rdi)
> > > > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > > > +     ret
> > > > +#  else
> > > > +
> > > > +     cmpl    $4, %edx
> > > > +     jae     L(copy_4_7)
> > > > +
> > > > +     movzbl  (%rsi), %ecx
> > > > +     cmpl    $1, %edx
> > > > +     jbe     L(set_null_term)
> > > > +
> > > > +     movzwl  1(%rsi), %esi
> > > > +     movw    %si, 1(%rdi)
> > > > +
> > > > +     .p2align 4,, 1
> > > > +L(set_null_term):
> > > > +     movb    %cl, (%rdi)
> > > > +     movNULL $0, (%rdi, %rdx)
> > > > +     ret
> > > > +#  endif
> > > > +
> > > > +#  if VEC_SIZE == 64
> > > > +     .p2align 4,, 6
> > > > +L(copy_32_63):
> > > > +     VMOVU   -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > > > +     VMOVU   %VMM_256(0), (%rdi)
> > > > +     VMOVU   %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
> > > > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > > > +     ret
> > > > +#  endif
> > > > +     .p2align 4,, 6
> > > > +L(copy_16_31):
> > > > +     /* Use xmm1 explicitly here as it won't require a `vzeroupper`
> > > > +        and will save code size.  */
> > > > +     vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > > > +     VMOVU   %VMM_128(0), (%rdi)
> > > > +     vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
> > > > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > > > +     ret
> > > > +
> > > > +     .p2align 4,, 2
> > > > +L(copy_8_15):
> > > > +     movq    -(8)(%rsi, %rdx, CHAR_SIZE), %rcx
> > > > +     vmovq   %VMM_128(0), (%rdi)
> > > > +     movq    %rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
> > > > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > > > +     ret
> > > > +
> > > > +#  ifndef USE_AS_WCSCPY
> > > > +     .p2align 4,, 12
> > > > +L(copy_4_7):
> > > > +     movl    -(4)(%rsi, %rdx, CHAR_SIZE), %ecx
> > > > +     vmovd   %VMM_128(0), (%rdi)
> > > > +     movl    %ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
> > > > +     movNULL $0, (%rdi, %rdx, CHAR_SIZE)
> > > > +     ret
> > > > +#  endif
> > > > +
> > > > +# endif
> > > > +     .p2align 4,, 4
> > > > +L(zero_len):
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     test    %rdx, %rdx
> > > > +# endif
> > > > +     jne     OVERFLOW_STRCAT
> > > > +     ret
> > > >
> > > > -#define USE_AS_STRNCAT
> > > > -#define STRCAT       STRNCAT
> > > > -#include "strcat-evex.S"
> > > > +     .p2align 4,, 8
> > > > +L(more_1x_vec):
> > > > +     VMOVU   %VMM(0), (%rdi)
> > > > +
> > > > +     /* We are going to align rsi here so we will need to be able to
> > > > +        re-adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> > > > +        so rsi + rdx * CHAR_SIZE cannot overflow.  */
> > > > +
> > > > +     leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> > > > +     subq    %rsi, %rdi
> > > > +     andq    $-(VEC_SIZE), %rsi
> > > > +L(loop_last_4x_vec):
> > > > +     addq    %rsi, %rdi
> > > > +     subq    %rsi, %rdx
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     shrq    $2, %rdx
> > > > +# endif
> > > > +
> > > > +     /* Will need this regardless.  */
> > > > +     VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> > > > +     VPTESTN %VMM(1), %VMM(1), %k0
> > > > +     KMOV    %k0, %VMASK_REG
> > > > +
> > > > +     cmpq    $(CHAR_PER_VEC * 2), %rdx
> > > > +     ja      L(more_2x_vec)
> > > > +
> > > > +L(last_2x_vec):
> > > > +     FIND_FIRST_ONE (VMASK_REG, VRCX)
> > > > +     cmpl    %ecx, %edx
> > > > +     jbe     L(ret_vec_x1_len)
> > > > +
> > > > +     /* If there were no zero-CHARs (rcx was zero before
> > > > +        FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> > > > +     cmpl    $CHAR_PER_VEC, %ecx
> > > > +     jne     L(ret_vec_x1)
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > > > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     addl    $-CHAR_PER_VEC, %edx
> > > > +     bzhi    %VRDX, %VRCX, %VR8
> > > > +     jz      L(ret_vec_x2_len)
> > > > +L(ret_vec_x2):
> > > > +     bsf     %VRCX, %VRDX
> > > > +L(ret_vec_x2_len):
> > > > +     VMOVU   (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > > > +     movNULL $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> > > > +     VMOVU   %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > > +     ret
> > > > +
> > > > +     .p2align 4,, 4
> > > > +L(ret_vec_x1_len):
> > > > +     movl    %edx, %ecx
> > > > +L(ret_vec_x1):
> > > > +     VMOVU   (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > > +     movNULL $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
> > > > +     VMOVU   %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > > > +     VZEROUPPER_RETURN
> > > > +
> > > > +
> > > > +     .p2align 4,, 8
> > > > +L(last_4x_vec):
> > > > +     addl    $-(CHAR_PER_VEC * 4), %edx
> > > > +     VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
> > > > +     VPTESTN %VMM(1), %VMM(1), %k0
> > > > +     KMOV    %k0, %VMASK_REG
> > > > +     subq    $-(VEC_SIZE * 4), %rsi
> > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > +     cmpl    $(CHAR_PER_VEC * 2), %edx
> > > > +     jbe     L(last_2x_vec)
> > > > +     .p2align 4,, 8
> > > > +L(more_2x_vec):
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     xorl    %ecx, %ecx
> > > > +# endif
> > > > +     bsf     %VMASK_REG, %VRCX
> > > > +     jnz     L(ret_vec_x1)
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > > > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(ret_vec_x2)
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> > > > +     VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > > > +     VPTESTN %VMM(3), %VMM(3), %k0
> > > > +     KMOV    %k0, %VMASK_REG
> > > > +
> > > > +     cmpq    $(CHAR_PER_VEC * 4), %rdx
> > > > +     ja      L(more_4x_vec)
> > > > +
> > > > +     /* Adjust length before going to L(ret_vec_x3_len) or
> > > > +        L(ret_vec_x3).  */
> > > > +     addl    $(CHAR_PER_VEC * -2), %edx
> > > > +
> > > > +     FIND_FIRST_ONE (VMASK_REG, VRCX)
> > > > +     cmpl    %ecx, %edx
> > > > +     jbe     L(ret_vec_x3_len)
> > > > +
> > > > +     /* If there were no zero-CHARs (rcx was zero before
> > > > +        FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> > > > +     cmpl    $CHAR_PER_VEC, %ecx
> > > > +     jne     L(ret_vec_x3)
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > > > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > > > +     VPTESTN %VMM(4), %VMM(4), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     addl    $-CHAR_PER_VEC, %edx
> > > > +     bzhi    %VRDX, %VRCX, %VR8
> > > > +     jz      L(ret_vec_x4_len)
> > > > +L(ret_vec_x4):
> > > > +     bsf     %VRCX, %VRDX
> > > > +L(ret_vec_x4_len):
> > > > +     VMOVU   (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > > > +     movNULL $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
> > > > +     VMOVU   %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > > +     ret
> > > > +
> > > > +     .p2align 4,, 4
> > > > +L(ret_vec_x3_len):
> > > > +     movl    %edx, %ecx
> > > > +L(ret_vec_x3):
> > > > +     VMOVU   (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > > +     movNULL $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> > > > +     VMOVU   %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > > > +     ret
> > > > +
> > > > +     .p2align 4,, 8
> > > > +L(more_4x_vec):
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     xorl    %ecx, %ecx
> > > > +# endif
> > > > +     bsf     %VMASK_REG, %VRCX
> > > > +     jnz     L(ret_vec_x3)
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > > > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > > > +     VPTESTN %VMM(4), %VMM(4), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(ret_vec_x4)
> > > > +
> > > > +     VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> > > > +
> > > > +     /* Check if we are near the end before aligning.  */
> > > > +     cmpq    $(CHAR_PER_VEC * 8), %rdx
> > > > +     jbe     L(last_4x_vec)
> > > > +
> > > > +
> > > > +     /* Add rsi to rdx (length) before aligning rsi. NB: Since we
> > > > +        filtered out huge lengths this cannot overflow.  */
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
> > > > +# else
> > > > +     addq    %rsi, %rdx
> > > > +# endif
> > > > +
> > > > +     /* Subtract rsi from rdi before aligning (add back will have
> > > > +        correct rdi for aligned rsi).  */
> > > > +     subq    %rsi, %rdi
> > > > +     subq    $-(VEC_SIZE * 5), %rsi
> > > > +     andq    $(VEC_SIZE * -4), %rsi
> > > > +
> > > > +     /* Load first half of the loop before entry.  */
> > > > +     VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > > > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > > > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > > > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > > > +
> > > > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > > > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > > > +     VPTESTN %VMM(4), %VMM(4), %k2
> > > > +     VPTESTN %VMM(6), %VMM(6), %k4
> > > > +
> > > > +     /* Offset rsi by VEC_SIZE so that we can jump to
> > > > +        L(loop_last_4x_vec).  */
> > > > +     addq    $-(VEC_SIZE), %rsi
> > > > +     KORTEST %k2, %k4
> > > > +     jnz     L(loop_4x_done)
> > > > +
> > > > +     /* Store loop end in r9.  */
> > > > +     leaq    -(VEC_SIZE * 5)(%rdx), %r9
> > > > +
> > > > +     .p2align 4,, 11
> > > > +L(loop_4x_vec):
> > > > +     VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > > > +     VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> > > > +     VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> > > > +     VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> > > > +
> > > > +     subq    $(VEC_SIZE * -4), %rsi
> > > > +     cmpq    %rsi, %r9
> > > > +     jbe     L(loop_last_4x_vec)
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> > > > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> > > > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> > > > +     VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> > > > +
> > > > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > > > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > > > +     VPTESTN %VMM(4), %VMM(4), %k2
> > > > +     VPTESTN %VMM(6), %VMM(6), %k4
> > > > +     KORTEST %k2, %k4
> > > > +     jz      L(loop_4x_vec)
> > > > +
> > > > +L(loop_4x_done):
> > > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     /* Restore rdi (dst).  */
> > > > +     addq    %rsi, %rdi
> > > > +
> > > > +     /* L(ret_vec_x1) expects rcx to hold the position of the
> > > > +        zero-CHAR, so test with bsf.  */
> > > > +     bsf     %VRCX, %VRCX
> > > > +     jnz     L(ret_vec_x1)
> > > > +     VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
> > > > +
> > > > +     KMOV    %k2, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(ret_vec_x2)
> > > > +     VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> > > > +
> > > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     bsf     %VRCX, %VRCX
> > > > +     jnz     L(ret_vec_x3)
> > > > +     VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> > > > +
> > > > +     KMOV    %k4, %VRCX
> > > > +     bsf     %VRCX, %VRCX
> > > > +     VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> > > > +     VMOVU   %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> > > > +     ret
> > > > +
> > > > +
> > > > +     .p2align 4,, 4
> > > > +L(page_cross):
> > > > +     movq    %rsi, %r8
> > > > +     andq    $(VEC_SIZE * -1), %r8
> > > > +     VPCMPEQ (%r8), %VZERO, %k0
> > > > +
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     KMOV    %k0, %VR9
> > > > +     shrl    $2, %ecx
> > > > +     andl    $(CHAR_PER_VEC - 1), %ecx
> > > > +     shrx    %VRCX, %VR9, %VRCX
> > > > +# else
> > > > +     KMOV    %k0, %VRCX
> > > > +     shrx    %VRSI, %VRCX, %VRCX
> > > > +# endif
> > > > +
> > > > +     subl    %esi, %r8d
> > > > +     andl    $(VEC_SIZE - 1), %r8d
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     shrl    $2, %r8d
> > > > +# endif
> > > > +     cmpq    %r8, %rdx
> > > > +     jbe     L(page_cross_small)
> > > > +     /* Optimizing more for space as this is very cold code. This
> > > > +        saves 2x cache lines.  */
> > > > +
> > > > +     /* This adds one to the later result, which gives the correct
> > > > +        copy bounds. NB: this can never zero out a non-zero RCX
> > > > +        because, to be in the page-cross case, rsi cannot be aligned
> > > > +        and we have already right-shifted rcx by the misalignment.  */
> > > > +     shl     %VRCX
> > > > +     jz      L(page_cross_continue)
> > > > +     bsf     %VRCX, %VRCX
> > > > +     REP_MOVS
> > > > +     ret
> > > > +
> > > > +L(page_cross_small):
> > > > +     tzcnt   %VRCX, %VRCX
> > > > +     jz      L(page_cross_setz)
> > > > +     cmpl    %edx, %ecx
> > > > +     cmova   %edx, %ecx
> > > > +
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     rep     movsd
> > > > +# else
> > > > +     rep     movsb
> > > > +# endif
> > > > +L(page_cross_setz):
> > > > +     movNULL $0, (%rdi)
> > > > +     ret
> > > > +END(STRNCAT)
> > > > +#endif
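For reference, the contract this strncat has to preserve can be written in a
few lines of C.  This is only a restatement of the semantics (append at most
n source chars, then terminate), not the patch's algorithm, but it shows why
the zero-length early exit above may skip storing a null byte: the
destination is required to be null-terminated already, so the terminator it
would write is the one that is already there.  (The included
strcat-strlen-evex.S covers the "find the end of dst" step.)

#include <string.h>

/* Plain C restatement of the strncat contract; illustration only.  */
char *
strncat_ref (char *dst, const char *src, size_t n)
{
  char *end = dst + strlen (dst);   /* dst must already be terminated.  */
  size_t i;
  for (i = 0; i < n && src[i] != '\0'; i++)
    end[i] = src[i];
  end[i] = '\0';   /* With n == 0 this merely rewrites the existing
                      terminator, hence the early exit can skip it.  */
  return dst;
}
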
> > > > diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
> > > > index 1b3426d511..49eaf4cbd9 100644
> > > > --- a/sysdeps/x86_64/multiarch/strncpy-evex.S
> > > > +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
> > > > @@ -1,7 +1,990 @@
> > > > -#ifndef STRNCPY
> > > > -# define STRNCPY     __strncpy_evex
> > > > -#endif
> > > > +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
> > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#include <isa-level.h>
> > > > +
> > > > +#if ISA_SHOULD_BUILD (4)
> > > > +
> > > > +     /* Use evex-masked stores for small sizes. Turned off at the
> > > > +        moment.  */
> > > > +# define USE_EVEX_MASKED_STORE       0
> > > > +
> > > > +
> > > > +# include <sysdep.h>
> > > > +# ifndef VEC_SIZE
> > > > +#  include "x86-evex256-vecs.h"
> > > > +# endif
> > > > +
> > > > +
> > > > +# ifndef STRNCPY
> > > > +#  define STRNCPY    __strncpy_evex
> > > > +# endif
> > > > +
> > > > +# ifdef USE_AS_WCSCPY
> > > > +#  define VMOVU_MASK vmovdqu32
> > > > +#  define VPCMPEQ    vpcmpeqd
> > > > +#  define VPMIN      vpminud
> > > > +#  define VPTESTN    vptestnmd
> > > > +#  define VPTEST     vptestmd
> > > > +#  define CHAR_SIZE  4
> > > > +
> > > > +#  define REP_MOVS   rep movsd
> > > > +#  define REP_STOS   rep stosl
> > > > +
> > > > +#  define USE_WIDE_CHAR
> > > > +
> > > > +# else
> > > > +#  define VMOVU_MASK vmovdqu8
> > > > +#  define VPCMPEQ    vpcmpeqb
> > > > +#  define VPMIN      vpminub
> > > > +#  define VPTESTN    vptestnmb
> > > > +#  define VPTEST     vptestmb
> > > > +#  define CHAR_SIZE  1
> > > > +
> > > > +#  define REP_MOVS   rep movsb
> > > > +#  define REP_STOS   rep stosb
> > > > +# endif
> > > > +
> > > > +# include "strncpy-or-cat-overflow-def.h"
> > > > +
> > > > +# define PAGE_SIZE   4096
> > > > +# define CHAR_PER_VEC        (VEC_SIZE / CHAR_SIZE)
> > > > +
> > > > +# include "reg-macros.h"
> > > > +
> > > > +
> > > > +# define VZERO       VMM(7)
> > > > +# define VZERO_256   VMM_256(7)
> > > > +# define VZERO_128   VMM_128(7)
> > > > +
> > > > +# if VEC_SIZE == 64
> > > > +#  define VZERO_HALF VZERO_256
> > > > +# else
> > > > +#  define VZERO_HALF VZERO_128
> > > > +# endif
> > > > +
> > > > +     .section SECTION(.text), "ax", @progbits
> > > > +ENTRY(STRNCPY)
> > > > +     /* Filter zero-length strings and very long strings.  Zero-length
> > > > +        strings just return; very long strings are handled by just
> > > > +        running rep stos{b|l} to zero-fill the destination (which will
> > > > +        almost certainly segfault), and if that succeeds then calling
> > > > +        OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     decq    %rdx
> > > > +     movq    %rdx, %rax
> > > > +     /* 56 is end of max supported address space.  */
> > > > +     shr     $56, %rax
> > > > +     jnz     L(zero_len)
> > > > +# else
> > > > +     decq    %rdx
> > > > +     /* If the branch ever needs to become `jb`, replace `dec` with
> > > > +        `sub` (dec does not set CF).  */
> > > > +     jl      L(zero_len)
> > > > +# endif
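The length filter at entry can be restated in C roughly as follows
(illustrative only, names hypothetical).  Both variants decrement the length
first; the byte variant then treats it as signed, so a passed-in length of
zero, or anything with the top bit set, takes the L(zero_len) path, while
the wide-char variant also sends there any count whose bits above 56 are set
after the decrement, since such a count cannot describe a real buffer in the
supported address space:

#include <stddef.h>
#include <stdint.h>

/* Nonzero means "take the L(zero_len) path".  */
static inline int
strncpy_len_is_special (size_t len)
{
  return (int64_t) len <= 0;   /* Matches `dec %rdx; jl'.  */
}

static inline int
wcsncpy_len_is_special (size_t len)
{
  /* len == 0 wraps to SIZE_MAX, so it is caught here as well.  */
  return ((len - 1) >> 56) != 0;   /* Matches `dec; shr $56; jnz'.  */
}
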
> > > > +
> > > > +     vpxorq  %VZERO_128, %VZERO_128, %VZERO_128
> > > > +     movl    %esi, %eax
> > > > +     andl    $(PAGE_SIZE - 1), %eax
> > > > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > > +     ja      L(page_cross)
> > > > +
> > > > +L(page_cross_continue):
> > > > +     VMOVU   (%rsi), %VMM(0)
> > > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +
> > > > +     /* If not STPCPY, just set the return value ahead of time.  */
> > > > +# ifndef USE_AS_STPCPY
> > > > +     movq    %rdi, %rax
> > > > +# endif
> > > > +
> > > > +
> > > > +     cmpq    $(CHAR_PER_VEC), %rdx
> > > > +
> > > > +     /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> > > > +        <= CHAR_PER_VEC with masked instructions (which have
> > > > +        potential for dramatically bad perf if dst splits a page and
> > > > +        is not in the TLB).  */
> > > > +# if USE_EVEX_MASKED_STORE
> > > > +     /* `jae` because length rdx is now length - 1.  */
> > > > +     jae     L(more_1x_vec)
> > > > +
> > > > +     /* If there were multiple zero-CHAR matches in the first VEC,
> > > > +        VRCX will be overset, but that's fine since any oversets were
> > > > +        at zero-positions anyway.  */
> > > > +
> > > > +#  ifdef USE_AS_STPCPY
> > > > +     tzcnt   %VRCX, %VRAX
> > > > +     cmpl    %eax, %edx
> > > > +     cmovb   %edx, %eax
> > > > +#   ifdef USE_AS_WCSCPY
> > > > +     adcl    $0, %eax
> > > > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > > +#   else
> > > > +     adcq    %rdi, %rax
> > > > +#   endif
> > > > +#  endif
> > > > +     dec     %VRCX
> > > > +
> > > > +     /* Zero out all non-zero CHAR's after the first zero match.  */
> > > > +     KMOV    %VRCX, %k1
> > > > +
> > > > +     /* Use VZERO as the destination so this can be reused for
> > > > +        L(zfill_less_vec) (which, if jumped to by subsequent logic,
> > > > +        will have zeroed out VZERO).  */
> > > > +     VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
> > > > +L(zfill_less_vec):
> > > > +     /* Get mask for what we need to set.  */
> > > > +     incl    %edx
> > > > +     mov     $-1, %VRCX
> > > > +     bzhi    %VRDX, %VRCX, %VRCX
> > > > +     KMOV    %VRCX, %k1
> > > > +     VMOVU_MASK %VZERO, (%rdi){%k1}
> > > > +     ret
> > > > +
> > > > +     .p2align 4,, 4
> > > > +L(zero_len):
> > > > +     cmpq    $-1, %rdx
> > > > +     jne     L(best_effort_strncpy)
> > > > +     movq    %rdi, %rax
> > > > +     ret
> > > > +
> > > > +     .p2align 4,, 8
> > > > +L(more_1x_vec):
> > > > +# else
> > > > +     /* `jb` because length rdx is now length - 1.  */
> > > > +     jb      L(less_1x_vec)
> > > > +# endif
> > > > +
> > > > +
> > > > +     /* This may overset, but that's fine because we still need to
> > > > +        zero-fill.  */
> > > > +     VMOVU   %VMM(0), (%rdi)
> > > > +
> > > > +
> > > > +     /* Length must be >= CHAR_PER_VEC so match here means we must
> > > > +        zero-fill.  */
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(zfill)
> > > > +
> > > > +
> > > > +     /* We are going to align rsi here so we will need to be able to
> > > > +        re-adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> > > > +        so rsi + rdx * CHAR_SIZE cannot overflow.  */
> > > > +     leaq    (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> > > > +     subq    %rsi, %rdi
> > > > +     andq    $-(VEC_SIZE), %rsi
> > > > +
> > > > +L(loop_last_4x_vec):
> > > > +     addq    %rsi, %rdi
> > > > +     subq    %rsi, %rdx
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     shrq    $2, %rdx
> > > > +# endif
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 1)(%rsi), %VMM(1)
> > > > +     VPTESTN %VMM(1), %VMM(1), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +
> > > > +     /* -1 because of the `dec %rdx` earlier.  */
> > > > +     cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
> > > > +     ja      L(more_2x_vec)
> > > > +
> > > > +L(last_2x_vec):
> > > > +     /* This will need to be computed no matter what. We do it
> > > > +        ahead of time for CHAR_PER_VEC == 64 because we can't adjust
> > > > +        the value of `tzcnt` with a shift.  */
> > > > +# if CHAR_PER_VEC == 64
> > > > +     tzcntq  %rcx, %rcx
> > > > +# endif
> > > > +
> > > > +     cmpl    $(CHAR_PER_VEC), %edx
> > > > +     jb      L(ret_vec_x1_len)
> > > > +
> > > > +     /* Separate logic for CHAR_PER_VEC == 64 because we already did
> > > > +        `tzcnt` on VRCX.  */
> > > > +# if CHAR_PER_VEC == 64
> > > > +     /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
> > > > +     cmpb    $CHAR_PER_VEC, %cl
> > > > +     jnz     L(ret_vec_x1_no_bsf)
> > > > +# else
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(ret_vec_x1)
> > > > +# endif
> > > > +
> > > > +
> > > > +
> > > > +     VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
> > > > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > > > +     KMOV    %k0, %VRCX
> > > > +
> > > > +# if CHAR_PER_VEC < 64
> > > > +     /* This essentially adds CHAR_PER_VEC to the computed result.  */
> > > > +     shlq    $CHAR_PER_VEC, %rcx
> > > > +# else
> > > > +     tzcntq  %rcx, %rcx
> > > > +     addl    $CHAR_PER_VEC, %ecx
> > > > +# endif
> > > > +
> > > > +     .p2align 4,, 4
> > > > +L(ret_vec_x1_len):
> > > > +     /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
> > > > +        already been done.  */
> > > > +# if CHAR_PER_VEC < 64
> > > > +     tzcntq  %rcx, %rcx
> > > > +# endif
> > > > +     cmpl    %ecx, %edx
> > > > +     jbe     L(ret_vec_x1_len_no_zfill)
> > > > +     /* Fall through (the expected case) is copy len < buffer len.  */
> > > > +     VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > > +L(ret_vec_x1_len_no_zfill_mov):
> > > > +     movl    %ecx, %edx
> > > > +# ifdef USE_AS_STPCPY
> > > > +     /* clear flags.  */
> > > > +     xorl    %ecx, %ecx
> > > > +# endif
> > > > +L(ret_vec_x1_len_no_zfill):
> > > > +     VMOVU   ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > > > +     VMOVU   %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > > +# ifdef USE_AS_STPCPY
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     adcq    $0, %rdx
> > > > +     leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
> > > > +#  else
> > > > +     leal    (VEC_SIZE)(%rdx), %eax
> > > > +     adcq    %rdi, %rax
> > > > +#  endif
> > > > +# endif
> > > > +     ret
> > > > +
> > > > +
> > > > +     .p2align 4,, 10
> > > > +L(ret_vec_x1):
> > > > +     bsf     %VRCX, %VRCX
> > > > +L(ret_vec_x1_no_bsf):
> > > > +     VMOVU   %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > > +     subl    %ecx, %edx
> > > > +     cmpl    $CHAR_PER_VEC, %edx
> > > > +     jb      L(ret_vec_x1_len_no_zfill_mov)
> > > > +     /* Fall through (expectation) is copy len < buffer len.  */
> > > > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > > > +     VMOVU   %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
> > > > +# ifdef USE_AS_STPCPY
> > > > +     leaq    (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
> > > > +# endif
> > > > +     ret
> > > > +
> > > > +     .p2align 4,, 8
> > > > +L(last_4x_vec):
> > > > +     /* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
> > > > +        $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
> > > > +        using `movzbl`.  */
> > > > +# if CHAR_PER_VEC == 64
> > > > +     movzbl  %dl, %edx
> > > > +# else
> > > > +     andl    $(CHAR_PER_VEC * 4 - 1), %edx
> > > > +# endif
> > > > +     VMOVA   (VEC_SIZE * 5)(%rsi), %VMM(1)
> > > > +     VPTESTN %VMM(1), %VMM(1), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     subq    $-(VEC_SIZE * 4), %rsi
> > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > +     cmpl    $(CHAR_PER_VEC * 2 - 1), %edx
> > > > +     jbe     L(last_2x_vec)
> > > > +     .p2align 4,, 8
> > > > +L(more_2x_vec):
> > > > +     VMOVU   %VMM(1), (VEC_SIZE * 1)(%rdi)
> > > > +     test    %VRCX, %VRCX
> > > > +     /* Must fill at least 2x VEC.  */
> > > > +     jnz     L(zfill_vec1)
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > > > +     VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     /* Must fill at least 1x VEC.  */
> > > > +     jnz     L(zfill_vec2)
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 3)(%rsi), %VMM(3)
> > > > +     VPTESTN %VMM(3), %VMM(3), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +
> > > > +     /* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
> > > > +     cmpq    $(CHAR_PER_VEC * 4 - 1), %rdx
> > > > +     ja      L(more_4x_vec)
> > > > +
> > > > +     subl    $(CHAR_PER_VEC * 3), %edx
> > > > +     jb      L(ret_vec_x3_len)
> > > > +
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(ret_vec_x3)
> > > > +
> > > > +     VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
> > > > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > > > +     KMOV    %k0, %VRCX
> > > > +     tzcnt   %VRCX, %VRCX
> > > > +     cmpl    %ecx, %edx
> > > > +     jbe     L(ret_vec_x4_len_no_zfill)
> > > > +     /* Fall through (expectation) is copy len < buffer len.  */
> > > > +     VMOVU   %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > > +     movl    %ecx, %edx
> > > > +L(ret_vec_x4_len_no_zfill):
> > > > +     VMOVU   ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > > > +     VMOVU   %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > > +# ifdef USE_AS_STPCPY
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     adcq    $0, %rdx
> > > > +     leaq    (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
> > > > +#  else
> > > > +     leal    (VEC_SIZE * 4 + 0)(%rdx), %eax
> > > > +     adcq    %rdi, %rax
> > > > +#  endif
> > > > +# endif
> > > > +     ret
> > > > +
> > > > +
> > > > +L(ret_vec_x3_len):
> > > > +     addl    $(CHAR_PER_VEC * 1), %edx
> > > > +     tzcnt   %VRCX, %VRCX
> > > > +     cmpl    %ecx, %edx
> > > > +     jbe     L(ret_vec_x3_len_no_zfill)
> > > > +     /* Fall through (expectation) is copy len < buffer len.  */
> > > > +     VMOVU   %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > > +L(ret_vec_x3_len_no_zfill_mov):
> > > > +     movl    %ecx, %edx
> > > > +# ifdef USE_AS_STPCPY
> > > > +     /* clear flags.  */
> > > > +     xorl    %ecx, %ecx
> > > > +# endif
> > > > +     .p2align 4,, 4
> > > > +L(ret_vec_x3_len_no_zfill):
> > > > +     VMOVU   ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> > > > +     VMOVU   %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> > > > +# ifdef USE_AS_STPCPY
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     adcq    $0, %rdx
> > > > +     leaq    (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
> > > > +#  else
> > > > +     leal    (VEC_SIZE * 3 + 0)(%rdx), %eax
> > > > +     adcq    %rdi, %rax
> > > > +#  endif
> > > > +# endif
> > > > +     ret
> > > > +
> > > > +
> > > > +     .p2align 4,, 8
> > > > +L(ret_vec_x3):
> > > > +     bsf     %VRCX, %VRCX
> > > > +     VMOVU   %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
> > > > +     subl    %ecx, %edx
> > > > +     jl      L(ret_vec_x3_len_no_zfill_mov)
> > > > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > > > +     VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> > > > +# ifdef USE_AS_STPCPY
> > > > +     leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> > > > +# endif
> > > > +     ret
> > > > +
> > > > +     .p2align 4,, 8
> > > > +L(more_4x_vec):
> > > > +     VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(zfill_vec3)
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 4)(%rsi), %VMM(4)
> > > > +     VMOVU   %VMM(4), (VEC_SIZE * 4)(%rdi)
> > > > +     VPTESTN %VMM(4), %VMM(4), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(zfill_vec4)
> > > >
> > > > -#define USE_AS_STRNCPY
> > > > -#define STRCPY       STRNCPY
> > > > -#include "strcpy-evex.S"
> > > > +     /* Recheck length before aligning.  */
> > > > +     cmpq    $(CHAR_PER_VEC * 8 - 1), %rdx
> > > > +     jbe     L(last_4x_vec)
> > > > +
> > > > +     /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     leaq    (%rsi, %rdx, CHAR_SIZE), %rdx
> > > > +# else
> > > > +     addq    %rsi, %rdx
> > > > +# endif
> > > > +     subq    %rsi, %rdi
> > > > +     subq    $-(VEC_SIZE * 5), %rsi
> > > > +     andq    $(VEC_SIZE * -4), %rsi
> > > > +
> > > > +
> > > > +     /* Load first half of the loop before entry.  */
> > > > +     VMOVA   (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > > > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > > > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > > > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > > > +
> > > > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > > > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > > > +     VPTESTN %VMM(4), %VMM(4), %k2
> > > > +     VPTESTN %VMM(6), %VMM(6), %k4
> > > > +
> > > > +
> > > > +     /* Offset rsi by VEC_SIZE so that we can jump to
> > > > +        L(loop_last_4x_vec).  */
> > > > +     addq    $-(VEC_SIZE), %rsi
> > > > +     KORTEST %k2, %k4
> > > > +     jnz     L(loop_4x_done)
> > > > +
> > > > +     /* Store loop end in r9.  */
> > > > +     leaq    -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
> > > > +
> > > > +     .p2align 4,, 11
> > > > +L(loop_4x_vec):
> > > > +     VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > > > +     VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> > > > +     VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> > > > +     VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> > > > +
> > > > +     subq    $(VEC_SIZE * -4), %rsi
> > > > +     cmpq    %rsi, %r9
> > > > +     jbe     L(loop_last_4x_vec)
> > > > +
> > > > +     VMOVA   (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> > > > +     VMOVA   (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> > > > +     VMOVA   (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> > > > +     VMOVA   (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> > > > +
> > > > +     VPMIN   %VMM(0), %VMM(1), %VMM(4)
> > > > +     VPMIN   %VMM(2), %VMM(3), %VMM(6)
> > > > +     VPTESTN %VMM(4), %VMM(4), %k2
> > > > +     VPTESTN %VMM(6), %VMM(6), %k4
> > > > +     KORTEST %k2, %k4
> > > > +     jz      L(loop_4x_vec)
> > > > +
> > > > +L(loop_4x_done):
> > > > +     /* Restore rdx (length).  */
> > > > +     subq    %rsi, %rdx
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     shrq    $2, %rdx
> > > > +# endif
> > > > +     VMOVU   %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> > > > +     /* Restore rdi (dst).  */
> > > > +     addq    %rsi, %rdi
> > > > +     VPTESTN %VMM(0), %VMM(0), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(zfill_vec1)
> > > > +
> > > > +     VMOVU   %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> > > > +     KMOV    %k2, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(zfill_vec2)
> > > > +
> > > > +     VMOVU   %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> > > > +     VPTESTN %VMM(2), %VMM(2), %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +     test    %VRCX, %VRCX
> > > > +     jnz     L(zfill_vec3)
> > > > +
> > > > +     VMOVU   %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
> > > > +     KMOV    %k4, %VRCX
> > > > +     // Zfill more....
> > > > +
> > > > +     .p2align 4,, 4
> > > > +L(zfill_vec4):
> > > > +     subq    $(VEC_SIZE * -2), %rdi
> > > > +     addq    $(CHAR_PER_VEC * -2), %rdx
> > > > +L(zfill_vec2):
> > > > +     subq    $(VEC_SIZE * -2), %rdi
> > > > +     addq    $(CHAR_PER_VEC * -1), %rdx
> > > > +L(zfill):
> > > > +     /* VRCX must be non-zero.  */
> > > > +     bsf     %VRCX, %VRCX
> > > > +
> > > > +     /* Adjust length / dst for zfill.  */
> > > > +     subq    %rcx, %rdx
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > > +# else
> > > > +     addq    %rcx, %rdi
> > > > +# endif
> > > > +# ifdef USE_AS_STPCPY
> > > > +     movq    %rdi, %rax
> > > > +# endif
> > > > +L(zfill_from_page_cross):
> > > > +
> > > > +     /* From here on out it's just memset(rdi, 0, rdx).  */
> > > > +     cmpq    $CHAR_PER_VEC, %rdx
> > > > +     jb      L(zfill_less_vec)
> > > > +
> > > > +L(zfill_more_1x_vec):
> > > > +     VMOVU   %VZERO, (%rdi)
> > > > +     VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > > +     cmpq    $(CHAR_PER_VEC * 2 - 1), %rdx
> > > > +     ja      L(zfill_more_2x_vec)
> > > > +L(zfill_done0):
> > > > +     ret
> > > > +
> > > > +     /* Coming from vec1/vec2 we must be able to zfill at least 2x
> > > > +        VEC.  */
> > > > +     .p2align 4,, 8
> > > > +L(zfill_vec3):
> > > > +     subq    $(VEC_SIZE * -2), %rdi
> > > > +     addq    $(CHAR_PER_VEC * -2), %rdx
> > > > +     .p2align 4,, 2
> > > > +L(zfill_vec1):
> > > > +     bsfq    %rcx, %rcx
> > > > +     /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
> > > > +      */
> > > > +     leaq    VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> > > > +     subq    %rcx, %rdx
> > > > +# ifdef USE_AS_STPCPY
> > > > +     movq    %rdi, %rax
> > > > +# endif
> > > > +
> > > > +
> > > > +     VMOVU   %VZERO, (%rdi)
> > > > +     VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > > +     cmpq    $(CHAR_PER_VEC * 2), %rdx
> > > > +     jb      L(zfill_done0)
> > > > +L(zfill_more_2x_vec):
> > > > +     VMOVU   %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> > > > +     VMOVU   %VZERO, (VEC_SIZE)(%rdi)
> > > > +     subq    $(CHAR_PER_VEC * 4 - 1), %rdx
> > > > +     jbe     L(zfill_done)
> > > > +
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rdx
> > > > +# else
> > > > +     addq    %rdi, %rdx
> > > > +# endif
> > > > +
> > > > +     VMOVU   %VZERO, (VEC_SIZE * 2)(%rdi)
> > > > +     VMOVU   %VZERO, (VEC_SIZE * 3)(%rdi)
> > > > +
> > > > +
> > > > +     VMOVU   %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> > > > +     VMOVU   %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> > > > +
> > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > +     cmpq    %rdi, %rdx
> > > > +     jbe     L(zfill_done)
> > > > +
> > > > +     /* Align rdi and zfill loop.  */
> > > > +     andq    $-(VEC_SIZE), %rdi
> > > > +     .p2align 4,, 12
> > > > +L(zfill_loop_4x_vec):
> > > > +     VMOVA   %VZERO, (VEC_SIZE * 0)(%rdi)
> > > > +     VMOVA   %VZERO, (VEC_SIZE * 1)(%rdi)
> > > > +     VMOVA   %VZERO, (VEC_SIZE * 2)(%rdi)
> > > > +     VMOVA   %VZERO, (VEC_SIZE * 3)(%rdi)
> > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > +     cmpq    %rdi, %rdx
> > > > +     ja      L(zfill_loop_4x_vec)
> > > > +L(zfill_done):
> > > > +     ret
> > > > +
> > > > +
> > > > +     /* Less than 1x VEC case if we are not using the evex masked store.  */
> > > > +# if !USE_EVEX_MASKED_STORE
> > > > +     .p2align 4,, 8
> > > > +L(copy_1x):
> > > > +     /* Special case for copy 1x. It can be handled quickly and many
> > > > +        buffer sizes have convenient alignment.  */
> > > > +     VMOVU   %VMM(0), (%rdi)
> > > > +     /* If no zeros then we are done.  */
> > > > +     testl   %ecx, %ecx
> > > > +     jz      L(ret_1x_1x)
> > > > +
> > > > +     /* Need to zfill. Note we know that length <= CHAR_PER_VEC so we
> > > > +        only handle the small case here.  */
> > > > +     bsf     %VRCX, %VRCX
> > > > +L(zfill_less_vec_no_bsf):
> > > > +     /* Adjust length / dst then just zfill less_vec.  */
> > > > +     subq    %rcx, %rdx
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > > +#  else
> > > > +     addq    %rcx, %rdi
> > > > +#  endif
> > > > +#  ifdef USE_AS_STPCPY
> > > > +     movq    %rdi, %rax
> > > > +#  endif
> > > > +
> > > > +L(zfill_less_vec):
> > > > +     cmpl    $((VEC_SIZE / 2) / CHAR_SIZE), %edx
> > > > +     jb      L(zfill_less_half)
> > > > +
> > > > +     VMOVU   %VZERO_HALF, (%rdi)
> > > > +     VMOVU   %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > > +     ret
> > > > +#  ifdef USE_AS_STPCPY
> > > > +L(ret_1x_1x):
> > > > +     leaq    CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
> > > > +     ret
> > > > +#  endif
> > > > +
> > > > +
> > > > +#  if VEC_SIZE == 64
> > > > +     .p2align 4,, 4
> > > > +L(copy_32_63):
> > > > +     /* Overfill to avoid branches.  */
> > > > +     VMOVU   -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> > > > +     VMOVU   %VMM_256(0), (%rdi)
> > > > +     VMOVU   %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > > +
> > > > +     /* We are taking advantage of the fact that to be here we must
> > > > +        be writing the null-term at (%rdi, %rcx), so we have a byte
> > > > +        of leeway for overwriting.  */
> > > > +     cmpl    %ecx, %edx
> > > > +     ja      L(zfill_less_vec_no_bsf)
> > > > +#   ifndef USE_AS_STPCPY
> > > > +L(ret_1x_1x):
> > > > +#   else
> > > > +#    ifdef USE_AS_WCSCPY
> > > > +     adcq    $0, %rdx
> > > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > > +#    else
> > > > +     movl    %edx, %eax
> > > > +     adcq    %rdi, %rax
> > > > +#    endif
> > > > +#   endif
> > > > +     ret
> > > > +#  endif
> > > > +
> > > > +     .p2align 4,, 4
> > > > +L(copy_16_31):
> > > > +     /* Overfill to avoid branches.  */
> > > > +     vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> > > > +     VMOVU   %VMM_128(0), (%rdi)
> > > > +     vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > > +     cmpl    %ecx, %edx
> > > > +
> > > > +     /* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
> > > > +        we have a larger copy block for 32-63 so this just falls
> > > > +        through to zfill 16-31. If VEC_SIZE == 32 then we check for
> > > > +        full zfill of less than 1x VEC.  */
> > > > +#  if VEC_SIZE == 64
> > > > +     jbe     L(ret_16_31)
> > > > +     subl    %ecx, %edx
> > > > +#   ifdef USE_AS_WCSCPY
> > > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > > +#   else
> > > > +     addq    %rcx, %rdi
> > > > +#   endif
> > > > +#   ifdef USE_AS_STPCPY
> > > > +     movq    %rdi, %rax
> > > > +#   endif
> > > > +L(zfill_less_half):
> > > > +L(zfill_less_32):
> > > > +     cmpl    $(16 / CHAR_SIZE), %edx
> > > > +     jb      L(zfill_less_16)
> > > > +     VMOVU   %VZERO_128, (%rdi)
> > > > +     VMOVU   %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > > +#   ifdef USE_AS_STPCPY
> > > > +     ret
> > > > +#   endif
> > > > +L(ret_16_31):
> > > > +#   ifdef USE_AS_STPCPY
> > > > +#    ifdef USE_AS_WCSCPY
> > > > +     adcq    $0, %rdx
> > > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > > +#    else
> > > > +     movl    %edx, %eax
> > > > +     adcq    %rdi, %rax
> > > > +#    endif
> > > > +#   endif
> > > > +     ret
> > > > +#  else
> > > > +     /* VEC_SIZE == 32 begins.  */
> > > > +     ja      L(zfill_less_vec_no_bsf)
> > > > +#   ifndef USE_AS_STPCPY
> > > > +L(ret_1x_1x):
> > > > +#   else
> > > > +#    ifdef USE_AS_WCSCPY
> > > > +     adcq    $0, %rdx
> > > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > > +#    else
> > > > +     movl    %edx, %eax
> > > > +     adcq    %rdi, %rax
> > > > +#    endif
> > > > +#   endif
> > > > +     ret
> > > > +#  endif
> > > > +
> > > > +
> > > > +     .p2align 4,, 4
> > > > +L(copy_8_15):
> > > > +     /* Overfill to avoid branches.  */
> > > > +     movq    -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> > > > +     vmovq   %VMM_128(0), (%rdi)
> > > > +     movq    %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > > +     cmpl    %ecx, %edx
> > > > +     jbe     L(ret_8_15)
> > > > +     subl    %ecx, %edx
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > > +#  else
> > > > +     addq    %rcx, %rdi
> > > > +#  endif
> > > > +#  ifdef USE_AS_STPCPY
> > > > +     movq    %rdi, %rax
> > > > +#  endif
> > > > +     .p2align 4,, 8
> > > > +#  if VEC_SIZE == 32
> > > > +L(zfill_less_half):
> > > > +#  endif
> > > > +L(zfill_less_16):
> > > > +     xorl    %ecx, %ecx
> > > > +     cmpl    $(8 / CHAR_SIZE), %edx
> > > > +     jb      L(zfill_less_8)
> > > > +     movq    %rcx, (%rdi)
> > > > +     movq    %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> > > > +#  ifndef USE_AS_STPCPY
> > > > +L(ret_8_15):
> > > > +#  endif
> > > > +     ret
> > > > +
> > > > +     .p2align 4,, 8
> > > > +L(less_1x_vec):
> > > > +     je      L(copy_1x)
> > > > +
> > > > +     /* We will need `tzcnt` result for all other copy sizes.  */
> > > > +     tzcnt   %VRCX, %VRCX
> > > > +#  if VEC_SIZE == 64
> > > > +     cmpl    $(32 / CHAR_SIZE), %edx
> > > > +     jae     L(copy_32_63)
> > > > +#  endif
> > > > +
> > > > +     cmpl    $(16 / CHAR_SIZE), %edx
> > > > +     jae     L(copy_16_31)
> > > > +
> > > > +     cmpl    $(8 / CHAR_SIZE), %edx
> > > > +     jae     L(copy_8_15)
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     testl   %ecx, %ecx
> > > > +     jz      L(zfill_less_8_set_ret)
> > > > +
> > > > +     movl    (%rsi, %rdx, CHAR_SIZE), %esi
> > > > +     vmovd   %VMM_128(0), (%rdi)
> > > > +     movl    %esi, (%rdi, %rdx, CHAR_SIZE)
> > > > +#   ifdef USE_AS_STPCPY
> > > > +     cmpl    %ecx, %edx
> > > > +L(ret_8_15):
> > > > +     adcq    $0, %rdx
> > > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > > +#   endif
> > > > +     ret
> > > > +L(zfill_less_8_set_ret):
> > > > +     xorl    %ecx, %ecx
> > > > +#   ifdef USE_AS_STPCPY
> > > > +     movq    %rdi, %rax
> > > > +#   endif
> > > > +L(zfill_less_8):
> > > > +     movl    %ecx, (%rdi)
> > > > +     movl    %ecx, (%rdi, %rdx, CHAR_SIZE)
> > > > +     ret
> > > > +#  else
> > > > +     cmpl    $3, %edx
> > > > +     jb      L(copy_0_3)
> > > > +     /* Overfill to avoid branches.  */
> > > > +     movl    -3(%rsi, %rdx), %esi
> > > > +     vmovd   %VMM_128(0), (%rdi)
> > > > +     movl    %esi, -3(%rdi, %rdx)
> > > > +     cmpl    %ecx, %edx
> > > > +     jbe     L(ret_4_7)
> > > > +     subq    %rcx, %rdx
> > > > +     addq    %rcx, %rdi
> > > > +#   ifdef USE_AS_STPCPY
> > > > +     movq    %rdi, %rax
> > > > +#   endif
> > > > +     xorl    %ecx, %ecx
> > > > +     .p2align 4,, 8
> > > > +L(zfill_less_8):
> > > > +     cmpl    $3, %edx
> > > > +     jb      L(zfill_less_3)
> > > > +     movl    %ecx, (%rdi)
> > > > +     movl    %ecx, -3(%rdi, %rdx)
> > > > +#   ifdef USE_AS_STPCPY
> > > > +     ret
> > > > +#   endif
> > > > +
> > > > +L(ret_4_7):
> > > > +#   ifdef USE_AS_STPCPY
> > > > +L(ret_8_15):
> > > > +     movl    %edx, %eax
> > > > +     adcq    %rdi, %rax
> > > > +#   endif
> > > > +     ret
> > > > +
> > > > +     .p2align 4,, 4
> > > > +L(zfill_less_3):
> > > > +     testl   %edx, %edx
> > > > +     jz      L(zfill_1)
> > > > +     movw    %cx, (%rdi)
> > > > +L(zfill_1):
> > > > +     movb    %cl, (%rdi, %rdx)
> > > > +     ret
> > > > +
> > > > +     .p2align 4,, 8
> > > > +L(copy_0_3):
> > > > +     vmovd   %VMM_128(0), %r8d
> > > > +     testl   %edx, %edx
> > > > +     jz      L(copy_1)
> > > > +     movw    %r8w, (%rdi)
> > > > +     cmpl    %ecx, %edx
> > > > +     ja      L(zfill_from_1)
> > > > +     movzbl  (%rsi, %rdx), %r8d
> > > > +#   ifdef USE_AS_STPCPY
> > > > +     movl    %edx, %eax
> > > > +     adcq    %rdi, %rax
> > > > +     movb    %r8b, (%rdi, %rdx)
> > > > +     ret
> > > > +#   endif
> > > > +
> > > > +L(copy_1):
> > > > +#   ifdef USE_AS_STPCPY
> > > > +     movl    %edx, %eax
> > > > +     cmpl    %ecx, %edx
> > > > +     adcq    %rdi, %rax
> > > > +#   endif
> > > > +#   ifdef USE_AS_WCSCPY
> > > > +     vmovd   %VMM_128(0), (%rdi)
> > > > +#   else
> > > > +     movb    %r8b, (%rdi, %rdx)
> > > > +#   endif
> > > > +     ret
> > > > +#  endif
> > > > +
> > > > +
> > > > +#  ifndef USE_AS_WCSCPY
> > > > +     .p2align 4,, 8
> > > > +L(zfill_from_1):
> > > > +#   ifdef USE_AS_STPCPY
> > > > +     leaq    (%rdi, %rcx), %rax
> > > > +#   endif
> > > > +     movw    $0, -1(%rdi, %rdx)
> > > > +     ret
> > > > +#  endif
> > > > +
> > > > +     .p2align 4,, 4
> > > > +L(zero_len):
> > > > +     incq    %rdx
> > > > +     jne     L(best_effort_strncpy)
> > > > +     movq    %rdi, %rax
> > > > +     ret
> > > > +# endif
> > > > +
> > > > +
> > > > +     .p2align 4,, 4
> > > > +     .p2align 6,, 8
> > > > +L(page_cross):
> > > > +     movq    %rsi, %rax
> > > > +     andq    $(VEC_SIZE * -1), %rax
> > > > +     VPCMPEQ (%rax), %VZERO, %k0
> > > > +     KMOV    %k0, %VRCX
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     movl    %esi, %r8d
> > > > +     shrl    $2, %r8d
> > > > +     andl    $(CHAR_PER_VEC - 1), %r8d
> > > > +     shrx    %VR8, %VRCX, %VRCX
> > > > +# else
> > > > +     shrx    %VRSI, %VRCX, %VRCX
> > > > +# endif
> > > > +
> > > > +     /* Compute the number of bytes we checked.  */
> > > > +     subl    %esi, %eax
> > > > +     andl    $(VEC_SIZE - 1), %eax
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     shrl    $2, %eax
> > > > +# endif
> > > > +
> > > > +     /* If rax > rdx then we are finishing the copy at the end of the
> > > > +        page.  */
> > > > +     cmpq    %rax, %rdx
> > > > +     jb      L(page_cross_small)
> > > > +
> > > > +
> > > > +     /* If rcx is non-zero, a null-CHAR was found, so continue handling
> > > > +        the page cross here; otherwise jump back to the main path.  */
> > > > +     test    %VRCX, %VRCX
> > > > +     jz      L(page_cross_continue)
> > > > +
> > > > +     /* We found a zero-CHAR so we need to copy then zfill (we know
> > > > +        we didn't cover all of the length here).  */
> > > > +     bsf     %VRCX, %VRCX
> > > > +L(movsb_and_zfill):
> > > > +     incl    %ecx
> > > > +     subq    %rcx, %rdx
> > > > +# ifdef USE_AS_STPCPY
> > > > +     leaq    -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
> > > > +# else
> > > > +     movq    %rdi, %rax
> > > > +# endif
> > > > +
> > > > +     REP_MOVS
> > > > +# ifdef USE_AS_WCSCPY
> > > > +     movl    $0, (%rdi)
> > > > +# else
> > > > +     movb    $0, (%rdi)
> > > > +# endif
> > > > +     jmp     L(zfill_from_page_cross)
> > > > +
> > > > +L(page_cross_small):
> > > > +     tzcnt   %VRCX, %VRCX
> > > > +     cmpl    %ecx, %edx
> > > > +     jbe     L(page_cross_copy_only)
> > > > +
> > > > +     /* Do a zfill of the tail before copying.  */
> > > > +     movq    %rdi, %r9
> > > > +     xorl    %eax, %eax
> > > > +
> > > > +     movl    %ecx, %r8d
> > > > +
> > > > +     subl    %ecx, %edx
> > > > +     leaq    CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> > > > +     movl    %edx, %ecx
> > > > +     REP_STOS
> > > > +     movq    %r9, %rdi
> > > > +     movl    %r8d, %edx
> > > > +L(page_cross_copy_only):
> > > > +     leal    1(%rdx), %ecx
> > > > +# ifdef USE_AS_STPCPY
> > > > +#  ifdef USE_AS_WCSCPY
> > > > +     adcl    $0, %edx
> > > > +     leaq    (%rdi, %rdx, CHAR_SIZE), %rax
> > > > +#  else
> > > > +     movl    %edx, %eax
> > > > +     adcq    %rdi, %rax
> > > > +#  endif
> > > > +# else
> > > > +     movq    %rdi, %rax
> > > > +# endif
> > > > +     REP_MOVS
> > > > +     ret
> > > > +
> > > > +
> > > > +L(best_effort_strncpy):
> > > > +     movq    %rdx, %rcx
> > > > +     xorl    %eax, %eax
> > > > +     movq    %rdi, %r8
> > > > +     /* The length is >= 2^63. We very much expect to segfault at rep
> > > > +        stos. If that doesn't happen then just strcpy to finish.  */
> > > > +     REP_STOS
> > > > +     movq    %r8, %rdi
> > > > +     jmp     OVERFLOW_STRCPY
> > > > +END(STRNCPY)
> > > > +#endif
> > > > diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> > > > new file mode 100644
> > > > index 0000000000..d5ff4cbe50
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> > >
> > > Please add a copyright notice.
> > >
> > > > @@ -0,0 +1,65 @@
> > > > +#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
> > > > +#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
> > > > +
> > > > +#if defined USE_MULTIARCH && IS_IN(libc)
> > > > +#  define UNDERSCORES __
> > > > +#  ifdef USE_WITH_SSE2
> > > > +#    define ISA_EXT _sse2
> > > > +#  elif defined USE_WITH_AVX
> > > > +#    ifdef USE_WITH_RTM
> > > > +#      define ISA_EXT _avx_rtm
> > > > +#    else
> > > > +#      define ISA_EXT _avx
> > > > +#    endif
> > > > +#  elif defined USE_WITH_AVX2
> > >
> > > Do we have a function with both AVX and AVX2 versions? If not, should
> > > keep just 1.
> > >
> > > > +#    ifdef USE_WITH_RTM
> > > > +#      define ISA_EXT _avx2_rtm
> > > > +#    else
> > > > +#      define ISA_EXT _avx2
> > > > +#    endif
> > > > +
> > > > +#  elif defined USE_WITH_EVEX256
> > > > +#    define ISA_EXT _evex
> > > > +#  elif defined USE_WITH_EVEX512
> > > > +#    define ISA_EXT _evex512
> > > > +#  endif
> > > > +#else
> > > > +#  define UNDERSCORES
> > > > +#  define ISA_EXT
> > > > +#endif
> > > > +
> > > > +#ifdef USE_AS_WCSCPY
> > > > +#  define STRCPY_PREFIX wc
> > > > +#  define STRCAT_PREFIX wcs
> > > > +#  ifdef USE_AS_STPCPY
> > > > +#    define STRCPY_POSTFIX pcpy
> > > > +#  else
> > > > +#    define STRCPY_POSTFIX scpy
> > > > +#  endif
> > > > +#else
> > > > +#  define STRCPY_PREFIX st
> > > > +#  define STRCAT_PREFIX str
> > > > +#  ifdef USE_AS_STPCPY
> > > > +#    define STRCPY_POSTFIX pcpy
> > > > +#  else
> > > > +#    define STRCPY_POSTFIX rcpy
> > > > +#  endif
> > > > +#endif
> > > > +#define STRCAT_POSTFIX cat
> > > > +
> > > > +#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
> > > > +  underscores##prefix##postfix##ext
> > > > +
> > > > +#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
> > > > +
> > > > +#ifndef OVERFLOW_STRCPY
> > > > +#  define OVERFLOW_STRCPY                                                     \
> > > > +    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
> > > > +#endif
> > > > +
> > > > +#ifndef OVERFLOW_STRCAT
> > > > +#  define OVERFLOW_STRCAT                                                     \
> > > > +    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
> > > > +#endif
> > > > +
> > > > +#endif
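
To make the token pasting concrete, here is roughly how the pieces
expand for a multiarch libc build of the evex256 stpncpy (i.e. with
USE_AS_STPCPY defined and USE_AS_WCSCPY not defined; the selected
configuration macros are assumptions read off the #if chain above):

    /* UNDERSCORES     -> __
       STRCPY_PREFIX   -> st
       STRCPY_POSTFIX  -> pcpy     (USE_AS_STPCPY)
       ISA_EXT         -> _evex    (USE_WITH_EVEX256)

       OVERFLOW_STRCPY -> OF_NAMER (__, st, pcpy, _evex)
                       -> __stpcpy_evex  */

which is how the L(best_effort_strncpy) path in the quoted strncpy code
reaches a plain stpcpy (or __strcpy_evex for the strncpy build) after
the huge-length rep stos.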
> > > > --
> > > > 2.34.1
> > > >
> > >
> > > H.J.
>
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v4 1/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-04 23:04   ` [PATCH v4 1/4] " Noah Goldstein
                       ` (2 preceding siblings ...)
  2022-11-04 23:04     ` [PATCH v4 4/4] x86: Add avx2 " Noah Goldstein
@ 2022-11-04 23:34     ` H.J. Lu
  3 siblings, 0 replies; 42+ messages in thread
From: H.J. Lu @ 2022-11-04 23:34 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Nov 04, 2022 at 04:04:07PM -0700, Noah Goldstein wrote:
> Optimizations are:
>     1. Use more overlapping stores to avoid branches.
>     2. Reduce how unrolled the aligning copies are (this is more of a
>        code-size save, it's a negative for some sizes in terms of
>        perf).
>     3. Improve the loop a bit (similar to what we do in strlen with
>        2x vpminu + kortest instead of 3x vpminu + kmov + test; a
>        rough sketch follows this list).
>     4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
>        number that are taken.
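
A rough, non-authoritative sketch of the item-3 check, written in C with
AVX2/AVX-512 (VL + BW) intrinsics rather than the actual assembly.  The
function name is made up for illustration, and it assumes src is aligned
to 4 * 32 bytes so all four loads stay within one page, as the real loop
arranges:

    #include <immintrin.h>

    /* Return nonzero if any of the next 4 x 32 bytes at SRC contains a
       zero byte: two vpminub reductions feed two vptestnmb mask tests,
       and one combined test (kortest in the assembly) replaces a
       per-vector kmov + test sequence.  */
    static int
    has_null_in_4x_vec (const char *src)
    {
      __m256i v0 = _mm256_load_si256 ((const __m256i *) src + 0);
      __m256i v1 = _mm256_load_si256 ((const __m256i *) src + 1);
      __m256i v2 = _mm256_load_si256 ((const __m256i *) src + 2);
      __m256i v3 = _mm256_load_si256 ((const __m256i *) src + 3);

      /* min (a, b) has a zero byte iff a or b has one.  */
      __m256i min01 = _mm256_min_epu8 (v0, v1);
      __m256i min23 = _mm256_min_epu8 (v2, v3);

      /* vptestnmb: mask bit is set where the byte is zero.  */
      __mmask32 k01 = _mm256_testn_epi8_mask (min01, min01);
      __mmask32 k23 = _mm256_testn_epi8_mask (min23, min23);

      return (k01 | k23) != 0;
    }

The old loop instead reduced all four vectors with a third vpminu and
then needed a kmovd + test before it could branch.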
> 
> Performance Changes:
> 
>     Times are from N = 10 runs of the benchmark suite and are
>     reported as geometric mean of all ratios of
>     New Implementation / Old Implementation.
> 
>     stpcpy-evex      -> 0.922
>     strcat-evex      -> 0.985
>     strcpy-evex      -> 0.880
> 
>     strncpy-evex     -> 0.831
>     stpncpy-evex     -> 0.780
> 
>     strncat-evex     -> 0.958
> 
> Code Size Changes:
>     function         -> Bytes New / Bytes Old -> Ratio
> 
>     strcat-evex      ->  819 / 1874 -> 0.437
>     strcpy-evex      ->  700 / 1074 -> 0.652
>     stpcpy-evex      ->  735 / 1094 -> 0.672
> 
>     strncpy-evex     -> 1397 / 2611 -> 0.535
>     stpncpy-evex     -> 1489 / 2691 -> 0.553
> 
>     strncat-evex     -> 1184 / 2832 -> 0.418
> 
> Notes:
>     1. Because of the significant difference between the
>        implementations they are split into three files.
> 
>            strcpy-evex.S    -> strcpy, stpcpy, strcat
>            strncpy-evex.S   -> strncpy
>            strncat-evex.S   -> strncat
> 
>        I couldn't find a way to merge them without making the
>        ifdefs incredibly difficult to follow.
> 
>     2. All implementations can be made evex512 by including
>        "x86-evex512-vecs.h" at the top.
> 
>     3. All implementations have an optional define:
>         `USE_EVEX_MASKED_STORE`
>        Setting it to one uses evex-masked stores for handling short
>        strings.  This saves code size and branches.  It's disabled
>        for all implementations at the moment as there are some
>        serious drawbacks to masked stores in certain cases, but
>        that may be fixed on future architectures.  (A sketch of the
>        masked-store approach follows this list.)
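
As a loose illustration of the masked-store approach this note refers
to (a sketch only, not the code in the patch; the function name is
hypothetical, it uses AVX-512 VL + BW intrinsics, and it assumes the
caller has already ruled out a page cross on src before the unmasked
load, as the real code does):

    #include <immintrin.h>

    /* Copy a string known to terminate within 32 bytes using a single
       masked store, avoiding any branch on the exact length.  */
    static char *
    copy_short_masked (char *dst, const char *src)
    {
      __m256i v = _mm256_loadu_si256 ((const __m256i *) src);

      /* Mask with a bit set at every zero byte; assumed non-zero here
         because the terminator is within this vector.  */
      __mmask32 nulls = _mm256_cmpeq_epi8_mask (v, _mm256_setzero_si256 ());

      /* Bits 0 .. first-null, inclusive.  */
      __mmask32 keep = nulls ^ (nulls - 1);

      /* One masked store covers the characters plus the terminator;
         this is the store that can be painful if dst splits a page and
         the second page is not in the TLB.  */
      _mm256_mask_storeu_epi8 (dst, keep, v);
      return dst;
    }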
> 
> Full check passes on x86-64 and build succeeds for all ISA levels w/
> and w/o multiarch.
> ---
>  sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
>  sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
>  sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
>  sysdeps/x86_64/multiarch/strncat-evex.S       |  525 ++++++-
>  sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
>  .../multiarch/strncpy-or-cat-overflow-def.h   |   65 +
>  6 files changed, 1990 insertions(+), 1173 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> 
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> index 99ea76a372..3693491baa 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> @@ -3,6 +3,5 @@
>  #endif
>  
>  #define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY	STPNCPY
> -#include "strcpy-evex.S"
> +#define STRNCPY	STPNCPY
> +#include "strncpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
> index 0e2df947e9..b4207b7889 100644
> --- a/sysdeps/x86_64/multiarch/strcat-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcat-evex.S
> @@ -1,286 +1,7 @@
> -/* strcat with 256-bit EVEX instructions.
> -   Copyright (C) 2021-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (4)
> -
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_evex
> -# endif
> -
> -# define VMOVU		vmovdqu64
> -# define VMOVA		vmovdqa64
> -
> -/* zero register */
> -# define XMMZERO	xmm16
> -# define YMMZERO	ymm16
> -# define YMM0		ymm17
> -# define YMM1		ymm18
> -
> -# define USE_AS_STRCAT
> -
> -/* Number of bytes in a vector register */
> -# define VEC_SIZE	32
> -
> -	.section .text.evex,"ax",@progbits
> -ENTRY (STRCAT)
> -	mov	%rdi, %r9
> -# ifdef USE_AS_STRNCAT
> -	mov	%rdx, %r8
> -# endif
> -
> -	xor	%eax, %eax
> -	mov	%edi, %ecx
> -	and	$((VEC_SIZE * 4) - 1), %ecx
> -	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
> -	cmp	$(VEC_SIZE * 3), %ecx
> -	ja	L(fourth_vector_boundary)
> -	vpcmpb	$0, (%rdi), %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_first_vector)
> -	mov	%rdi, %rax
> -	and	$-VEC_SIZE, %rax
> -	jmp	L(align_vec_size_start)
> -L(fourth_vector_boundary):
> -	mov	%rdi, %rax
> -	and	$-VEC_SIZE, %rax
> -	vpcmpb	$0, (%rax), %YMMZERO, %k0
> -	mov	$-1, %r10d
> -	sub	%rax, %rcx
> -	shl	%cl, %r10d
> -	kmovd	%k0, %edx
> -	and	%r10d, %edx
> -	jnz	L(exit)
> -
> -L(align_vec_size_start):
> -	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -	add	$(VEC_SIZE * 4), %rax
> -	kmovd	%k4, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -	kmovd	%k4, %edx
> -	add	$(VEC_SIZE * 4), %rax
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -	add	$(VEC_SIZE * 4), %rax
> -	kmovd	%k4, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -	add	$(VEC_SIZE * 5), %rax
> -	kmovd	%k4, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
> -	add	$VEC_SIZE, %rax
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
> -	add	$VEC_SIZE, %rax
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
> -	add	$VEC_SIZE, %rax
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	add	$VEC_SIZE, %rax
> -
> -	.p2align 4
> -L(align_four_vec_loop):
> -	VMOVA	(%rax), %YMM0
> -	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
> -	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
> -	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
> -	vpminub	%YMM0, %YMM1, %YMM0
> -	/* If K0 != 0, there is a null byte.  */
> -	vpcmpb	$0, %YMM0, %YMMZERO, %k0
> -	add	$(VEC_SIZE * 4), %rax
> -	ktestd	%k0, %k0
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
> -	sub	$(VEC_SIZE * 5), %rax
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 4), %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit):
> -	sub	%rdi, %rax
> -L(exit_null_on_first_vector):
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_second_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$VEC_SIZE, %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_third_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 2), %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_fourth_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 3), %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_fifth_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 4), %rax
> -
> -	.p2align 4
> -L(StartStrcpyPart):
> -	lea	(%r9, %rax), %rdi
> -	mov	%rsi, %rcx
> -	mov	%r9, %rax      /* save result */
> -
> -# ifdef USE_AS_STRNCAT
> -	test	%r8, %r8
> -	jz	L(ExitZero)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-evex.S"
> +#ifndef STRCAT
> +# define STRCAT	__strcat_evex
>  #endif
> +
> +#define USE_AS_STRCAT
> +#define STRCPY	STRCAT
> +#include "strcpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
> index 82e45ac675..932129ab40 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
> @@ -1,4 +1,4 @@
> -/* strcpy with 256-bit EVEX instructions.
> +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
>     Copyright (C) 2021-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>  
> @@ -17,990 +17,526 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  #include <isa-level.h>
> -
>  #if ISA_SHOULD_BUILD (4)
>  
>  
> -# ifndef USE_AS_STRCAT
> -#  include <sysdep.h>
> +	/* Use evex-masked stores for small sizes. Turned off at the
> +	   moment.  */
> +# define USE_EVEX_MASKED_STORE	0
> +	/* Use movsb in page cross case to save code size.  */
> +# define USE_MOVSB_IN_PAGE_CROSS	1
>  
> -#  ifndef STRCPY
> -#   define STRCPY  __strcpy_evex
> -#  endif
> +# include <sysdep.h>
>  
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
>  # endif
>  
> -# define VMOVU		vmovdqu64
> -# define VMOVA		vmovdqa64
> -
> -/* Number of bytes in a vector register */
> -# ifndef VEC_SIZE
> -#  define VEC_SIZE	32
> +# ifndef STRCPY
> +#  define STRCPY	__strcpy_evex
>  # endif
>  
> -# define XMM2		xmm18
> -# define XMM3		xmm19
>  
> -# define YMM2		ymm18
> -# define YMM3		ymm19
> -# define YMM4		ymm20
> -# define YMM5		ymm21
> -# define YMM6		ymm22
> -# define YMM7		ymm23
> +# ifdef USE_AS_WCSCPY
> +#  define VMOVU_MASK	vmovdqu32
> +#  define VPMIN	vpminud
> +#  define VPTESTN	vptestnmd
> +#  define VPTEST	vptestmd
> +#  define VPCMPEQ	vpcmpeqd
> +#  define CHAR_SIZE	4
>  
> -# ifndef USE_AS_STRCAT
> +#  define REP_MOVS	rep movsd
>  
> -/* zero register */
> -#  define XMMZERO	xmm16
> -#  define YMMZERO	ymm16
> -#  define YMM1		ymm17
> -
> -	.section .text.evex,"ax",@progbits
> -ENTRY (STRCPY)
> -#  ifdef USE_AS_STRNCPY
> -	mov	%RDX_LP, %R8_LP
> -	test	%R8_LP, %R8_LP
> -	jz	L(ExitZero)
> -#  endif
> -	mov	%rsi, %rcx
> -#  ifndef USE_AS_STPCPY
> -	mov	%rdi, %rax      /* save result */
> -#  endif
> +#  define USE_WIDE_CHAR
> +# else
> +#  define VMOVU_MASK	vmovdqu8
> +#  define VPMIN	vpminub
> +#  define VPTESTN	vptestnmb
> +#  define VPTEST	vptestmb
> +#  define VPCMPEQ	vpcmpeqb
> +#  define CHAR_SIZE	1
>  
> -	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
> +#  define REP_MOVS	rep movsb
>  # endif
>  
> -	and	$((VEC_SIZE * 4) - 1), %ecx
> -	cmp	$(VEC_SIZE * 2), %ecx
> -	jbe	L(SourceStringAlignmentLessTwoVecSize)
> -
> -	and	$-VEC_SIZE, %rsi
> -	and	$(VEC_SIZE - 1), %ecx
> -
> -	vpcmpb	$0, (%rsi), %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	shr	%cl, %rdx
> +# include "reg-macros.h"
>  
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -	mov	$VEC_SIZE, %r10
> -	sub	%rcx, %r10
> -	cmp	%r10, %r8
> -#  else
> -	mov	$(VEC_SIZE + 1), %r10
> -	sub	%rcx, %r10
> -	cmp	%r10, %r8
> -#  endif
> -	jbe	L(CopyVecSizeTailCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeTail)
> -
> -	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
> -	kmovd	%k1, %edx
>  
> -# ifdef USE_AS_STRNCPY
> -	add	$VEC_SIZE, %r10
> -	cmp	%r10, %r8
> -	jbe	L(CopyTwoVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyTwoVecSize)
> -
> -	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
> -	VMOVU	%YMM2, (%rdi)
> -
> -/* If source address alignment != destination address alignment */
> -	.p2align 4
> -L(UnalignVecSizeBoth):
> -	sub	%rcx, %rdi
> -# ifdef USE_AS_STRNCPY
> -	add	%rcx, %r8
> -	sbb	%rcx, %rcx
> -	or	%rcx, %r8
> -# endif
> -	mov	$VEC_SIZE, %rcx
> -	VMOVA	(%rsi, %rcx), %YMM2
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$(VEC_SIZE * 3), %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec2)
> +# ifdef USE_AS_STPCPY
> +#  define END_REG	rax
>  # else
> -	jnz	L(CopyVecSize)
> +#  define END_REG	rdi, %rdx, CHAR_SIZE
>  # endif
>  
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
> -	vpcmpb	$0, %YMM3, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec3)
> +# ifdef USE_AS_STRCAT
> +#  define PAGE_ALIGN_REG	edx
> +#  define PAGE_ALIGN_REG_64	rdx
>  # else
> -	jnz	L(CopyVecSize)
> +#  define PAGE_ALIGN_REG	eax
> +#  define PAGE_ALIGN_REG_64	rax
>  # endif
>  
> -	VMOVU	%YMM3, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
> -	vpcmpb	$0, %YMM4, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec4)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
> +# define VZERO	VMM(7)
> +# define VZERO_128	VMM_128(7)
>  
> -	VMOVU	%YMM4, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec2)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
>  
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec2)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
> +# define PAGE_SIZE	4096
> +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
>  
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -	vpcmpb	$0, %YMM3, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec3)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
>  
> -	VMOVU	%YMM3, (%rdi, %rcx)
> -	mov	%rsi, %rdx
> -	lea	VEC_SIZE(%rsi, %rcx), %rsi
> -	and	$-(VEC_SIZE * 4), %rsi
> -	sub	%rsi, %rdx
> -	sub	%rdx, %rdi
> -# ifdef USE_AS_STRNCPY
> -	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
> -# endif
> -L(UnalignedFourVecSizeLoop):
> -	VMOVA	(%rsi), %YMM4
> -	VMOVA	VEC_SIZE(%rsi), %YMM5
> -	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
> -	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
> -	vpminub	%YMM5, %YMM4, %YMM2
> -	vpminub	%YMM7, %YMM6, %YMM3
> -	vpminub	%YMM2, %YMM3, %YMM2
> -	/* If K7 != 0, there is a null byte.  */
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k7
> -	kmovd	%k7, %edx
> -# ifdef USE_AS_STRNCPY
> -	sub	$(VEC_SIZE * 4), %r8
> -	jbe	L(UnalignedLeaveCase2OrCase3)
> +	.section SECTION(.text), "ax", @progbits
> +ENTRY(STRCPY)
> +# ifdef USE_AS_STRCAT
> +	movq	%rdi, %rax
> +#  include "strcat-strlen-evex.h.S"
>  # endif
> -	test	%edx, %edx
> -	jnz	L(UnalignedFourVecSizeLeave)
> -
> -L(UnalignedFourVecSizeLoop_start):
> -	add	$(VEC_SIZE * 4), %rdi
> -	add	$(VEC_SIZE * 4), %rsi
> -	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
> -	VMOVA	(%rsi), %YMM4
> -	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
> -	VMOVA	VEC_SIZE(%rsi), %YMM5
> -	vpminub	%YMM5, %YMM4, %YMM2
> -	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
> -	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
> -	VMOVU	%YMM7, -VEC_SIZE(%rdi)
> -	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
> -	vpminub	%YMM7, %YMM6, %YMM3
> -	vpminub	%YMM2, %YMM3, %YMM2
> -	/* If K7 != 0, there is a null byte.  */
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k7
> -	kmovd	%k7, %edx
> -# ifdef USE_AS_STRNCPY
> -	sub	$(VEC_SIZE * 4), %r8
> -	jbe	L(UnalignedLeaveCase2OrCase3)
> +
> +	movl	%esi, %PAGE_ALIGN_REG
> +	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> +	ja	L(page_cross)
> +L(page_cross_continue):
> +	VMOVU	(%rsi), %VMM(0)
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +	movq	%rdi, %rax
>  # endif
> -	test	%edx, %edx
> -	jz	L(UnalignedFourVecSizeLoop_start)
>  
> -L(UnalignedFourVecSizeLeave):
> -	vpcmpb	$0, %YMM4, %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeUnaligned_0)
>  
> -	vpcmpb	$0, %YMM5, %YMMZERO, %k2
> -	kmovd	%k2, %ecx
> -	test	%ecx, %ecx
> -	jnz	L(CopyVecSizeUnaligned_16)
> +	/* Two short string implementations. One with a traditional
> +	   branching approach and one with masked instructions (which
> +	   have the potential for dramatically bad perf if dst splits a
> +	   page and is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +	VPTEST	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +#  ifdef USE_AS_WCSCPY
> +	subl	$((1 << CHAR_PER_VEC)- 1), %VRCX
> +#  else
> +	inc	%VRCX
> +#  endif
> +	jz	L(more_1x_vec)
> +	KMOV	%VRCX, %k1
> +	KXOR	%k0, %k1, %k1
>  
> -	vpcmpb	$0, %YMM6, %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeUnaligned_32)
> -
> -	vpcmpb	$0, %YMM7, %YMMZERO, %k4
> -	kmovd	%k4, %ecx
> -	bsf	%ecx, %edx
> -	VMOVU	%YMM4, (%rdi)
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
> -# endif
> -	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
> -	add	$(VEC_SIZE - 1), %r8
> -	sub	%rdx, %r8
> -	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	add	$(VEC_SIZE * 3), %rsi
> -	add	$(VEC_SIZE * 3), %rdi
> -	jmp	L(CopyVecSizeExit)
> -# endif
> +	VMOVU_MASK %VMM(0), (%rdi){%k1}
>  
> -/* If source address alignment == destination address alignment */
> +#  ifdef USE_AS_STPCPY
> +	bsf	%VRCX, %VRCX
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
> +#  endif
> +	ret
>  
> -L(SourceStringAlignmentLessTwoVecSize):
> -	VMOVU	(%rsi), %YMM3
> -	VMOVU	VEC_SIZE(%rsi), %YMM2
> -	vpcmpb	$0, %YMM3, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> +# else
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jz	L(more_1x_vec)
>  
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -	cmp	$VEC_SIZE, %r8
> +	xorl	%edx, %edx
> +	bsf	%VRCX, %VRDX
> +#  ifdef USE_AS_STPCPY
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#  endif
> +
> +	/* Use mask bits in rcx to detect which copy we need. If the low
> +	   mask is zero then there must be a bit set in the upper half.
> +	   I.e. if rcx != 0 and ecx == 0, then the match must be in the
> +	   upper 32 bits so we use L(copy_32_63).  */
> +#  if VEC_SIZE == 64
> +#   ifdef USE_AS_WCSCPY
> +	testb	%cl, %cl
> +#   else
> +	testl	%ecx, %ecx
> +#   endif
> +	jz	L(copy_32_63)
> +#  endif
> +
> +#  ifdef USE_AS_WCSCPY
> +	testb	$0xf, %cl
>  #  else
> -	cmp	$(VEC_SIZE + 1), %r8
> +	testw	%cx, %cx
>  #  endif
> -	jbe	L(CopyVecSizeTail1Case2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeTail1)
> +	jz	L(copy_16_31)
>  
> -	VMOVU	%YMM3, (%rdi)
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k0
> -	kmovd	%k0, %edx
>  
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -	cmp	$(VEC_SIZE * 2), %r8
> +#  ifdef USE_AS_WCSCPY
> +	testb	$0x3, %cl
>  #  else
> -	cmp	$((VEC_SIZE * 2) + 1), %r8
> +	testb	%cl, %cl
>  #  endif
> -	jbe	L(CopyTwoVecSize1Case2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyTwoVecSize1)
> -
> -	and	$-VEC_SIZE, %rsi
> -	and	$(VEC_SIZE - 1), %ecx
> -	jmp	L(UnalignVecSizeBoth)
> +	jz	L(copy_8_15)
>  
> -/*------End of main part with loops---------------------*/
>  
> -/* Case1 */
> +#  ifdef USE_AS_WCSCPY
> +	vmovd	%VMM_128(0), (%rdi)
> +	/* No need to copy, we know it's zero.  */
> +	movl	$0, (%END_REG)
>  
> -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> -	.p2align 4
> -L(CopyVecSize):
> -	add	%rcx, %rdi
> -# endif
> -L(CopyVecSizeTail):
> -	add	%rcx, %rsi
> -L(CopyVecSizeTail1):
> -	bsf	%edx, %edx
> -L(CopyVecSizeExit):
> -	cmp	$32, %edx
> -	jae	L(Exit32_63)
> -	cmp	$16, %edx
> -	jae	L(Exit16_31)
> -	cmp	$8, %edx
> -	jae	L(Exit8_15)
> -	cmp	$4, %edx
> -	jae	L(Exit4_7)
> -	cmp	$3, %edx
> -	je	L(Exit3)
> -	cmp	$1, %edx
> -	ja	L(Exit2)
> -	je	L(Exit1)
> -	movb	$0, (%rdi)
> -# ifdef USE_AS_STPCPY
> -	lea	(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$1, %r8
> -	lea	1(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> -# endif
>  	ret
> +#  else
>  
> -	.p2align 4
> -L(CopyTwoVecSize1):
> -	add	$VEC_SIZE, %rsi
> -	add	$VEC_SIZE, %rdi
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$VEC_SIZE, %r8
> -# endif
> -	jmp	L(CopyVecSizeTail1)
> -
> -	.p2align 4
> -L(CopyTwoVecSize):
> -	bsf	%edx, %edx
> -	add	%rcx, %rsi
> -	add	$VEC_SIZE, %edx
> -	sub	%ecx, %edx
> -	jmp	L(CopyVecSizeExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnaligned_0):
> -	bsf	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -	VMOVU	%YMM4, (%rdi)
> -	add	$((VEC_SIZE * 4) - 1), %r8
> -	sub	%rdx, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	jmp	L(CopyVecSizeExit)
> -# endif
> +	testb	$0x7, %cl
> +	jz	L(copy_4_7)
>  
> -	.p2align 4
> -L(CopyVecSizeUnaligned_16):
> -	bsf	%ecx, %edx
> -	VMOVU	%YMM4, (%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	VEC_SIZE(%rdi, %rdx), %rax
> -# endif
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -	add	$((VEC_SIZE * 3) - 1), %r8
> -	sub	%rdx, %r8
> -	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	add	$VEC_SIZE, %rsi
> -	add	$VEC_SIZE, %rdi
> -	jmp	L(CopyVecSizeExit)
> -# endif
>  
> -	.p2align 4
> -L(CopyVecSizeUnaligned_32):
> -	bsf	%edx, %edx
> -	VMOVU	%YMM4, (%rdi)
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
> -# endif
> -	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
> -	add	$((VEC_SIZE * 2) - 1), %r8
> -	sub	%rdx, %r8
> -	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	add	$(VEC_SIZE * 2), %rsi
> -	add	$(VEC_SIZE * 2), %rdi
> -	jmp	L(CopyVecSizeExit)
> -# endif
> +	test	%edx, %edx
> +	jz	L(set_null_term)
>  
> -# ifdef USE_AS_STRNCPY
> -#  ifndef USE_AS_STRCAT
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec6):
> -	VMOVU	%YMM6, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec5):
> -	VMOVU	%YMM5, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec4):
> -	VMOVU	%YMM4, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec3):
> -	VMOVU	%YMM3, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> +	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
> +	 */
> +	vmovd	%VMM_128(0), %esi
> +	movw	%si, (%rdi)
> +
> +	.p2align 4,, 1
> +L(set_null_term):
> +	/* No need to copy, we know it's zero.  */
> +	movb	$0, (%END_REG)
> +	ret
>  #  endif
>  
> -/* Case2 */
> -
> -	.p2align 4
> -L(CopyVecSizeCase2):
> -	add	$VEC_SIZE, %r8
> -	add	%rcx, %rdi
> -	add	%rcx, %rsi
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyTwoVecSizeCase2):
> -	add	%rcx, %rsi
> -	bsf	%edx, %edx
> -	add	$VEC_SIZE, %edx
> -	sub	%ecx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -L(CopyVecSizeTailCase2):
> -	add	%rcx, %rsi
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -L(CopyVecSizeTail1Case2):
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -/* Case2 or Case3,  Case3 */
> -
> -	.p2align 4
> -L(CopyVecSizeCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyVecSizeCase2)
> -L(CopyVecSizeCase3):
> -	add	$VEC_SIZE, %r8
> -	add	%rcx, %rdi
> -	add	%rcx, %rsi
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyTwoVecSizeCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyTwoVecSizeCase2)
> -	add	%rcx, %rsi
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyVecSizeTailCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyVecSizeTailCase2)
> -	add	%rcx, %rsi
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyTwoVecSize1Case2OrCase3):
> -	add	$VEC_SIZE, %rdi
> -	add	$VEC_SIZE, %rsi
> -	sub	$VEC_SIZE, %r8
> -L(CopyVecSizeTail1Case2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyVecSizeTail1Case2)
> -	jmp	L(StrncpyExit)
> +#  if VEC_SIZE == 64
> +	.p2align 4,, 6
> +L(copy_32_63):
> +	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +	VMOVU	%VMM_256(0), (%rdi)
> +	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
> +	ret
> +#  endif
> +
> +
> +	.p2align 4,, 6
> +L(copy_16_31):
> +	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
> +	   and will save code size.  */
> +	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +	VMOVU	%VMM_128(0), (%rdi)
> +	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
> +	ret
> +
> +	.p2align 4,, 8
> +L(copy_8_15):
> +#  ifdef USE_AS_WCSCPY
> +	movl	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> +#  else
> +	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
> +#  endif
> +	vmovq	%VMM_128(0), (%rdi)
> +	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
> +	ret
>  # endif
>  
> -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
>  
> -	.p2align 4
> -L(Exit1):
> -	movzwl	(%rsi), %edx
> -	mov	%dx, (%rdi)
> -# ifdef USE_AS_STPCPY
> -	lea	1(%rdi), %rax
> +# ifndef USE_AS_WCSCPY
> +	.p2align 4,, 12
> +L(copy_4_7):
> +	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> +	vmovd	%VMM_128(0), (%rdi)
> +	movl	%ecx, -(4 - CHAR_SIZE)(%END_REG)
> +	ret
>  # endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$2, %r8
> -	lea	2(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +
> +
> +	.p2align 4,, 8
> +L(more_1x_vec):
> +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> +	VMOVU	%VMM(0), (%rdi)
>  # endif
> -	ret
> +	subq	%rsi, %rdi
> +	andq	$-(VEC_SIZE), %rsi
> +	addq	%rsi, %rdi
> +	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
>  
> -	.p2align 4
> -L(Exit2):
> -	movzwl	(%rsi), %ecx
> -	mov	%cx, (%rdi)
> -	movb	$0, 2(%rdi)
> +	/* Ideally we store after moves to minimize impact of potential
> +	   false-dependencies.  */
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +	VMOVU	%VMM(0), (%rax)
> +# endif
> +
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x1)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
> +	VMOVU	%VMM(1), VEC_SIZE(%rdi)
> +
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x2)
> +
> +	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
> +
> +	VPTESTN	%VMM(3), %VMM(3), %k0
> +	KMOV	%k0, %VRDX
> +	test	%VRDX, %VRDX
> +	jnz	L(ret_vec_x3)
> +
> +	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	VPTESTN	%VMM(4), %VMM(4), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x4)
> +
> +	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
> +
> +
> +	/* Align for 4x loop.  */
> +	subq	%rsi, %rdi
> +
> +	/* + VEC_SIZE * 5 because we never added the original VEC_SIZE
> +	   we covered before aligning.  */
> +	subq	$-(VEC_SIZE * 5), %rsi
> +	andq	$-(VEC_SIZE * 4), %rsi
> +
> +
> +	/* Load first half of the loop before entry.  */
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +	KORTEST	%k2, %k4
> +	jnz	L(loop_4x_done)
> +
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +
> +	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +
> +	subq	$(VEC_SIZE * -4), %rsi
> +
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +	KORTEST	%k2, %k4
> +	jz	L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	/* Restore rdi (dst).  */
> +	addq	%rsi, %rdi
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x0_end)
> +	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +
> +	KMOV	%k2, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x1)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x2)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +	/* Place L(ret_vec_x4) here to save code size.  We get a
> +	   meaningful benefit doing this for stpcpy.  */
> +	KMOV	%k4, %VRDX
> +L(ret_vec_x3):
> +	bsf	%VRDX, %VRDX
> +	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -	lea	2(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$3, %r8
> -	lea	3(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	(VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
>  # endif
> +L(return_end):
>  	ret
>  
> -	.p2align 4
> -L(Exit3):
> -	mov	(%rsi), %edx
> -	mov	%edx, (%rdi)
> +	.p2align 4,, 6
> +L(ret_vec_x0_end):
> +	bsf	%VRCX, %VRCX
>  # ifdef USE_AS_STPCPY
> -	lea	3(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$4, %r8
> -	lea	4(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
> +	inc	%VRCX
> +	VMOVU	(-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  	ret
>  
> -	.p2align 4
> -L(Exit4_7):
> -	mov	(%rsi), %ecx
> -	mov	%ecx, (%rdi)
> -	mov	-3(%rsi, %rdx), %ecx
> -	mov	%ecx, -3(%rdi, %rdx)
> +	.p2align 4,, 8
> +L(ret_vec_x1):
> +	bsf	%VRCX, %VRCX
> +	VMOVU	(VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	%rdx, %r8
> -	sub	$1, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>  	ret
>  
> -	.p2align 4
> -L(Exit8_15):
> -	mov	(%rsi), %rcx
> -	mov	-7(%rsi, %rdx), %r9
> -	mov	%rcx, (%rdi)
> -	mov	%r9, -7(%rdi, %rdx)
> +	.p2align 4,, 4
> +L(ret_vec_x2):
> +	bsf	%VRCX, %VRCX
> +	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	%rdx, %r8
> -	sub	$1, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>  	ret
>  
> -	.p2align 4
> -L(Exit16_31):
> -	VMOVU	(%rsi), %XMM2
> -	VMOVU	-15(%rsi, %rdx), %XMM3
> -	VMOVU	%XMM2, (%rdi)
> -	VMOVU	%XMM3, -15(%rdi, %rdx)
> +	/* ret_vec_x3 reuses return code after the loop.  */
> +	.p2align 4,, 6
> +L(ret_vec_x4):
> +	bsf	%VRCX, %VRCX
> +	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub %rdx, %r8
> -	sub $1, %r8
> -	lea 1(%rdi, %rdx), %rdi
> -	jnz L(StrncpyFillTailWithZero)
> +	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>  	ret
>  
> -	.p2align 4
> -L(Exit32_63):
> -	VMOVU	(%rsi), %YMM2
> -	VMOVU	-31(%rsi, %rdx), %YMM3
> -	VMOVU	%YMM2, (%rdi)
> -	VMOVU	%YMM3, -31(%rdi, %rdx)
> -# ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> +
> +	.p2align 4,, 4
> +L(page_cross):
> +# ifndef USE_AS_STRCAT
> +	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
>  # endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	%rdx, %r8
> -	sub	$1, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	movq	%rsi, %rcx
> +	andq	$(VEC_SIZE * -1), %rcx
> +
> +	VPCMPEQ	(%rcx), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +# ifdef USE_AS_WCSCPY
> +	andl	$(VEC_SIZE - 1), %PAGE_ALIGN_REG
> +	shrl	$2, %PAGE_ALIGN_REG
>  # endif
> -	ret
> +	shrx	%VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
>  
> -# ifdef USE_AS_STRNCPY
> +# if USE_MOVSB_IN_PAGE_CROSS
> +	/* Optimizing more aggressively for space as this is very cold
> +	   code. This saves 2x cache lines.  */
>  
> -	.p2align 4
> -L(StrncpyExit1):
> -	movzbl	(%rsi), %edx
> -	mov	%dl, (%rdi)
> -#  ifdef USE_AS_STPCPY
> -	lea	1(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, 1(%rdi)
> +	/* This adds one to the later result, which will give the
> +	   correct copy bounds. NB: this can never zero-out a non-zero
> +	   RCX since, to be in the page-cross case, rsi cannot be
> +	   aligned and we already right-shift rcx by the misalignment.  */
> +	shl	%VRCX
> +	jz	L(page_cross_continue)
> +#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +	movq	%rdi, %rax
>  #  endif
> -	ret
> +	bsf	%VRCX, %VRCX
> +	REP_MOVS
>  
> -	.p2align 4
> -L(StrncpyExit2):
> -	movzwl	(%rsi), %edx
> -	mov	%dx, (%rdi)
>  #  ifdef USE_AS_STPCPY
> -	lea	2(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, 2(%rdi)
> +	leaq	-CHAR_SIZE(%rdi), %rax
>  #  endif
>  	ret
>  
> -	.p2align 4
> -L(StrncpyExit3_4):
> -	movzwl	(%rsi), %ecx
> -	movzwl	-2(%rsi, %r8), %edx
> -	mov	%cx, (%rdi)
> -	mov	%dx, -2(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	ret
>  
> -	.p2align 4
> -L(StrncpyExit5_8):
> -	mov	(%rsi), %ecx
> -	mov	-4(%rsi, %r8), %edx
> -	mov	%ecx, (%rdi)
> -	mov	%edx, -4(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	ret
> +# else
> +	/* Check if we found zero-char before end of page.  */
> +	test	%VRCX, %VRCX
> +	jz	L(page_cross_continue)
>  
> -	.p2align 4
> -L(StrncpyExit9_16):
> -	mov	(%rsi), %rcx
> -	mov	-8(%rsi, %r8), %rdx
> -	mov	%rcx, (%rdi)
> -	mov	%rdx, -8(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	ret
> +	/* Traditional copy case, essentially the same as the non-page-
> +	   cross case but since we can't reuse VMM(0) we need twice as
> +	   many loads from rsi.  */
>  
> -	.p2align 4
> -L(StrncpyExit17_32):
> -	VMOVU	(%rsi), %XMM2
> -	VMOVU	-16(%rsi, %r8), %XMM3
> -	VMOVU	%XMM2, (%rdi)
> -	VMOVU	%XMM3, -16(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> +#  ifndef USE_AS_STRCAT
> +	xorl	%edx, %edx
>  #  endif
> -	ret
> -
> -	.p2align 4
> -L(StrncpyExit33_64):
> -	/*  0/32, 31/16 */
> -	VMOVU	(%rsi), %YMM2
> -	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
> -	VMOVU	%YMM2, (%rdi)
> -	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
> +	/* Dependency on rdi must already have been satisfied.  */
> +	bsf	%VRCX, %VRDX
>  #  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#  elif !defined USE_AS_STRCAT
> +	movq	%rdi, %rax
>  #  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	ret
>  
> -	.p2align 4
> -L(StrncpyExit65):
> -	/* 0/32, 32/32, 64/1 */
> -	VMOVU	(%rsi), %YMM2
> -	VMOVU	32(%rsi), %YMM3
> -	mov	64(%rsi), %cl
> -	VMOVU	%YMM2, (%rdi)
> -	VMOVU	%YMM3, 32(%rdi)
> -	mov	%cl, 64(%rdi)
> -#  ifdef USE_AS_STPCPY
> -	lea	65(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, 65(%rdi)
> +#  if VEC_SIZE == 64
> +#   ifdef USE_AS_WCSCPY
> +	testb	%cl, %cl
> +#   else
> +	test	%ecx, %ecx
> +#   endif
> +	jz	L(page_cross_copy_32_63)
>  #  endif
> -	ret
> -
> -#  ifndef USE_AS_STRCAT
>  
> -	.p2align 4
> -L(Fill1):
> -	mov	%dl, (%rdi)
> -	ret
> +#  ifdef USE_AS_WCSCPY
> +	testb	$0xf, %cl
> +#  else
> +	testw	%cx, %cx
> +#  endif
> +	jz	L(page_cross_copy_16_31)
>  
> -	.p2align 4
> -L(Fill2):
> -	mov	%dx, (%rdi)
> -	ret
> +#  ifdef USE_AS_WCSCPY
> +	testb	$0x3, %cl
> +#  else
> +	testb	%cl, %cl
> +#  endif
> +	jz	L(page_cross_copy_8_15)
>  
> -	.p2align 4
> -L(Fill3_4):
> -	mov	%dx, (%rdi)
> -	mov     %dx, -2(%rdi, %r8)
> +#  ifdef USE_AS_WCSCPY
> +	movl	(%rsi), %esi
> +	movl	%esi, (%rdi)
> +	movl	$0, (%END_REG)
>  	ret
> +#  else
>  
> -	.p2align 4
> -L(Fill5_8):
> -	mov	%edx, (%rdi)
> -	mov     %edx, -4(%rdi, %r8)
> -	ret
> +	testb	$0x7, %cl
> +	jz	L(page_cross_copy_4_7)
>  
> -	.p2align 4
> -L(Fill9_16):
> -	mov	%rdx, (%rdi)
> -	mov	%rdx, -8(%rdi, %r8)
> +	test	%edx, %edx
> +	jz	L(page_cross_set_null_term)
> +	movzwl	(%rsi), %ecx
> +	movw	%cx, (%rdi)
> +L(page_cross_set_null_term):
> +	movb	$0, (%END_REG)
>  	ret
>  
> -	.p2align 4
> -L(Fill17_32):
> -	VMOVU	%XMMZERO, (%rdi)
> -	VMOVU	%XMMZERO, -16(%rdi, %r8)
> -	ret
>  
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec2):
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -
> -	.p2align 4
> -L(CopyVecSizeVecExit):
> -	bsf	%edx, %edx
> -	add	$(VEC_SIZE - 1), %r8
> -	add	%rcx, %rdi
> -#   ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -#   endif
> -	sub	%rdx, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -
> -	.p2align 4
> -L(StrncpyFillTailWithZero):
> -	xor	%edx, %edx
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(StrncpyFillExit)
> -
> -	VMOVU	%YMMZERO, (%rdi)
> -	add	$VEC_SIZE, %rdi
> -
> -	mov	%rdi, %rsi
> -	and	$(VEC_SIZE - 1), %esi
> -	sub	%rsi, %rdi
> -	add	%rsi, %r8
> -	sub	$(VEC_SIZE * 4), %r8
> -	jb	L(StrncpyFillLessFourVecSize)
> -
> -L(StrncpyFillLoopVmovdqa):
> -	VMOVA	%YMMZERO, (%rdi)
> -	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
> -	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
> -	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
> -	add	$(VEC_SIZE * 4), %rdi
> -	sub	$(VEC_SIZE * 4), %r8
> -	jae	L(StrncpyFillLoopVmovdqa)
> -
> -L(StrncpyFillLessFourVecSize):
> -	add	$(VEC_SIZE * 2), %r8
> -	jl	L(StrncpyFillLessTwoVecSize)
> -	VMOVA	%YMMZERO, (%rdi)
> -	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
> -	add	$(VEC_SIZE * 2), %rdi
> -	sub	$VEC_SIZE, %r8
> -	jl	L(StrncpyFillExit)
> -	VMOVA	%YMMZERO, (%rdi)
> -	add	$VEC_SIZE, %rdi
> -	jmp	L(Fill)
> -
> -	.p2align 4
> -L(StrncpyFillLessTwoVecSize):
> -	add	$VEC_SIZE, %r8
> -	jl	L(StrncpyFillExit)
> -	VMOVA	%YMMZERO, (%rdi)
> -	add	$VEC_SIZE, %rdi
> -	jmp	L(Fill)
> -
> -	.p2align 4
> -L(StrncpyFillExit):
> -	add	$VEC_SIZE, %r8
> -L(Fill):
> -	cmp	$17, %r8d
> -	jae	L(Fill17_32)
> -	cmp	$9, %r8d
> -	jae	L(Fill9_16)
> -	cmp	$5, %r8d
> -	jae	L(Fill5_8)
> -	cmp	$3, %r8d
> -	jae	L(Fill3_4)
> -	cmp	$1, %r8d
> -	ja	L(Fill2)
> -	je	L(Fill1)
> +	.p2align 4,, 4
> +L(page_cross_copy_4_7):
> +	movl	(%rsi), %ecx
> +	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
> +	movl	%ecx, (%rdi)
> +	movl	%esi, -(4 - CHAR_SIZE)(%END_REG)
>  	ret
> -
> -/* end of ifndef USE_AS_STRCAT */
>  #  endif
>  
> -	.p2align 4
> -L(UnalignedLeaveCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(UnalignedFourVecSizeLeaveCase2)
> -L(UnalignedFourVecSizeLeaveCase3):
> -	lea	(VEC_SIZE * 4)(%r8), %rcx
> -	and	$-VEC_SIZE, %rcx
> -	add	$(VEC_SIZE * 3), %r8
> -	jl	L(CopyVecSizeCase3)
> -	VMOVU	%YMM4, (%rdi)
> -	sub	$VEC_SIZE, %r8
> -	jb	L(CopyVecSizeCase3)
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -	sub	$VEC_SIZE, %r8
> -	jb	L(CopyVecSizeCase3)
> -	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
> -	sub	$VEC_SIZE, %r8
> -	jb	L(CopyVecSizeCase3)
> -	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
> -#  ifdef USE_AS_STPCPY
> -	lea	(VEC_SIZE * 4)(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (VEC_SIZE * 4)(%rdi)
> -#  endif
> +#  if VEC_SIZE == 64
> +	.p2align 4,, 4
> +L(page_cross_copy_32_63):
> +	VMOVU	(%rsi), %VMM_256(0)
> +	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +	VMOVU	%VMM_256(0), (%rdi)
> +	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
>  	ret
> -
> -	.p2align 4
> -L(UnalignedFourVecSizeLeaveCase2):
> -	xor	%ecx, %ecx
> -	vpcmpb	$0, %YMM4, %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	add	$(VEC_SIZE * 3), %r8
> -	jle	L(CopyVecSizeCase2OrCase3)
> -	test	%edx, %edx
> -#  ifndef USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec4)
> -#  else
> -	jnz	L(CopyVecSize)
> -#  endif
> -	vpcmpb	$0, %YMM5, %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	VMOVU	%YMM4, (%rdi)
> -	add	$VEC_SIZE, %rcx
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -	test	%edx, %edx
> -#  ifndef USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec5)
> -#  else
> -	jnz	L(CopyVecSize)
>  #  endif
>  
> -	vpcmpb	$0, %YMM6, %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -	add	$VEC_SIZE, %rcx
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -	test	%edx, %edx
> -#  ifndef USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec6)
> -#  else
> -	jnz	L(CopyVecSize)
> -#  endif
> -
> -	vpcmpb	$0, %YMM7, %YMMZERO, %k4
> -	kmovd	%k4, %edx
> -	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
> -	lea	VEC_SIZE(%rdi, %rcx), %rdi
> -	lea	VEC_SIZE(%rsi, %rcx), %rsi
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -L(StrncpyExit):
> -	cmp	$65, %r8d
> -	je	L(StrncpyExit65)
> -	cmp	$33, %r8d
> -	jae	L(StrncpyExit33_64)
> -	cmp	$17, %r8d
> -	jae	L(StrncpyExit17_32)
> -	cmp	$9, %r8d
> -	jae	L(StrncpyExit9_16)
> -	cmp	$5, %r8d
> -	jae	L(StrncpyExit5_8)
> -	cmp	$3, %r8d
> -	jae	L(StrncpyExit3_4)
> -	cmp	$1, %r8d
> -	ja	L(StrncpyExit2)
> -	je	L(StrncpyExit1)
> -#  ifdef USE_AS_STPCPY
> -	mov	%rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi)
> -#  endif
> +	.p2align 4,, 4
> +L(page_cross_copy_16_31):
> +	vmovdqu	(%rsi), %xmm0
> +	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +	vmovdqu	%xmm0, (%rdi)
> +	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
>  	ret
>  
> -	.p2align 4
> -L(ExitZero):
> -#  ifndef USE_AS_STRCAT
> -	mov	%rdi, %rax
> -#  endif
> +	.p2align 4,, 4
> +L(page_cross_copy_8_15):
> +	movq	(%rsi), %rcx
> +	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> +	movq	%rcx, (%rdi)
> +	movq	%rsi, -(8 - CHAR_SIZE)(%END_REG)
>  	ret
> -
> -# endif
> -
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# else
> -END (STRCAT)
>  # endif
> +END(STRCPY)
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
> index 203a19bf21..9aed2d9970 100644
> --- a/sysdeps/x86_64/multiarch/strncat-evex.S
> +++ b/sysdeps/x86_64/multiarch/strncat-evex.S
> @@ -1,7 +1,520 @@
> -#ifndef STRNCAT
> -# define STRNCAT	__strncat_evex
> -#endif
> +/* {wcs|str}ncat  with 256/512-bit EVEX.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +	/* Use evex-masked stores for small sizes. Turned off at the
> +	   moment.  */
> +# define USE_EVEX_MASKED_STORE	0
> +
> +# include <sysdep.h>
> +
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +# ifndef STRNCAT
> +#  define STRNCAT	__strncat_evex
> +# endif
> +
> +
> +# ifdef USE_AS_WCSCPY
> +#  define movNULL	movl
> +#  define VMOVU_MASK	vmovdqu32
> +#  define VPMIN	vpminud
> +#  define VPTESTN	vptestnmd
> +#  define VPTEST	vptestmd
> +#  define VPCMPEQ	vpcmpeqd
> +#  define CHAR_SIZE	4
> +
> +#  define REP_MOVS	rep movsd
> +
> +#  define VMASK_REG	VR10
> +#  define FIND_FIRST_ONE(src, dst)	movl $CHAR_PER_VEC, %dst; bsf %src, %dst
> +
> +#  define USE_WIDE_CHAR
> +# else
> +#  define movNULL	movb
> +#  define VMOVU_MASK	vmovdqu8
> +#  define VPMIN	vpminub
> +#  define VPTESTN	vptestnmb
> +#  define VPTEST	vptestmb
> +#  define VPCMPEQ	vpcmpeqb
> +#  define CHAR_SIZE	1
> +
> +#  define REP_MOVS	rep movsb
> +
> +#  define VMASK_REG	VRCX
> +#  define FIND_FIRST_ONE(src, dst)	tzcnt %src, %dst
> +
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# include "reg-macros.h"
> +
> +
> +# define VZERO	VMM(7)
> +# define VZERO_128	VMM_128(7)
> +
> +# define PAGE_SIZE	4096
> +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
> +
> +	.section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCAT)
> +	movq	%rdi, %rax
> +
> +	/* NB: It's safe to filter out zero-length strings WITHOUT
> +	   setting null-term. Destination MUST be a null-terminated
> +	   string so essentially the work is already done.  */
> +# ifdef USE_AS_WCSCPY
> +	leaq	-1(%rdx), %rcx
> +	shrq	$56, %rcx
> +	jnz	L(zero_len)
> +# else
> +	test	%rdx, %rdx
> +	jle	L(zero_len)
> +# endif
> +
> +# include "strcat-strlen-evex.h.S"
> +
> +	movl	%esi, %ecx
> +	andl	$(PAGE_SIZE - 1), %ecx
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
> +	ja	L(page_cross)
> +L(page_cross_continue):
> +	VMOVU	(%rsi), %VMM(0)
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +
> +	/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> +	   <= CHAR_PER_VEC with masked instructions (which have
> +	   potential for dramatically bad perf if dst splits a page and
> +	   is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +	KMOV	%k0, %VRCX
> +	FIND_FIRST_ONE (VRCX, VR8)
> +	cmpq	%r8, %rdx
> +	jbe	L(less_1x_vec)
> +
> +	test	%VRCX, %VRCX
> +	jz	L(more_1x_vec)
> +
> +	blsmsk	%VRCX, %VRCX
> +	KMOV	%VRCX, %k1
> +	VMOVU_MASK %VMM(0), (%rdi){%k1}
> +	ret
> +
> +L(less_1x_vec):
> +	mov	$-1, %VRCX
> +	bzhi	%VRDX, %VRCX, %VRCX
> +	KMOV	%VRCX, %k1
> +	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
> +	VMOVU_MASK %VMM(0), (%rdi){%k1}
> +
> +	ret
> +# else
> +	KMOV	%k0, %VMASK_REG
> +	/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
> +	   %VMASK_REG, %VRCX` for wcsncat.  */
> +	FIND_FIRST_ONE (VMASK_REG, VRCX)
> +	cmpq	%rcx, %rdx
> +	jbe	L(less_1x_vec)
> +
> +	/* If there were no zero-CHARs (rcx was zero before
> +	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +	cmpl	$CHAR_PER_VEC, %ecx
> +	je	L(more_1x_vec)
> +
> +	movl	%ecx, %edx
> +
> +L(less_1x_vec):
> +#  if VEC_SIZE == 64
> +	cmpl	$(32 / CHAR_SIZE), %edx
> +	jae	L(copy_32_63)
> +#  endif
> +
> +	cmpl	$(16 / CHAR_SIZE), %edx
> +	jae	L(copy_16_31)
> +
> +
> +	cmpl	$(8 / CHAR_SIZE), %edx
> +	jae	L(copy_8_15)
> +
> +#  ifdef USE_AS_WCSCPY
> +	vmovd	%VMM_128(0), (%rdi)
> +	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  else
> +
> +	cmpl	$4, %edx
> +	jae	L(copy_4_7)
> +
> +	movzbl	(%rsi), %ecx
> +	cmpl	$1, %edx
> +	jbe	L(set_null_term)
> +
> +	movzwl	1(%rsi), %esi
> +	movw	%si, 1(%rdi)
> +
> +	.p2align 4,, 1
> +L(set_null_term):
> +	movb	%cl, (%rdi)
> +	movNULL	$0, (%rdi, %rdx)
> +	ret
> +#  endif
> +
> +#  if VEC_SIZE == 64
> +	.p2align 4,, 6
> +L(copy_32_63):
> +	VMOVU	-(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +	VMOVU	%VMM_256(0), (%rdi)
> +	VMOVU	%VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
> +	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  endif
> +	.p2align 4,, 6
> +L(copy_16_31):
> +	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
> +	   and will save code size.  */
> +	vmovdqu	-(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +	VMOVU	%VMM_128(0), (%rdi)
> +	vmovdqu	%xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
> +	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +
> +	.p2align 4,, 2
> +L(copy_8_15):
> +	movq	-(8)(%rsi, %rdx, CHAR_SIZE), %rcx
> +	vmovq	%VMM_128(0), (%rdi)
> +	movq	%rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
> +	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +
> +#  ifndef USE_AS_WCSCPY
> +	.p2align 4,, 12
> +L(copy_4_7):
> +	movl	-(4)(%rsi, %rdx, CHAR_SIZE), %ecx
> +	vmovd	%VMM_128(0), (%rdi)
> +	movl	%ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
> +	movNULL	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  endif
> +
> +# endif
> +	.p2align 4,, 4
> +L(zero_len):
> +# ifdef USE_AS_WCSCPY
> +	test	%rdx, %rdx
> +# endif
> +	jne	OVERFLOW_STRCAT
> +	ret
>  
> -#define USE_AS_STRNCAT
> -#define STRCAT	STRNCAT
> -#include "strcat-evex.S"
> +	.p2align 4,, 8
> +L(more_1x_vec):
> +	VMOVU	%VMM(0), (%rdi)
> +
> +	/* We are going to align rsi here so will need to be able to re-
> +	   adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> +	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
> +
> +	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> +	subq	%rsi, %rdi
> +	andq	$-(VEC_SIZE), %rsi
> +L(loop_last_4x_vec):
> +	addq	%rsi, %rdi
> +	subq	%rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +	shrq	$2, %rdx
> +# endif
> +
> +	/* Will need this regardless.  */
> +	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VMASK_REG
> +
> +	cmpq	$(CHAR_PER_VEC * 2), %rdx
> +	ja	L(more_2x_vec)
> +
> +L(last_2x_vec):
> +	FIND_FIRST_ONE (VMASK_REG, VRCX)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x1_len)
> +
> +	/* If there were no zero-CHARs (rcx was zero before
> +	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +	cmpl	$CHAR_PER_VEC, %ecx
> +	jne	L(ret_vec_x1)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	addl	$-CHAR_PER_VEC, %edx
> +	bzhi	%VRDX, %VRCX, %VR8
> +	jz	L(ret_vec_x2_len)
> +L(ret_vec_x2):
> +	bsf	%VRCX, %VRDX
> +L(ret_vec_x2_len):
> +	VMOVU	(VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	movNULL	$0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> +	VMOVU	%VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +	ret
> +
> +	.p2align 4,, 4
> +L(ret_vec_x1_len):
> +	movl	%edx, %ecx
> +L(ret_vec_x1):
> +	VMOVU	(VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	movNULL	$0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
> +	VMOVU	%VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +	VZEROUPPER_RETURN
> +
> +
> +	.p2align 4,, 8
> +L(last_4x_vec):
> +	addl	$-(CHAR_PER_VEC * 4), %edx
> +	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VMASK_REG
> +	subq	$-(VEC_SIZE * 4), %rsi
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpl	$(CHAR_PER_VEC * 2), %edx
> +	jbe	L(last_2x_vec)
> +	.p2align 4,, 8
> +L(more_2x_vec):
> +# ifdef USE_AS_WCSCPY
> +	xorl	%ecx, %ecx
> +# endif
> +	bsf	%VMASK_REG, %VRCX
> +	jnz	L(ret_vec_x1)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x2)
> +
> +	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
> +	VPTESTN	%VMM(3), %VMM(3), %k0
> +	KMOV	%k0, %VMASK_REG
> +
> +	cmpq	$(CHAR_PER_VEC * 4), %rdx
> +	ja	L(more_4x_vec)
> +
> +	/* Adjust length before going to L(ret_vec_x3_len) or
> +	   L(ret_vec_x3).  */
> +	addl	$(CHAR_PER_VEC * -2), %edx
> +
> +	FIND_FIRST_ONE (VMASK_REG, VRCX)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x3_len)
> +
> +	/* If there were no zero-CHARs (rcx was zero before
> +	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +	cmpl	$CHAR_PER_VEC, %ecx
> +	jne	L(ret_vec_x3)
> +
> +	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	VPTESTN	%VMM(4), %VMM(4), %k0
> +	KMOV	%k0, %VRCX
> +	addl	$-CHAR_PER_VEC, %edx
> +	bzhi	%VRDX, %VRCX, %VR8
> +	jz	L(ret_vec_x4_len)
> +L(ret_vec_x4):
> +	bsf	%VRCX, %VRDX
> +L(ret_vec_x4_len):
> +	VMOVU	(VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	movNULL	$0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
> +	VMOVU	%VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +	ret
> +
> +	.p2align 4,, 4
> +L(ret_vec_x3_len):
> +	movl	%edx, %ecx
> +L(ret_vec_x3):
> +	VMOVU	(VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	movNULL	$0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> +	VMOVU	%VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +	ret
> +
> +	.p2align 4,, 8
> +L(more_4x_vec):
> +# ifdef USE_AS_WCSCPY
> +	xorl	%ecx, %ecx
> +# endif
> +	bsf	%VMASK_REG, %VRCX
> +	jnz	L(ret_vec_x3)
> +
> +	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	VPTESTN	%VMM(4), %VMM(4), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x4)
> +
> +	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
> +
> +	/* Check if we are near the end before aligning.  */
> +	cmpq	$(CHAR_PER_VEC * 8), %rdx
> +	jbe	L(last_4x_vec)
> +
> +
> +	/* Add rsi to rdx (length) before aligning rsi. NB: Since we
> +	   filtered out huge lengths this cannot overflow.  */
> +# ifdef USE_AS_WCSCPY
> +	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
> +# else
> +	addq	%rsi, %rdx
> +# endif
> +
> +	/* Subtract rsi from rdi before aligning (add back will have
> +	   correct rdi for aligned rsi).  */
> +	subq	%rsi, %rdi
> +	subq	$-(VEC_SIZE * 5), %rsi
> +	andq	$(VEC_SIZE * -4), %rsi
> +
> +	/* Load first half of the loop before entry.  */
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +
> +	/* Offset rsi by VEC_SIZE so that we can jump to
> +	   L(loop_last_4x_vec).  */
> +	addq	$-(VEC_SIZE), %rsi
> +	KORTEST	%k2, %k4
> +	jnz	L(loop_4x_done)
> +
> +	/* Store loop end in r9.  */
> +	leaq	-(VEC_SIZE * 5)(%rdx), %r9
> +
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> +
> +	subq	$(VEC_SIZE * -4), %rsi
> +	cmpq	%rsi, %r9
> +	jbe	L(loop_last_4x_vec)
> +
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +	KORTEST	%k2, %k4
> +	jz	L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	/* Restore rdi (dst).  */
> +	addq	%rsi, %rdi
> +
> +	/* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
> +	   test with bsf.  */
> +	bsf	%VRCX, %VRCX
> +	jnz	L(ret_vec_x1)
> +	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
> +
> +	KMOV	%k2, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x2)
> +	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> +
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	bsf	%VRCX, %VRCX
> +	jnz	L(ret_vec_x3)
> +	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> +
> +	KMOV	%k4, %VRCX
> +	bsf	%VRCX, %VRCX
> +	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +	ret
> +
> +
> +	.p2align 4,, 4
> +L(page_cross):
> +	movq	%rsi, %r8
> +	andq	$(VEC_SIZE * -1), %r8
> +	VPCMPEQ	(%r8), %VZERO, %k0
> +
> +# ifdef USE_AS_WCSCPY
> +	KMOV	%k0, %VR9
> +	shrl	$2, %ecx
> +	andl	$(CHAR_PER_VEC - 1), %ecx
> +	shrx	%VRCX, %VR9, %VRCX
> +# else
> +	KMOV	%k0, %VRCX
> +	shrx	%VRSI, %VRCX, %VRCX
> +# endif
> +
> +	subl	%esi, %r8d
> +	andl	$(VEC_SIZE - 1), %r8d
> +# ifdef USE_AS_WCSCPY
> +	shrl	$2, %r8d
> +# endif
> +	cmpq	%r8, %rdx
> +	jbe	L(page_cross_small)
> +	/* Optimizing more for space as this is very cold code. This
> +	   saves 2x cache lines.  */
> +
> +	/* This adds one to the later result, which will give the
> +	   correct copy bounds. NB: this can never zero-out a non-zero
> +	   RCX since, to be in the page-cross case, rsi cannot be
> +	   aligned and we already right-shift rcx by the misalignment.  */
> +	shl	%VRCX
> +	jz	L(page_cross_continue)
> +	bsf	%VRCX, %VRCX
> +	REP_MOVS
> +	ret
> +
> +L(page_cross_small):
> +	tzcnt	%VRCX, %VRCX
> +	jz	L(page_cross_setz)
> +	cmpl	%edx, %ecx
> +	cmova	%edx, %ecx
> +
> +# ifdef USE_AS_WCSCPY
> +	rep	movsd
> +# else
> +	rep	movsb
> +# endif
> +L(page_cross_setz):
> +	movNULL	$0, (%rdi)
> +	ret
> +END(STRNCAT)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
> index 1b3426d511..49eaf4cbd9 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
> @@ -1,7 +1,990 @@
> -#ifndef STRNCPY
> -# define STRNCPY	__strncpy_evex
> -#endif
> +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +	/* Use evex-masked stores for small sizes. Turned off at the
> +	   moment.  */
> +# define USE_EVEX_MASKED_STORE	0
> +
> +
> +# include <sysdep.h>
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +
> +# ifndef STRNCPY
> +#  define STRNCPY	__strncpy_evex
> +# endif
> +
> +# ifdef USE_AS_WCSCPY
> +#  define VMOVU_MASK	vmovdqu32
> +#  define VPCMPEQ	vpcmpeqd
> +#  define VPMIN	vpminud
> +#  define VPTESTN	vptestnmd
> +#  define VPTEST	vptestmd
> +#  define CHAR_SIZE	4
> +
> +#  define REP_MOVS	rep movsd
> +#  define REP_STOS	rep stosl
> +
> +#  define USE_WIDE_CHAR
> +
> +# else
> +#  define VMOVU_MASK	vmovdqu8
> +#  define VPCMPEQ	vpcmpeqb
> +#  define VPMIN	vpminub
> +#  define VPTESTN	vptestnmb
> +#  define VPTEST	vptestmb
> +#  define CHAR_SIZE	1
> +
> +#  define REP_MOVS	rep movsb
> +#  define REP_STOS	rep stosb
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# define PAGE_SIZE	4096
> +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
> +
> +# include "reg-macros.h"
> +
> +
> +# define VZERO	VMM(7)
> +# define VZERO_256	VMM_256(7)
> +# define VZERO_128	VMM_128(7)
> +
> +# if VEC_SIZE == 64
> +#  define VZERO_HALF	VZERO_256
> +# else
> +#  define VZERO_HALF	VZERO_128
> +# endif
> +
> +	.section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCPY)
> +	/* Filter zero-length strings and very long strings.  Zero-length
> +	   strings just return; very long strings are handled by running
> +	   rep stos{b|l} to zero the destination (which will almost
> +	   certainly segfault) and, if that succeeds, then calling
> +	   OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
> +# ifdef USE_AS_WCSCPY
> +	decq	%rdx
> +	movq	%rdx, %rax
> +	/* 56 is end of max supported address space.  */
> +	shr	$56, %rax
> +	jnz	L(zero_len)
> +# else
> +	decq	%rdx
> +	/* If the flag needs to become `jb` replace `dec` with `sub`.
> +	 */
> +	jl	L(zero_len)
> +# endif
> +
> +	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
> +	movl	%esi, %eax
> +	andl	$(PAGE_SIZE - 1), %eax
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> +	ja	L(page_cross)
> +
> +L(page_cross_continue):
> +	VMOVU	(%rsi), %VMM(0)
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +
> +	/* If no STPCPY just save end ahead of time.  */
> +# ifndef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# endif
> +
> +
> +	cmpq	$(CHAR_PER_VEC), %rdx
> +
> +	/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> +	   <= CHAR_PER_VEC with masked instructions (which have
> +	   potential for dramatically bad perf if dst splits a page and
> +	   is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +	/* `jae` because length rdx is now length - 1.  */
> +	jae	L(more_1x_vec)
> +
> +	/* If there were multiple zero-CHAR matches in the first VEC,
> +	   VRCX will be overset but that's fine since any oversets were
> +	   at zero-positions anyway.  */
> +
> +#  ifdef USE_AS_STPCPY
> +	tzcnt	%VRCX, %VRAX
> +	cmpl	%eax, %edx
> +	cmovb	%edx, %eax
> +#   ifdef USE_AS_WCSCPY
> +	adcl	$0, %eax
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +#   else
> +	adcq	%rdi, %rax
> +#   endif
> +#  endif
> +	dec	%VRCX
> +
> +	/* Zero out all non-zero CHAR's after the first zero match.  */
> +	KMOV	%VRCX, %k1
> +
> +	/* Use VZERO as destination so this can be reused for
> +	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
> +	   will have zeroed out VZERO).  */
> +	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
> +L(zfill_less_vec):
> +	/* Get mask for what we need to set.  */
> +	incl	%edx
> +	mov	$-1, %VRCX
> +	bzhi	%VRDX, %VRCX, %VRCX
> +	KMOV	%VRCX, %k1
> +	VMOVU_MASK %VZERO, (%rdi){%k1}
> +	ret
> +
> +	.p2align 4,, 4
> +L(zero_len):
> +	cmpq	$-1, %rdx
> +	jne	L(best_effort_strncpy)
> +	movq	%rdi, %rax
> +	ret
> +
> +	.p2align 4,, 8
> +L(more_1x_vec):
> +# else
> +	/* `jb` because length rdx is now length - 1.  */
> +	jb	L(less_1x_vec)
> +# endif
> +
> +
> +	/* This may overset but that's fine because we still need to zero
> +	   fill.  */
> +	VMOVU	%VMM(0), (%rdi)
> +
> +
> +	/* Length must be >= CHAR_PER_VEC so a match here means we must
> +	   zero-fill.  */
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill)
> +
> +
> +	/* We are going to align rsi here so will need to be able to re-
> +	   adjust rdi/rdx afterwards. NB: We filtered out huge lengths
> +	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
> +	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> +	subq	%rsi, %rdi
> +	andq	$-(VEC_SIZE), %rsi
> +
> +L(loop_last_4x_vec):
> +	addq	%rsi, %rdi
> +	subq	%rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +	shrq	$2, %rdx
> +# endif
> +
> +	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VRCX
> +
> +	/* -1 because of the `dec %rdx` earlier.  */
> +	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
> +	ja	L(more_2x_vec)
> +
> +L(last_2x_vec):
> +	/* This will need to be computed no matter what. We do it
> +	   ahead of time for CHAR_PER_VEC == 64 because we can't adjust
> +	   the value of `tzcnt` with a shift.  */
> +# if CHAR_PER_VEC == 64
> +	tzcntq	%rcx, %rcx
> +# endif
> +
> +	cmpl	$(CHAR_PER_VEC), %edx
> +	jb	L(ret_vec_x1_len)
> +
> +	/* Separate logic for CHAR_PER_VEC == 64 because we already did
> +	   `tzcnt` on VRCX.  */
> +# if CHAR_PER_VEC == 64
> +	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
> +	cmpb	$CHAR_PER_VEC, %cl
> +	jnz	L(ret_vec_x1_no_bsf)
> +# else
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x1)
> +# endif
> +
> +
> +
> +	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %VZERO, %k0
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	KMOV	%k0, %VRCX
> +
> +# if CHAR_PER_VEC < 64
> +	/* This essentially adds CHAR_PER_VEC to the computed result.  */
> +	shlq	$CHAR_PER_VEC, %rcx
> +# else
> +	tzcntq	%rcx, %rcx
> +	addl	$CHAR_PER_VEC, %ecx
> +# endif
> +
> +	.p2align 4,, 4
> +L(ret_vec_x1_len):
> +	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
> +	   already been done.  */
> +# if CHAR_PER_VEC < 64
> +	tzcntq	%rcx, %rcx
> +# endif
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x1_len_no_zfill)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +L(ret_vec_x1_len_no_zfill_mov):
> +	movl	%ecx, %edx
> +# ifdef USE_AS_STPCPY
> +	/* clear flags.  */
> +	xorl	%ecx, %ecx
> +# endif
> +L(ret_vec_x1_len_no_zfill):
> +	VMOVU	((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +	leal	(VEC_SIZE)(%rdx), %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# endif
> +	ret
> +
> +
> +	.p2align 4,, 10
> +L(ret_vec_x1):
> +	bsf	%VRCX, %VRCX
> +L(ret_vec_x1_no_bsf):
> +	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +	subl	%ecx, %edx
> +	cmpl	$CHAR_PER_VEC, %edx
> +	jb	L(ret_vec_x1_len_no_zfill_mov)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	VMOVU	%VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +	leaq	(VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
> +# endif
> +	ret
> +
> +	.p2align 4,, 8
> +L(last_4x_vec):
> +	/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
> +	   $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
> +	   using `movzbl`.  */
> +# if CHAR_PER_VEC == 64
> +	movzbl	%dl, %edx
> +# else
> +	andl	$(CHAR_PER_VEC * 4 - 1), %edx
> +# endif
> +	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VRCX
> +	subq	$-(VEC_SIZE * 4), %rsi
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpl	$(CHAR_PER_VEC * 2 - 1), %edx
> +	jbe	L(last_2x_vec)
> +	.p2align 4,, 8
> +L(more_2x_vec):
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	test	%VRCX, %VRCX
> +	/* Must fill at least 2x VEC.  */
> +	jnz	L(zfill_vec1)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	/* Must fill at least 1x VEC.  */
> +	jnz	L(zfill_vec2)
> +
> +	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
> +	VPTESTN	%VMM(3), %VMM(3), %k0
> +	KMOV	%k0, %VRCX
> +
> +	/* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
> +	cmpq	$(CHAR_PER_VEC * 4 - 1), %rdx
> +	ja	L(more_4x_vec)
> +
> +	subl	$(CHAR_PER_VEC * 3), %edx
> +	jb	L(ret_vec_x3_len)
> +
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x3)
> +
> +	VPCMPEQ	(VEC_SIZE * 4)(%rsi), %VZERO, %k0
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	KMOV	%k0, %VRCX
> +	tzcnt	%VRCX, %VRCX
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x4_len_no_zfill)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +	movl	%ecx, %edx
> +L(ret_vec_x4_len_no_zfill):
> +	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +	leal	(VEC_SIZE * 4 + 0)(%rdx), %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# endif
> +	ret
> +
> +
> +L(ret_vec_x3_len):
> +	addl	$(CHAR_PER_VEC * 1), %edx
> +	tzcnt	%VRCX, %VRCX
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x3_len_no_zfill)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +L(ret_vec_x3_len_no_zfill_mov):
> +	movl	%ecx, %edx
> +# ifdef USE_AS_STPCPY
> +	/* clear flags.  */
> +	xorl	%ecx, %ecx
> +# endif
> +	.p2align 4,, 4
> +L(ret_vec_x3_len_no_zfill):
> +	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +	leal	(VEC_SIZE * 3 + 0)(%rdx), %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# endif
> +	ret
> +
> +
> +	.p2align 4,, 8
> +L(ret_vec_x3):
> +	bsf	%VRCX, %VRCX
> +	VMOVU	%VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
> +	subl	%ecx, %edx
> +	jl	L(ret_vec_x3_len_no_zfill_mov)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> +# endif
> +	ret
> +
> +	.p2align 4,, 8
> +L(more_4x_vec):
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec3)
> +
> +	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
> +	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
> +	VPTESTN	%VMM(4), %VMM(4), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec4)
>  
> -#define USE_AS_STRNCPY
> -#define STRCPY	STRNCPY
> -#include "strcpy-evex.S"
> +	/* Recheck length before aligning.  */
> +	cmpq	$(CHAR_PER_VEC * 8 - 1), %rdx
> +	jbe	L(last_4x_vec)
> +
> +	/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
> +# ifdef USE_AS_WCSCPY
> +	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
> +# else
> +	addq	%rsi, %rdx
> +# endif
> +	subq	%rsi, %rdi
> +	subq	$-(VEC_SIZE * 5), %rsi
> +	andq	$(VEC_SIZE * -4), %rsi
> +
> +
> +	/* Load first half of the loop before entry.  */
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +
> +
> +	/* Offset rsi by VEC_SIZE so that we can jump to
> +	   L(loop_last_4x_vec).  */
> +	addq	$-(VEC_SIZE), %rsi
> +	KORTEST	%k2, %k4
> +	jnz	L(loop_4x_done)
> +
> +	/* Store loop end in r9.  */
> +	leaq	-(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
> +
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> +
> +	subq	$(VEC_SIZE * -4), %rsi
> +	cmpq	%rsi, %r9
> +	jbe	L(loop_last_4x_vec)
> +
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +	KORTEST	%k2, %k4
> +	jz	L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +	/* Restore rdx (length).  */
> +	subq	%rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +	shrq	$2, %rdx
> +# endif
> +	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +	/* Restore rdi (dst).  */
> +	addq	%rsi, %rdi
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec1)
> +
> +	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> +	KMOV	%k2, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec2)
> +
> +	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec3)
> +
> +	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
> +	KMOV	%k4, %VRCX
> +	// Zfill more....
> +
> +	.p2align 4,, 4
> +L(zfill_vec4):
> +	subq	$(VEC_SIZE * -2), %rdi
> +	addq	$(CHAR_PER_VEC * -2), %rdx
> +L(zfill_vec2):
> +	subq	$(VEC_SIZE * -2), %rdi
> +	addq	$(CHAR_PER_VEC * -1), %rdx
> +L(zfill):
> +	/* VRCX must be non-zero.  */
> +	bsf	%VRCX, %VRCX
> +
> +	/* Adjust length / dst for zfill.  */
> +	subq	%rcx, %rdx
> +# ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +# else
> +	addq	%rcx, %rdi
> +# endif
> +# ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# endif
> +L(zfill_from_page_cross):
> +
> +	/* From here on out it's just memset(rdi, 0, rdx).  */
> +	cmpq	$CHAR_PER_VEC, %rdx
> +	jb	L(zfill_less_vec)
> +
> +L(zfill_more_1x_vec):
> +	VMOVU	%VZERO, (%rdi)
> +	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
> +	ja	L(zfill_more_2x_vec)
> +L(zfill_done0):
> +	ret
> +
> +	/* Coming from vec1/vec2 we must be able to zfill at least 2x
> +	   VEC.  */
> +	.p2align 4,, 8
> +L(zfill_vec3):
> +	subq	$(VEC_SIZE * -2), %rdi
> +	addq	$(CHAR_PER_VEC * -2), %rdx
> +	.p2align 4,, 2
> +L(zfill_vec1):
> +	bsfq	%rcx, %rcx
> +	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
> +	 */
> +	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> +	subq	%rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# endif
> +
> +
> +	VMOVU	%VZERO, (%rdi)
> +	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	cmpq	$(CHAR_PER_VEC * 2), %rdx
> +	jb	L(zfill_done0)
> +L(zfill_more_2x_vec):
> +	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> +	VMOVU	%VZERO, (VEC_SIZE)(%rdi)
> +	subq	$(CHAR_PER_VEC * 4 - 1), %rdx
> +	jbe	L(zfill_done)
> +
> +# ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rdx
> +# else
> +	addq	%rdi, %rdx
> +# endif
> +
> +	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi)
> +	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi)
> +
> +
> +	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> +	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> +
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpq	%rdi, %rdx
> +	jbe	L(zfill_done)
> +
> +	/* Align rdi and zfill loop.  */
> +	andq	$-(VEC_SIZE), %rdi
> +	.p2align 4,, 12
> +L(zfill_loop_4x_vec):
> +	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
> +	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
> +	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
> +	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpq	%rdi, %rdx
> +	ja	L(zfill_loop_4x_vec)
> +L(zfill_done):
> +	ret
> +
> +
> +	/* Less 1x VEC case if we are not using evex masked store.  */
> +# if !USE_EVEX_MASKED_STORE
> +	.p2align 4,, 8
> +L(copy_1x):
> +	/* Special case for copy 1x. It can be handled quickly and many
> +	   buffer sizes have convenient alignment.  */
> +	VMOVU	%VMM(0), (%rdi)
> +	/* If no zeros then we are done.  */
> +	testl	%ecx, %ecx
> +	jz	L(ret_1x_1x)
> +
> +	/* Need to zfill; note that we know length <= CHAR_PER_VEC so we
> +	   only handle the small case here.  */
> +	bsf	%VRCX, %VRCX
> +L(zfill_less_vec_no_bsf):
> +	/* Adjust length / dst then just zfill less_vec.  */
> +	subq	%rcx, %rdx
> +#  ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +#  else
> +	addq	%rcx, %rdi
> +#  endif
> +#  ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#  endif
> +
> +L(zfill_less_vec):
> +	cmpl	$((VEC_SIZE / 2) / CHAR_SIZE), %edx
> +	jb	L(zfill_less_half)
> +
> +	VMOVU	%VZERO_HALF, (%rdi)
> +	VMOVU	%VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  ifdef USE_AS_STPCPY
> +L(ret_1x_1x):
> +	leaq	CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
> +	ret
> +#  endif
> +
> +
> +#  if VEC_SIZE == 64
> +	.p2align 4,, 4
> +L(copy_32_63):
> +	/* Overfill to avoid branches.  */
> +	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +	VMOVU	%VMM_256(0), (%rdi)
> +	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +
> +	/* We are taking advantage of the fact that, to be here, we must
> +	   be writing the null-term at (%rdi, %rcx), so we have a byte of
> +	   leeway for overwriting.  */
> +	cmpl	%ecx, %edx
> +	ja	L(zfill_less_vec_no_bsf)
> +#   ifndef USE_AS_STPCPY
> +L(ret_1x_1x):
> +#   else
> +#    ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#    endif
> +#   endif
> +	ret
> +#  endif
> +
> +	.p2align 4,, 4
> +L(copy_16_31):
> +	/* Overfill to avoid branches.  */
> +	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +	VMOVU	%VMM_128(0), (%rdi)
> +	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	cmpl	%ecx, %edx
> +
> +	/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
> +	   we have a larger copy block for 32-63 so this just falls
> +	   through to zfill 16-31. If VEC_SIZE == 32 then we check for
> +	   full zfill of less than 1x VEC.  */
> +#  if VEC_SIZE == 64
> +	jbe	L(ret_16_31)
> +	subl	%ecx, %edx
> +#   ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +#   else
> +	addq	%rcx, %rdi
> +#   endif
> +#   ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#   endif
> +L(zfill_less_half):
> +L(zfill_less_32):
> +	cmpl	$(16 / CHAR_SIZE), %edx
> +	jb	L(zfill_less_16)
> +	VMOVU	%VZERO_128, (%rdi)
> +	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +#   ifdef USE_AS_STPCPY
> +	ret
> +#   endif
> +L(ret_16_31):
> +#   ifdef USE_AS_STPCPY
> +#    ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#    endif
> +#   endif
> +	ret
> +#  else
> +	/* VEC_SIZE == 32 begins.  */
> +	ja	L(zfill_less_vec_no_bsf)
> +#   ifndef USE_AS_STPCPY
> +L(ret_1x_1x):
> +#   else
> +#    ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#    endif
> +#   endif
> +	ret
> +#  endif
> +
> +
> +	.p2align 4,, 4
> +L(copy_8_15):
> +	/* Overfill to avoid branches.  */
> +	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> +	vmovq	%VMM_128(0), (%rdi)
> +	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_8_15)
> +	subl	%ecx, %edx
> +#  ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +#  else
> +	addq	%rcx, %rdi
> +#  endif
> +#  ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#  endif
> +	.p2align 4,, 8
> +#  if VEC_SIZE == 32
> +L(zfill_less_half):
> +#  endif
> +L(zfill_less_16):
> +	xorl	%ecx, %ecx
> +	cmpl	$(8 / CHAR_SIZE), %edx
> +	jb	L(zfill_less_8)
> +	movq	%rcx, (%rdi)
> +	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +#  ifndef USE_AS_STPCPY
> +L(ret_8_15):
> +#  endif
> +	ret
> +
> +	.p2align 4,, 8
> +L(less_1x_vec):
> +	je	L(copy_1x)
> +
> +	/* We will need `tzcnt` result for all other copy sizes.  */
> +	tzcnt	%VRCX, %VRCX
> +#  if VEC_SIZE == 64
> +	cmpl	$(32 / CHAR_SIZE), %edx
> +	jae	L(copy_32_63)
> +#  endif
> +
> +	cmpl	$(16 / CHAR_SIZE), %edx
> +	jae	L(copy_16_31)
> +
> +	cmpl	$(8 / CHAR_SIZE), %edx
> +	jae	L(copy_8_15)
> +#  ifdef USE_AS_WCSCPY
> +	testl	%ecx, %ecx
> +	jz	L(zfill_less_8_set_ret)
> +
> +	movl	(%rsi, %rdx, CHAR_SIZE), %esi
> +	vmovd	%VMM_128(0), (%rdi)
> +	movl	%esi, (%rdi, %rdx, CHAR_SIZE)
> +#   ifdef USE_AS_STPCPY
> +	cmpl	%ecx, %edx
> +L(ret_8_15):
> +	adcq	$0, %rdx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#   endif
> +	ret
> +L(zfill_less_8_set_ret):
> +	xorl	%ecx, %ecx
> +#   ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#   endif
> +L(zfill_less_8):
> +	movl	%ecx, (%rdi)
> +	movl	%ecx, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  else
> +	cmpl	$3, %edx
> +	jb	L(copy_0_3)
> +	/* Overfill to avoid branches.  */
> +	movl	-3(%rsi, %rdx), %esi
> +	vmovd	%VMM_128(0), (%rdi)
> +	movl	%esi, -3(%rdi, %rdx)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_4_7)
> +	subq	%rcx, %rdx
> +	addq	%rcx, %rdi
> +#   ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#   endif
> +	xorl	%ecx, %ecx
> +	.p2align 4,, 8
> +L(zfill_less_8):
> +	cmpl	$3, %edx
> +	jb	L(zfill_less_3)
> +	movl	%ecx, (%rdi)
> +	movl	%ecx, -3(%rdi, %rdx)
> +#   ifdef USE_AS_STPCPY
> +	ret
> +#   endif
> +
> +L(ret_4_7):
> +#   ifdef USE_AS_STPCPY
> +L(ret_8_15):
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#   endif
> +	ret
> +
> +	.p2align 4,, 4
> +L(zfill_less_3):
> +	testl	%edx, %edx
> +	jz	L(zfill_1)
> +	movw	%cx, (%rdi)
> +L(zfill_1):
> +	movb	%cl, (%rdi, %rdx)
> +	ret
> +
> +	.p2align 4,, 8
> +L(copy_0_3):
> +	vmovd	%VMM_128(0), %r8d
> +	testl	%edx, %edx
> +	jz	L(copy_1)
> +	movw	%r8w, (%rdi)
> +	cmpl	%ecx, %edx
> +	ja	L(zfill_from_1)
> +	movzbl	(%rsi, %rdx), %r8d
> +#   ifdef USE_AS_STPCPY
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +	movb	%r8b, (%rdi, %rdx)
> +	ret
> +#   endif
> +
> +L(copy_1):
> +#   ifdef USE_AS_STPCPY
> +	movl	%edx, %eax
> +	cmpl	%ecx, %edx
> +	adcq	%rdi, %rax
> +#   endif
> +#   ifdef USE_AS_WCSCPY
> +	vmovd	%VMM_128(0), (%rdi)
> +#   else
> +	movb	%r8b, (%rdi, %rdx)
> +#   endif
> +	ret
> +#  endif
> +
> +
> +#  ifndef USE_AS_WCSCPY
> +	.p2align 4,, 8
> +L(zfill_from_1):
> +#   ifdef USE_AS_STPCPY
> +	leaq	(%rdi, %rcx), %rax
> +#   endif
> +	movw	$0, -1(%rdi, %rdx)
> +	ret
> +#  endif
> +
> +	.p2align 4,, 4
> +L(zero_len):
> +	incq	%rdx
> +	jne	L(best_effort_strncpy)
> +	movq	%rdi, %rax
> +	ret
> +# endif
> +
> +
> +	.p2align 4,, 4
> +	.p2align 6,, 8
> +L(page_cross):
> +	movq	%rsi, %rax
> +	andq	$(VEC_SIZE * -1), %rax
> +	VPCMPEQ	(%rax), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +# ifdef USE_AS_WCSCPY
> +	movl	%esi, %r8d
> +	shrl	$2, %r8d
> +	andl	$(CHAR_PER_VEC - 1), %r8d
> +	shrx	%VR8, %VRCX, %VRCX
> +# else
> +	shrx	%VRSI, %VRCX, %VRCX
> +# endif
> +
> +	/* Compute the number of bytes we checked.  */
> +	subl	%esi, %eax
> +	andl	$(VEC_SIZE - 1), %eax
> +# ifdef USE_AS_WCSCPY
> +	shrl	$2, %eax
> +# endif
> +
> +	/* If rax > rdx then we are finishing the copy at the end of the
> +	   page.  */
> +	cmpq	%rax, %rdx
> +	jb	L(page_cross_small)
> +
> +
> +	/* If rcx is non-zero then continue.  */
> +	test	%VRCX, %VRCX
> +	jz	L(page_cross_continue)
> +
> +	/* We found zero-CHAR so need to copy then zfill (we know we
> +	   didn't cover all of length here).  */
> +	bsf	%VRCX, %VRCX
> +L(movsb_and_zfill):
> +	incl	%ecx
> +	subq	%rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +	leaq	-CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
> +# else
> +	movq	%rdi, %rax
> +# endif
> +
> +	REP_MOVS
> +# ifdef USE_AS_WCSCPY
> +	movl	$0, (%rdi)
> +# else
> +	movb	$0, (%rdi)
> +# endif
> +	jmp	L(zfill_from_page_cross)
> +
> +L(page_cross_small):
> +	tzcnt	%VRCX, %VRCX
> +	cmpl	%ecx, %edx
> +	jbe	L(page_cross_copy_only)
> +
> +	/* Do a zfill of the tail before copying.  */
> +	movq	%rdi, %r9
> +	xorl	%eax, %eax
> +
> +	movl	%ecx, %r8d
> +
> +	subl	%ecx, %edx
> +	leaq	CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> +	movl	%edx, %ecx
> +	REP_STOS
> +	movq	%r9, %rdi
> +	movl	%r8d, %edx
> +L(page_cross_copy_only):
> +	leal	1(%rdx), %ecx
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	adcl	$0, %edx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# else
> +	movq	%rdi, %rax
> +# endif
> +	REP_MOVS
> +	ret
> +
> +
> +L(best_effort_strncpy):
> +	movq	%rdx, %rcx
> +	xorl	%eax, %eax
> +	movq	%rdi, %r8
> +	/* The length is >= 2^63. We very much expect to segfault at
> +	   rep stos. If that doesn't happen then just strcpy to finish.
> +	 */
> +	REP_STOS
> +	movq	%r8, %rdi
> +	jmp	OVERFLOW_STRCPY
> +END(STRNCPY)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> new file mode 100644
> index 0000000000..d5ff4cbe50
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h

Need copyright notice.
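For reference, the standard header used by the other new files in this
series would do here as well; an illustrative version (the description
line is only a placeholder, not part of the patch):

/* Overflow fallback selection for st{r|p}{n}{cat|cpy}.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */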

> @@ -0,0 +1,65 @@
> +#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
> +#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
> +
> +#if defined USE_MULTIARCH && IS_IN(libc)
> +#  define UNDERSCORES __
> +#  ifdef USE_WITH_SSE2
> +#    define ISA_EXT _sse2
> +#  elif defined USE_WITH_AVX
> +#    ifdef USE_WITH_RTM
> +#      define ISA_EXT _avx_rtm
> +#    else
> +#      define ISA_EXT _avx
> +#    endif

No need for USE_WITH_AVX.  The only AVX functions are the memmove family.
We won't add more AVX functions; we will only add AVX2 functions.

> +#  elif defined USE_WITH_AVX2
> +#    ifdef USE_WITH_RTM
> +#      define ISA_EXT _avx2_rtm
> +#    else
> +#      define ISA_EXT _avx2
> +#    endif
> +
> +#  elif defined USE_WITH_EVEX256
> +#    define ISA_EXT _evex
> +#  elif defined USE_WITH_EVEX512
> +#    define ISA_EXT _evex512
> +#  endif
> +#else
> +#  define UNDERSCORES
> +#  define ISA_EXT
> +#endif
> +
> +#ifdef USE_AS_WCSCPY
> +#  define STRCPY_PREFIX wc
> +#  define STRCAT_PREFIX wcs
> +#  ifdef USE_AS_STPCPY
> +#    define STRCPY_POSTFIX pcpy
> +#  else
> +#    define STRCPY_POSTFIX scpy
> +#  endif
> +#else
> +#  define STRCPY_PREFIX st
> +#  define STRCAT_PREFIX str
> +#  ifdef USE_AS_STPCPY
> +#    define STRCPY_POSTFIX pcpy
> +#  else
> +#    define STRCPY_POSTFIX rcpy
> +#  endif
> +#endif
> +#define STRCAT_POSTFIX cat
> +
> +#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
> +  underscores##prefix##postfix##ext
> +
> +#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
> +
> +#ifndef OVERFLOW_STRCPY
> +#  define OVERFLOW_STRCPY                                                     \
> +    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
> +#endif
> +
> +#ifndef OVERFLOW_STRCAT
> +#  define OVERFLOW_STRCAT                                                     \
> +    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
> +#endif
> +
> +#endif
> -- 
> 2.34.1
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v5 1/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-03  8:53 ` [PATCH v1 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
  2022-11-03  8:55   ` Noah Goldstein
  2022-11-04 23:04   ` [PATCH v4 1/4] " Noah Goldstein
@ 2022-11-09  1:38   ` Noah Goldstein
  2022-11-09  1:38     ` [PATCH v5 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
                       ` (3 more replies)
  2 siblings, 4 replies; 42+ messages in thread
From: Noah Goldstein @ 2022-11-09  1:38 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
    1. Use more overlapping stores to avoid branches.
    2. Reduce how unrolled the aligning copies are (this is more of a
       code-size save, it's a negative for some sizes in terms of
       perf).
    3. Improve the loop a bit (similar to what we do in strlen with
       2x vpminu + kortest instead of 3x vpminu + kmov + test; a
       sketch of this check follows this list).
    4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
       number that are taken.
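
    A minimal C-intrinsics sketch of the loop check from point 3, shown
    here for 256-bit vectors (the patch does this in assembly over
    VEC_SIZE-parameterized registers; the helper name below is
    illustrative only):

        #include <immintrin.h>

        /* Nonzero if any of the four vectors contains a zero byte:
           2x vpminub + 2x vptestnmb + kortest instead of folding to a
           single vector with 3x vpminub + vpcmpb + kmov + test.
           Requires AVX512BW/AVX512VL.  */
        static inline int
        has_zero_4x (__m256i v0, __m256i v1, __m256i v2, __m256i v3)
        {
          __m256i m01 = _mm256_min_epu8 (v0, v1);
          __m256i m23 = _mm256_min_epu8 (v2, v3);
          __mmask32 k01 = _mm256_testn_epi8_mask (m01, m01);
          __mmask32 k23 = _mm256_testn_epi8_mask (m23, m23);
          /* KORTEST sets ZF iff (k01 | k23) == 0, i.e. no zero byte.  */
          return !_kortestz_mask32_u8 (k01, k23);
        }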

Performance Changes:

    Times are from N = 10 runs of the benchmark suite and are
    reported as geometric mean of all ratios of
    New Implementation / Old Implementation.
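
    For reference, a minimal C sketch of that aggregation (the function
    name is illustrative):

        #include <math.h>
        #include <stddef.h>

        /* Geometric mean of the per-benchmark New/Old timing ratios.  */
        static double
        geomean (const double *ratios, size_t n)
        {
          double log_sum = 0.0;
          for (size_t i = 0; i < n; i++)
            log_sum += log (ratios[i]);
          return exp (log_sum / n);
        }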

    stpcpy-evex      -> 0.922
    strcat-evex      -> 0.985
    strcpy-evex      -> 0.880

    strncpy-evex     -> 0.831
    stpncpy-evex     -> 0.780

    strncat-evex     -> 0.958

Code Size Changes:
    function         -> Bytes New / Bytes Old -> Ratio

    strcat-evex      ->  819 / 1874 -> 0.437
    strcpy-evex      ->  700 / 1074 -> 0.652
    stpcpy-evex      ->  735 / 1094 -> 0.672

    strncpy-evex     -> 1397 / 2611 -> 0.535
    stpncpy-evex     -> 1489 / 2691 -> 0.553

    strncat-evex     -> 1184 / 2832 -> 0.418

Notes:
    1. Because of the significant difference between the
       implementations they are split into three files.

           strcpy-evex.S    -> strcpy, stpcpy, strcat
           strncpy-evex.S   -> strncpy
           strncat-evex.S   -> strncat

       I couldn't find a way to merge them without making the
       ifdefs incredibly difficult to follow.

    2. All implementations can be made evex512 by including
       "x86-evex512-vecs.h" at the top (see the sketch following
       these notes).

    3. All implementations have an optional define:
        `USE_EVEX_MASKED_STORE`
       Setting it to one uses evex-masked stores for handling short
       strings.  This saves code size and branches.  It's disabled
       for all implementations at the moment as there are some
       serious drawbacks to masked stores in certain cases, but
       that may be fixed on future architectures.
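
    For illustration, an evex512 flavor could be built from the same
    source the way the existing thin wrapper files do; a hypothetical
    example for strncpy (the function name below is not part of this
    patch):

        #ifndef STRNCPY
        # define STRNCPY	__strncpy_evex512
        #endif
        #include "x86-evex512-vecs.h"
        #include "strncpy-evex.S"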

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
 sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
 sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
 .../x86_64/multiarch/strcat-strlen-evex.h.S   |  110 ++
 sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
 sysdeps/x86_64/multiarch/strncat-evex.S       |  525 ++++++-
 sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
 .../multiarch/strncpy-or-cat-overflow-def.h   |   80 +
 7 files changed, 2115 insertions(+), 1173 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
 create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h

diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
index 99ea76a372..3693491baa 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
@@ -3,6 +3,5 @@
 #endif
 
 #define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY	STPNCPY
-#include "strcpy-evex.S"
+#define STRNCPY	STPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
index 0e2df947e9..b4207b7889 100644
--- a/sysdeps/x86_64/multiarch/strcat-evex.S
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -1,286 +1,7 @@
-/* strcat with 256-bit EVEX instructions.
-   Copyright (C) 2021-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-#  define STRCAT  __strcat_evex
-# endif
-
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
-/* zero register */
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMM0		ymm17
-# define YMM1		ymm18
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE	32
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRCAT)
-	mov	%rdi, %r9
-# ifdef USE_AS_STRNCAT
-	mov	%rdx, %r8
-# endif
-
-	xor	%eax, %eax
-	mov	%edi, %ecx
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-	cmp	$(VEC_SIZE * 3), %ecx
-	ja	L(fourth_vector_boundary)
-	vpcmpb	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_first_vector)
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	jmp	L(align_vec_size_start)
-L(fourth_vector_boundary):
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	vpcmpb	$0, (%rax), %YMMZERO, %k0
-	mov	$-1, %r10d
-	sub	%rax, %rcx
-	shl	%cl, %r10d
-	kmovd	%k0, %edx
-	and	%r10d, %edx
-	jnz	L(exit)
-
-L(align_vec_size_start):
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 4), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	kmovd	%k4, %edx
-	add	$(VEC_SIZE * 4), %rax
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 4), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
-	add	$(VEC_SIZE * 5), %rax
-	kmovd	%k4, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	add	$VEC_SIZE, %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
-	add	$VEC_SIZE, %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
-	add	$VEC_SIZE, %rax
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	add	$VEC_SIZE, %rax
-
-	.p2align 4
-L(align_four_vec_loop):
-	VMOVA	(%rax), %YMM0
-	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
-	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
-	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
-	vpminub	%YMM0, %YMM1, %YMM0
-	/* If K0 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM0, %YMMZERO, %k0
-	add	$(VEC_SIZE * 4), %rax
-	ktestd	%k0, %k0
-	jz	L(align_four_vec_loop)
-
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
-	sub	$(VEC_SIZE * 5), %rax
-	kmovd	%k0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
-	kmovd	%k2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
-	kmovd	%k3, %edx
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit):
-	sub	%rdi, %rax
-L(exit_null_on_first_vector):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_second_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$VEC_SIZE, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_third_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 2), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fourth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 3), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fifth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-
-	.p2align 4
-L(StartStrcpyPart):
-	lea	(%r9, %rax), %rdi
-	mov	%rsi, %rcx
-	mov	%r9, %rax      /* save result */
-
-# ifdef USE_AS_STRNCAT
-	test	%r8, %r8
-	jz	L(ExitZero)
-#  define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-evex.S"
+#ifndef STRCAT
+# define STRCAT	__strcat_evex
 #endif
+
+#define USE_AS_STRCAT
+#define STRCPY	STRCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
new file mode 100644
index 0000000000..9530d7b683
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
@@ -0,0 +1,110 @@
+/* strlen used for beginning of str{n}cat using EVEX 256/512.
+   Copyright (C) 2011-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+/* NOTE: This file is meant to be included by strcat-evex or
+   strncat-evex and does not stand alone.  Before including it, %rdi
+   must be saved in %rax.  */
+
+
+/* Simple strlen implementation that ends at
+   L(strcat_strlen_done).  */
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
+	movq	%rdi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+#ifdef USE_AS_WCSCPY
+	subl	%r8d, %edi
+	shrl	$2, %edi
+#endif
+	shrx	%VRDI, %VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+	movq	%rax, %rdi
+#endif
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v0)
+
+
+	VPCMPEQ	VEC_SIZE(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	leaq	(VEC_SIZE)(%r8), %rdi
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v1)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v2)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%r8), %VZERO, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v3)
+
+	andq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4,, 8
+L(loop_2x_vec):
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(0)
+	VPMIN	(VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(2)
+	VPMIN	(VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	VPTESTN	%VMM(3), %VMM(3), %k3
+	subq	$(VEC_SIZE * -4), %rdi
+	KORTEST	%k1, %k3
+	jz	L(loop_2x_vec)
+
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v0)
+
+	KMOV	%k1, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v1)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(bsf_and_done_v2)
+
+	KMOV	%k3, %VRCX
+L(bsf_and_done_v3):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v2):
+	bsf	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
+	jmp	L(strcat_strlen_done)
+
+	.p2align 4,, 4
+L(bsf_and_done_v1):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v0):
+	bsf	%VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#else
+	addq	%rcx, %rdi
+#endif
+L(strcat_strlen_done):
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
index 82e45ac675..932129ab40 100644
--- a/sysdeps/x86_64/multiarch/strcpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -1,4 +1,4 @@
-/* strcpy with 256-bit EVEX instructions.
+/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
    Copyright (C) 2021-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,990 +17,526 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <isa-level.h>
-
 #if ISA_SHOULD_BUILD (4)
 
 
-# ifndef USE_AS_STRCAT
-#  include <sysdep.h>
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+	/* Use movsb in page cross case to save code size.  */
+# define USE_MOVSB_IN_PAGE_CROSS	1
 
-#  ifndef STRCPY
-#   define STRCPY  __strcpy_evex
-#  endif
+# include <sysdep.h>
 
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
-/* Number of bytes in a vector register */
-# ifndef VEC_SIZE
-#  define VEC_SIZE	32
+# ifndef STRCPY
+#  define STRCPY	__strcpy_evex
 # endif
 
-# define XMM2		xmm18
-# define XMM3		xmm19
 
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-# define YMM7		ymm23
+# ifdef USE_AS_WCSCPY
+#  define VMOVU_MASK	vmovdqu32
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
 
-# ifndef USE_AS_STRCAT
+#  define REP_MOVS	rep movsd
 
-/* zero register */
-#  define XMMZERO	xmm16
-#  define YMMZERO	ymm16
-#  define YMM1		ymm17
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRCPY)
-#  ifdef USE_AS_STRNCPY
-	mov	%RDX_LP, %R8_LP
-	test	%R8_LP, %R8_LP
-	jz	L(ExitZero)
-#  endif
-	mov	%rsi, %rcx
-#  ifndef USE_AS_STPCPY
-	mov	%rdi, %rax      /* save result */
-#  endif
+#  define USE_WIDE_CHAR
+# else
+#  define VMOVU_MASK	vmovdqu8
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
 
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+#  define REP_MOVS	rep movsb
 # endif
 
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	cmp	$(VEC_SIZE * 2), %ecx
-	jbe	L(SourceStringAlignmentLessTwoVecSize)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-
-	vpcmpb	$0, (%rsi), %YMMZERO, %k0
-	kmovd	%k0, %edx
-	shr	%cl, %rdx
+# include "reg-macros.h"
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	mov	$VEC_SIZE, %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  else
-	mov	$(VEC_SIZE + 1), %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  endif
-	jbe	L(CopyVecSizeTailCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail)
-
-	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
-	kmovd	%k1, %edx
 
-# ifdef USE_AS_STRNCPY
-	add	$VEC_SIZE, %r10
-	cmp	%r10, %r8
-	jbe	L(CopyTwoVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize)
-
-	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
-	VMOVU	%YMM2, (%rdi)
-
-/* If source address alignment != destination address alignment */
-	.p2align 4
-L(UnalignVecSizeBoth):
-	sub	%rcx, %rdi
-# ifdef USE_AS_STRNCPY
-	add	%rcx, %r8
-	sbb	%rcx, %rcx
-	or	%rcx, %r8
-# endif
-	mov	$VEC_SIZE, %rcx
-	VMOVA	(%rsi, %rcx), %YMM2
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 3), %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STPCPY
+#  define END_REG	rax
 # else
-	jnz	L(CopyVecSize)
+#  define END_REG	rdi, %rdx, CHAR_SIZE
 # endif
 
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
+# ifdef USE_AS_STRCAT
+#  define PAGE_ALIGN_REG	edx
+#  define PAGE_ALIGN_REG_64	rdx
 # else
-	jnz	L(CopyVecSize)
+#  define PAGE_ALIGN_REG	eax
+#  define PAGE_ALIGN_REG_64	rax
 # endif
 
-	VMOVU	%YMM3, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
-	vpcmpb	$0, %YMM4, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
 
-	VMOVU	%YMM4, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
 
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
-	VMOVU	%YMM2, (%rdi, %rcx)
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
 
-	VMOVU	%YMM3, (%rdi, %rcx)
-	mov	%rsi, %rdx
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	and	$-(VEC_SIZE * 4), %rsi
-	sub	%rsi, %rdx
-	sub	%rdx, %rdi
-# ifdef USE_AS_STRNCPY
-	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
-	VMOVA	(%rsi), %YMM4
-	VMOVA	VEC_SIZE(%rsi), %YMM5
-	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
-	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
-	vpminub	%YMM5, %YMM4, %YMM2
-	vpminub	%YMM7, %YMM6, %YMM3
-	vpminub	%YMM2, %YMM3, %YMM2
-	/* If K7 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM2, %YMMZERO, %k7
-	kmovd	%k7, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+# ifdef USE_AS_STRCAT
+	movq	%rdi, %rax
+#  include "strcat-strlen-evex.h.S"
 # endif
-	test	%edx, %edx
-	jnz	L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
-	add	$(VEC_SIZE * 4), %rdi
-	add	$(VEC_SIZE * 4), %rsi
-	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
-	VMOVA	(%rsi), %YMM4
-	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
-	VMOVA	VEC_SIZE(%rsi), %YMM5
-	vpminub	%YMM5, %YMM4, %YMM2
-	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
-	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
-	VMOVU	%YMM7, -VEC_SIZE(%rdi)
-	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
-	vpminub	%YMM7, %YMM6, %YMM3
-	vpminub	%YMM2, %YMM3, %YMM2
-	/* If K7 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM2, %YMMZERO, %k7
-	kmovd	%k7, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
+
+	movl	%esi, %PAGE_ALIGN_REG
+	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 # endif
-	test	%edx, %edx
-	jz	L(UnalignedFourVecSizeLoop_start)
 
-L(UnalignedFourVecSizeLeave):
-	vpcmpb	$0, %YMM4, %YMMZERO, %k1
-	kmovd	%k1, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_0)
 
-	vpcmpb	$0, %YMM5, %YMMZERO, %k2
-	kmovd	%k2, %ecx
-	test	%ecx, %ecx
-	jnz	L(CopyVecSizeUnaligned_16)
+	/* Two short string implementations. One with traditional
+	   branching approach and one with masked instructions (which
+	   have potential for dramatically bad perf if dst splits a
+	   page and is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	VPTEST	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+#  ifdef USE_AS_WCSCPY
+	subl	$((1 << CHAR_PER_VEC)- 1), %VRCX
+#  else
+	inc	%VRCX
+#  endif
+	jz	L(more_1x_vec)
+	KMOV	%VRCX, %k1
+	KXOR	%k0, %k1, %k1
 
-	vpcmpb	$0, %YMM6, %YMMZERO, %k3
-	kmovd	%k3, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_32)
-
-	vpcmpb	$0, %YMM7, %YMMZERO, %k4
-	kmovd	%k4, %ecx
-	bsf	%ecx, %edx
-	VMOVU	%YMM4, (%rdi)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 3), %rsi
-	add	$(VEC_SIZE * 3), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
 
-/* If source address alignment == destination address alignment */
+#  ifdef USE_AS_STPCPY
+	bsf	%VRCX, %VRCX
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
+#  endif
+	ret
 
-L(SourceStringAlignmentLessTwoVecSize):
-	VMOVU	(%rsi), %YMM3
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
-	kmovd	%k0, %edx
+# else
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$VEC_SIZE, %r8
+	xorl	%edx, %edx
+	bsf	%VRCX, %VRDX
+#  ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  endif
+
+	/* Use mask bits in rcx to detect which copy we need. If the low
+	   mask is zero then there must be a bit set in the upper half.
+	   I.e if rcx != 0 and ecx == 0, then match must be upper 32
+	   bits so we use L(copy_32_63).  */
+#  if VEC_SIZE == 64
+#   ifdef USE_AS_WCSCPY
+	testb	%cl, %cl
+#   else
+	testl	%ecx, %ecx
+#   endif
+	jz	L(copy_32_63)
+#  endif
+
+#  ifdef USE_AS_WCSCPY
+	testb	$0xf, %cl
 #  else
-	cmp	$(VEC_SIZE + 1), %r8
+	testw	%cx, %cx
 #  endif
-	jbe	L(CopyVecSizeTail1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail1)
+	jz	L(copy_16_31)
 
-	VMOVU	%YMM3, (%rdi)
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %edx
 
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$(VEC_SIZE * 2), %r8
+#  ifdef USE_AS_WCSCPY
+	testb	$0x3, %cl
 #  else
-	cmp	$((VEC_SIZE * 2) + 1), %r8
+	testb	%cl, %cl
 #  endif
-	jbe	L(CopyTwoVecSize1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize1)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-	jmp	L(UnalignVecSizeBoth)
+	jz	L(copy_8_15)
 
-/*------End of main part with loops---------------------*/
 
-/* Case1 */
+#  ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	/* No need to copy, we know it's zero.  */
+	movl	$0, (%END_REG)
 
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
-	.p2align 4
-L(CopyVecSize):
-	add	%rcx, %rdi
-# endif
-L(CopyVecSizeTail):
-	add	%rcx, %rsi
-L(CopyVecSizeTail1):
-	bsf	%edx, %edx
-L(CopyVecSizeExit):
-	cmp	$32, %edx
-	jae	L(Exit32_63)
-	cmp	$16, %edx
-	jae	L(Exit16_31)
-	cmp	$8, %edx
-	jae	L(Exit8_15)
-	cmp	$4, %edx
-	jae	L(Exit4_7)
-	cmp	$3, %edx
-	je	L(Exit3)
-	cmp	$1, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	movb	$0, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$1, %r8
-	lea	1(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
 	ret
+#  else
 
-	.p2align 4
-L(CopyTwoVecSize1):
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$VEC_SIZE, %r8
-# endif
-	jmp	L(CopyVecSizeTail1)
-
-	.p2align 4
-L(CopyTwoVecSize):
-	bsf	%edx, %edx
-	add	%rcx, %rsi
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	jmp	L(CopyVecSizeExit)
-
-	.p2align 4
-L(CopyVecSizeUnaligned_0):
-	bsf	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM4, (%rdi)
-	add	$((VEC_SIZE * 4) - 1), %r8
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	jmp	L(CopyVecSizeExit)
-# endif
+	testb	$0x7, %cl
+	jz	L(copy_4_7)
 
-	.p2align 4
-L(CopyVecSizeUnaligned_16):
-	bsf	%ecx, %edx
-	VMOVU	%YMM4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	VEC_SIZE(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	add	$((VEC_SIZE * 3) - 1), %r8
-	sub	%rdx, %r8
-	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
 
-	.p2align 4
-L(CopyVecSizeUnaligned_32):
-	bsf	%edx, %edx
-	VMOVU	%YMM4, (%rdi)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
-# endif
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	add	$((VEC_SIZE * 2) - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 2), %rsi
-	add	$(VEC_SIZE * 2), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
+	test	%edx, %edx
+	jz	L(set_null_term)
 
-# ifdef USE_AS_STRNCPY
-#  ifndef USE_AS_STRCAT
-	.p2align 4
-L(CopyVecSizeUnalignedVec6):
-	VMOVU	%YMM6, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec5):
-	VMOVU	%YMM5, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec4):
-	VMOVU	%YMM4, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec3):
-	VMOVU	%YMM3, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
+	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
+	 */
+	vmovd	%VMM_128(0), %esi
+	movw	%si, (%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	/* No need to copy, we know it's zero.  */
+	movb	$0, (%END_REG)
+	ret
 #  endif
 
-/* Case2 */
-
-	.p2align 4
-L(CopyVecSizeCase2):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTailCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTail1Case2):
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-/* Case2 or Case3,  Case3 */
-
-	.p2align 4
-L(CopyVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeCase2)
-L(CopyVecSizeCase3):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyTwoVecSizeCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyVecSizeTailCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTailCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSize1Case2OrCase3):
-	add	$VEC_SIZE, %rdi
-	add	$VEC_SIZE, %rsi
-	sub	$VEC_SIZE, %r8
-L(CopyVecSizeTail1Case2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTail1Case2)
-	jmp	L(StrncpyExit)
+#  if VEC_SIZE == 64
+	.p2align 4,, 6
+L(copy_32_63):
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
+	ret
+#  endif
+
+
+	.p2align 4,, 6
+L(copy_16_31):
+	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
+	   and will save code size.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
+	ret
+
+	.p2align 4,, 8
+L(copy_8_15):
+#  ifdef USE_AS_WCSCPY
+	movl	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+#  else
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
+#  endif
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
+	ret
 # endif
 
-/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
 
-	.p2align 4
-L(Exit1):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
+# ifndef USE_AS_WCSCPY
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%ecx, -(4 - CHAR_SIZE)(%END_REG)
+	ret
 # endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$2, %r8
-	lea	2(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rdi)
 # endif
-	ret
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+	addq	%rsi, %rdi
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
 
-	.p2align 4
-L(Exit2):
-	movzwl	(%rsi), %ecx
-	mov	%cx, (%rdi)
-	movb	$0, 2(%rdi)
+	/* Ideally we store after moves to minimize impact of potential
+	   false-dependencies.  */
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rax)
+# endif
+
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+
+
+	/* Align for 4x loop.  */
+	subq	%rsi, %rdi
+
+	/* + VEC_SIZE * 5 because we never added the original VEC_SIZE
+	   we covered before aligning.  */
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$-(VEC_SIZE * 4), %rsi
+
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Restore rdi (%rdi).  */
+	addq	%rsi, %rdi
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x0_end)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	/* Place L(ret_vec_x4) here to save code size.  We get a
+	   meaningful benefit doing this for stpcpy.  */
+	KMOV	%k4, %VRDX
+L(ret_vec_x3):
+	bsf	%VRDX, %VRDX
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$3, %r8
-	lea	3(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
 # endif
+L(return_end):
 	ret
 
-	.p2align 4
-L(Exit3):
-	mov	(%rsi), %edx
-	mov	%edx, (%rdi)
+	.p2align 4,, 6
+L(ret_vec_x0_end):
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_STPCPY
-	lea	3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$4, %r8
-	lea	4(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
+	inc	%VRCX
+	VMOVU	(-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
 	ret
 
-	.p2align 4
-L(Exit4_7):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	-3(%rsi, %rdx), %ecx
-	mov	%ecx, -3(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX
+	VMOVU	(VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit8_15):
-	mov	(%rsi), %rcx
-	mov	-7(%rsi, %rdx), %r9
-	mov	%rcx, (%rdi)
-	mov	%r9, -7(%rdi, %rdx)
+	.p2align 4,, 4
+L(ret_vec_x2):
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit16_31):
-	VMOVU	(%rsi), %XMM2
-	VMOVU	-15(%rsi, %rdx), %XMM3
-	VMOVU	%XMM2, (%rdi)
-	VMOVU	%XMM3, -15(%rdi, %rdx)
+	/* ret_vec_x3 reuses return code after the loop.  */
+	.p2align 4,, 6
+L(ret_vec_x4):
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub %rdx, %r8
-	sub $1, %r8
-	lea 1(%rdi, %rdx), %rdi
-	jnz L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
 	ret
 
-	.p2align 4
-L(Exit32_63):
-	VMOVU	(%rsi), %YMM2
-	VMOVU	-31(%rsi, %rdx), %YMM3
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, -31(%rdi, %rdx)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
+
+	.p2align 4,, 4
+L(page_cross):
+# ifndef USE_AS_STRCAT
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
 # endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	movq	%rsi, %rcx
+	andq	$(VEC_SIZE * -1), %rcx
+
+	VPCMPEQ	(%rcx), %VZERO, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSCPY
+	andl	$(VEC_SIZE - 1), %PAGE_ALIGN_REG
+	shrl	$2, %PAGE_ALIGN_REG
 # endif
-	ret
+	shrx	%VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
 
-# ifdef USE_AS_STRNCPY
+# if USE_MOVSB_IN_PAGE_CROSS
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
 
-	.p2align 4
-L(StrncpyExit1):
-	movzbl	(%rsi), %edx
-	mov	%dl, (%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 1(%rdi)
+	/* This adds once to the later result which will get correct
+	   copy bounds. NB: this can never zero-out a non-zero RCX as
+	   to be in the page cross case rsi cannot be aligned and we
+	   already right-shift rcx by the misalignment.  */
+	shl	%VRCX
+	jz	L(page_cross_continue)
+#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-	ret
+	bsf	%VRCX, %VRCX
+	REP_MOVS
 
-	.p2align 4
-L(StrncpyExit2):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
 #  ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 2(%rdi)
+	leaq	-CHAR_SIZE(%rdi), %rax
 #  endif
 	ret
 
-	.p2align 4
-L(StrncpyExit3_4):
-	movzwl	(%rsi), %ecx
-	movzwl	-2(%rsi, %r8), %edx
-	mov	%cx, (%rdi)
-	mov	%dx, -2(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
 
-	.p2align 4
-L(StrncpyExit5_8):
-	mov	(%rsi), %ecx
-	mov	-4(%rsi, %r8), %edx
-	mov	%ecx, (%rdi)
-	mov	%edx, -4(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
+# else
+	/* Check if we found zero-char before end of page.  */
+	test	%VRCX, %VRCX
+	jz	L(page_cross_continue)
 
-	.p2align 4
-L(StrncpyExit9_16):
-	mov	(%rsi), %rcx
-	mov	-8(%rsi, %r8), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
+	/* Traditional copy case, essentially same as used in non-page-
+	   cross case but since we can't reuse VMM(0) we need twice as
+	   many loads from rsi.  */
 
-	.p2align 4
-L(StrncpyExit17_32):
-	VMOVU	(%rsi), %XMM2
-	VMOVU	-16(%rsi, %r8), %XMM3
-	VMOVU	%XMM2, (%rdi)
-	VMOVU	%XMM3, -16(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
+#  ifndef USE_AS_STRCAT
+	xorl	%edx, %edx
 #  endif
-	ret
-
-	.p2align 4
-L(StrncpyExit33_64):
-	/*  0/32, 31/16 */
-	VMOVU	(%rsi), %YMM2
-	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
+	/* Dependency on rdi must already have been satisfied.  */
+	bsf	%VRCX, %VRDX
 #  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  elif !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	ret
 
-	.p2align 4
-L(StrncpyExit65):
-	/* 0/32, 32/32, 64/1 */
-	VMOVU	(%rsi), %YMM2
-	VMOVU	32(%rsi), %YMM3
-	mov	64(%rsi), %cl
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, 32(%rdi)
-	mov	%cl, 64(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	65(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 65(%rdi)
+#  if VEC_SIZE == 64
+#   ifdef USE_AS_WCSCPY
+	testb	%cl, %cl
+#   else
+	test	%ecx, %ecx
+#   endif
+	jz	L(page_cross_copy_32_63)
 #  endif
-	ret
-
-#  ifndef USE_AS_STRCAT
 
-	.p2align 4
-L(Fill1):
-	mov	%dl, (%rdi)
-	ret
+#  ifdef USE_AS_WCSCPY
+	testb	$0xf, %cl
+#  else
+	testw	%cx, %cx
+#  endif
+	jz	L(page_cross_copy_16_31)
 
-	.p2align 4
-L(Fill2):
-	mov	%dx, (%rdi)
-	ret
+#  ifdef USE_AS_WCSCPY
+	testb	$0x3, %cl
+#  else
+	testb	%cl, %cl
+#  endif
+	jz	L(page_cross_copy_8_15)
 
-	.p2align 4
-L(Fill3_4):
-	mov	%dx, (%rdi)
-	mov     %dx, -2(%rdi, %r8)
+#  ifdef USE_AS_WCSCPY
+	movl	(%rsi), %esi
+	movl	%esi, (%rdi)
+	movl	$0, (%END_REG)
 	ret
+#  else
 
-	.p2align 4
-L(Fill5_8):
-	mov	%edx, (%rdi)
-	mov     %edx, -4(%rdi, %r8)
-	ret
+	testb	$0x7, %cl
+	jz	L(page_cross_copy_4_7)
 
-	.p2align 4
-L(Fill9_16):
-	mov	%rdx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
+	test	%edx, %edx
+	jz	L(page_cross_set_null_term)
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+L(page_cross_set_null_term):
+	movb	$0, (%END_REG)
 	ret
 
-	.p2align 4
-L(Fill17_32):
-	VMOVU	%XMMZERO, (%rdi)
-	VMOVU	%XMMZERO, -16(%rdi, %r8)
-	ret
 
-	.p2align 4
-L(CopyVecSizeUnalignedVec2):
-	VMOVU	%YMM2, (%rdi, %rcx)
-
-	.p2align 4
-L(CopyVecSizeVecExit):
-	bsf	%edx, %edx
-	add	$(VEC_SIZE - 1), %r8
-	add	%rcx, %rdi
-#   ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-#   endif
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-
-	.p2align 4
-L(StrncpyFillTailWithZero):
-	xor	%edx, %edx
-	sub	$VEC_SIZE, %r8
-	jbe	L(StrncpyFillExit)
-
-	VMOVU	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-
-	mov	%rdi, %rsi
-	and	$(VEC_SIZE - 1), %esi
-	sub	%rsi, %rdi
-	add	%rsi, %r8
-	sub	$(VEC_SIZE * 4), %r8
-	jb	L(StrncpyFillLessFourVecSize)
-
-L(StrncpyFillLoopVmovdqa):
-	VMOVA	%YMMZERO, (%rdi)
-	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
-	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
-	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE * 4), %rdi
-	sub	$(VEC_SIZE * 4), %r8
-	jae	L(StrncpyFillLoopVmovdqa)
-
-L(StrncpyFillLessFourVecSize):
-	add	$(VEC_SIZE * 2), %r8
-	jl	L(StrncpyFillLessTwoVecSize)
-	VMOVA	%YMMZERO, (%rdi)
-	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
-	add	$(VEC_SIZE * 2), %rdi
-	sub	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	VMOVA	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillLessTwoVecSize):
-	add	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	VMOVA	%YMMZERO, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillExit):
-	add	$VEC_SIZE, %r8
-L(Fill):
-	cmp	$17, %r8d
-	jae	L(Fill17_32)
-	cmp	$9, %r8d
-	jae	L(Fill9_16)
-	cmp	$5, %r8d
-	jae	L(Fill5_8)
-	cmp	$3, %r8d
-	jae	L(Fill3_4)
-	cmp	$1, %r8d
-	ja	L(Fill2)
-	je	L(Fill1)
+	.p2align 4,, 4
+L(page_cross_copy_4_7):
+	movl	(%rsi), %ecx
+	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, -(4 - CHAR_SIZE)(%END_REG)
 	ret
-
-/* end of ifndef USE_AS_STRCAT */
 #  endif
 
-	.p2align 4
-L(UnalignedLeaveCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(UnalignedFourVecSizeLeaveCase2)
-L(UnalignedFourVecSizeLeaveCase3):
-	lea	(VEC_SIZE * 4)(%r8), %rcx
-	and	$-VEC_SIZE, %rcx
-	add	$(VEC_SIZE * 3), %r8
-	jl	L(CopyVecSizeCase3)
-	VMOVU	%YMM4, (%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 4)(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (VEC_SIZE * 4)(%rdi)
-#  endif
+#  if VEC_SIZE == 64
+	.p2align 4,, 4
+L(page_cross_copy_32_63):
+	VMOVU	(%rsi), %VMM_256(0)
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
 	ret
-
-	.p2align 4
-L(UnalignedFourVecSizeLeaveCase2):
-	xor	%ecx, %ecx
-	vpcmpb	$0, %YMM4, %YMMZERO, %k1
-	kmovd	%k1, %edx
-	add	$(VEC_SIZE * 3), %r8
-	jle	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-	vpcmpb	$0, %YMM5, %YMMZERO, %k2
-	kmovd	%k2, %edx
-	VMOVU	%YMM4, (%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec5)
-#  else
-	jnz	L(CopyVecSize)
 #  endif
 
-	vpcmpb	$0, %YMM6, %YMMZERO, %k3
-	kmovd	%k3, %edx
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec6)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-
-	vpcmpb	$0, %YMM7, %YMMZERO, %k4
-	kmovd	%k4, %edx
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
-	lea	VEC_SIZE(%rdi, %rcx), %rdi
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-L(StrncpyExit):
-	cmp	$65, %r8d
-	je	L(StrncpyExit65)
-	cmp	$33, %r8d
-	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8d
-	jae	L(StrncpyExit17_32)
-	cmp	$9, %r8d
-	jae	L(StrncpyExit9_16)
-	cmp	$5, %r8d
-	jae	L(StrncpyExit5_8)
-	cmp	$3, %r8d
-	jae	L(StrncpyExit3_4)
-	cmp	$1, %r8d
-	ja	L(StrncpyExit2)
-	je	L(StrncpyExit1)
-#  ifdef USE_AS_STPCPY
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi)
-#  endif
+	.p2align 4,, 4
+L(page_cross_copy_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
 	ret
 
-	.p2align 4
-L(ExitZero):
-#  ifndef USE_AS_STRCAT
-	mov	%rdi, %rax
-#  endif
+	.p2align 4,, 4
+L(page_cross_copy_8_15):
+	movq	(%rsi), %rcx
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+	movq	%rcx, (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%END_REG)
 	ret
-
-# endif
-
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
 # endif
+END(STRCPY)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
index 203a19bf21..bced4e8944 100644
--- a/sysdeps/x86_64/multiarch/strncat-evex.S
+++ b/sysdeps/x86_64/multiarch/strncat-evex.S
@@ -1,7 +1,520 @@
-#ifndef STRNCAT
-# define STRNCAT	__strncat_evex
-#endif
+/* {wcs|str}ncat  with 256/512-bit EVEX.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+# ifndef STRNCAT
+#  define STRNCAT	__strncat_evex
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+#  define MOVCHAR	movl
+#  define VMOVU_MASK	vmovdqu32
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+
+#  define REP_MOVS	rep movsd
+
+#  define VMASK_REG	VR10
+#  define FIND_FIRST_ONE(src, dst)	movl $CHAR_PER_VEC, %dst; bsf %src, %dst
+
+#  define USE_WIDE_CHAR
+# else
+#  define MOVCHAR	movb
+#  define VMOVU_MASK	vmovdqu8
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+
+#  define REP_MOVS	rep movsb
+
+#  define VMASK_REG	VRCX
+#  define FIND_FIRST_ONE(src, dst)	tzcnt %src, %dst
+
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# include "reg-macros.h"
+
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+	movq	%rdi, %rax
+
+	/* NB: It's safe to filter out zero-length strings WITHOUT
+	   setting null-term. Destination MUST be a null-terminated
+	   string so essentially the work is already done.  */
+# ifdef USE_AS_WCSCPY
+	leaq	-1(%rdx), %rcx
+	shrq	$56, %rcx
+	jnz	L(zero_len)
+# else
+	test	%rdx, %rdx
+	jle	L(zero_len)
+# endif
+
+# include "strcat-strlen-evex.h.S"
+
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPTESTN	%VMM(0), %VMM(0), %k0
+
+	/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
+	   <= CHAR_PER_VEC with masked instructions (which have
+	   potential for dramatically bad perf if dst splits a page and
+	   is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	KMOV	%k0, %VRCX
+	FIND_FIRST_ONE (VRCX, VR8)
+	cmpq	%r8, %rdx
+	jbe	L(less_1x_vec)
+
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
+
+	blsmsk	%VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
+	ret
+
+L(less_1x_vec):
+	mov	$-1, %VRCX
+	bzhi	%VRDX, %VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
+
+	ret
+# else
+	KMOV	%k0, %VMASK_REG
+	/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
+	   %VMASK_REG, %VRCX` for wcsncat.  */
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpq	%rcx, %rdx
+	jbe	L(less_1x_vec)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	je	L(more_1x_vec)
+
+	movl	%ecx, %edx
+
+L(less_1x_vec):
+#  if VEC_SIZE == 64
+	cmpl	$(32 / CHAR_SIZE), %edx
+	jae	L(copy_32_63)
+#  endif
+
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(copy_16_31)
+
+
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jae	L(copy_8_15)
+
+#  ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  else
+
+	cmpl	$4, %edx
+	jae	L(copy_4_7)
+
+	movzbl	(%rsi), %ecx
+	cmpl	$1, %edx
+	jbe	L(set_null_term)
+
+	movzwl	1(%rsi), %esi
+	movw	%si, 1(%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	movb	%cl, (%rdi)
+	MOVCHAR	$0, (%rdi, %rdx)
+	ret
+#  endif
+
+#  if VEC_SIZE == 64
+	.p2align 4,, 6
+L(copy_32_63):
+	VMOVU	-(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
+	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  endif
+	.p2align 4,, 6
+L(copy_16_31):
+	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
+	   and will save code size.  */
+	vmovdqu	-(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
+	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 2
+L(copy_8_15):
+	movq	-(8)(%rsi, %rdx, CHAR_SIZE), %rcx
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
+	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+
+#  ifndef USE_AS_WCSCPY
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-(4)(%rsi, %rdx, CHAR_SIZE), %ecx
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
+	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  endif
+
+# endif
+	.p2align 4,, 4
+L(zero_len):
+# ifdef USE_AS_WCSCPY
+	test	%rdx, %rdx
+# endif
+	jne	OVERFLOW_STRCAT
+	ret
 
-#define USE_AS_STRNCAT
-#define STRCAT	STRNCAT
-#include "strcat-evex.S"
+	.p2align 4,, 8
+L(more_1x_vec):
+	VMOVU	%VMM(0), (%rdi)
+
+	/* We are going to align rsi here so will need to be able to re-
+	   adjust rdi/rdx afterwards.  NB: We filtered out huge lengths
+	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
+
+	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+L(loop_last_4x_vec):
+	addq	%rsi, %rdi
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+
+	/* Will need this regardless.  */
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VMASK_REG
+
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	jne	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	addl	$-CHAR_PER_VEC, %edx
+	bzhi	%VRDX, %VRCX, %VR8
+	jz	L(ret_vec_x2_len)
+L(ret_vec_x2):
+	bsf	%VRCX, %VRDX
+L(ret_vec_x2_len):
+	VMOVU	(VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	MOVCHAR	$0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 4
+L(ret_vec_x1_len):
+	movl	%edx, %ecx
+L(ret_vec_x1):
+	VMOVU	(VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	MOVCHAR	$0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	addl	$-(CHAR_PER_VEC * 4), %edx
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VMASK_REG
+	subq	$-(VEC_SIZE * 4), %rsi
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+# ifdef USE_AS_WCSCPY
+	xorl	%ecx, %ecx
+# endif
+	bsf	%VMASK_REG, %VRCX
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VMASK_REG
+
+	cmpq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(more_4x_vec)
+
+	/* Adjust length before going to L(ret_vec_x3_len) or
+	   L(ret_vec_x3).  */
+	addl	$(CHAR_PER_VEC * -2), %edx
+
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len)
+
+	/* If there were no zero-CHARs (rcx was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	jne	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	addl	$-CHAR_PER_VEC, %edx
+	bzhi	%VRDX, %VRCX, %VR8
+	jz	L(ret_vec_x4_len)
+L(ret_vec_x4):
+	bsf	%VRCX, %VRDX
+L(ret_vec_x4_len):
+	VMOVU	(VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	MOVCHAR	$0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 4
+L(ret_vec_x3_len):
+	movl	%edx, %ecx
+L(ret_vec_x3):
+	VMOVU	(VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	MOVCHAR	$0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+# ifdef USE_AS_WCSCPY
+	xorl	%ecx, %ecx
+# endif
+	bsf	%VMASK_REG, %VRCX
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+
+	/* Check if we are near the end before aligning.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rdx
+	jbe	L(last_4x_vec)
+
+
+	/* Add rsi to rdx (length) before aligning rsi. NB: Since we
+	   filtered out huge lengths this cannot overflow.  */
+# ifdef USE_AS_WCSCPY
+	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rsi, %rdx
+# endif
+
+	/* Subtract rsi from rdi before aligning (add back will have
+	   correct rdi for aligned rsi).  */
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+
+	/* Offset rsi by VEC_SIZE so that we can jump to
+	   L(loop_last_4x_vec).  */
+	addq	$-(VEC_SIZE), %rsi
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	/* Store loop end in r9.  */
+	leaq	-(VEC_SIZE * 5)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+
+	/* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
+	   test with bsf.  */
+	bsf	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
+
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	bsf	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+
+	KMOV	%k4, %VRCX
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %k0
+
+# ifdef USE_AS_WCSCPY
+	KMOV	%k0, %VR9
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+	shrx	%VRCX, %VR9, %VRCX
+# else
+	KMOV	%k0, %VRCX
+	shrx	%VRSI, %VRCX, %VRCX
+# endif
+
+	subl	%esi, %r8d
+	andl	$(VEC_SIZE - 1), %r8d
+# ifdef USE_AS_WCSCPY
+	shrl	$2, %r8d
+# endif
+	cmpq	%r8, %rdx
+	jbe	L(page_cross_small)
+	/* Optimizing more for space as this is very cold code. This
+	   saves 2x cache lines.  */
+
+	/* The shift adds one to the later bsf result, which gives the
+	   correct copy bounds.  NB: this can never zero-out a non-zero
+	   RCX since to be in the page cross case rsi cannot be aligned
+	   and we already right-shifted rcx by the misalignment.  */
+	shl	%VRCX
+	jz	L(page_cross_continue)
+	bsf	%VRCX, %VRCX
+	REP_MOVS
+	ret
+
+L(page_cross_small):
+	tzcnt	%VRCX, %VRCX
+	jz	L(page_cross_setz)
+	cmpl	%edx, %ecx
+	cmova	%edx, %ecx
+
+# ifdef USE_AS_WCSCPY
+	rep	movsd
+# else
+	rep	movsb
+# endif
+L(page_cross_setz):
+	MOVCHAR	$0, (%rdi)
+	ret
+END(STRNCAT)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
index 1b3426d511..49eaf4cbd9 100644
--- a/sysdeps/x86_64/multiarch/strncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
@@ -1,7 +1,990 @@
-#ifndef STRNCPY
-# define STRNCPY	__strncpy_evex
-#endif
+/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+	/* Use evex-masked stores for small sizes. Turned off at the
+	   moment.  */
+# define USE_EVEX_MASKED_STORE	0
+
+
+# include <sysdep.h>
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNCPY
+#  define STRNCPY	__strncpy_evex
+# endif
+
+# ifdef USE_AS_WCSCPY
+#  define VMOVU_MASK	vmovdqu32
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define CHAR_SIZE	4
+
+#  define REP_MOVS	rep movsd
+#  define REP_STOS	rep stosl
+
+#  define USE_WIDE_CHAR
+
+# else
+#  define VMOVU_MASK	vmovdqu8
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define CHAR_SIZE	1
+
+#  define REP_MOVS	rep movsb
+#  define REP_STOS	rep stosb
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+
+# define VZERO	VMM(7)
+# define VZERO_256	VMM_256(7)
+# define VZERO_128	VMM_128(7)
+
+# if VEC_SIZE == 64
+#  define VZERO_HALF	VZERO_256
+# else
+#  define VZERO_HALF	VZERO_128
+# endif
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+	/* Filter zero length strings and very long strings.  Zero
+	   length strings just return; very long strings are handled by
+	   running rep stos{b|l} to zero the destination (which will
+	   almost certainly segfault) and, if that succeeds, by calling
+	   OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
+# ifdef USE_AS_WCSCPY
+	decq	%rdx
+	movq	%rdx, %rax
+	/* 56 is end of max supported address space.  */
+	shr	$56, %rax
+	jnz	L(zero_len)
+# else
+	decq	%rdx
+	/* If the flag needs to become `jb` replace `dec` with `sub`.
+	 */
+	jl	L(zero_len)
+# endif
+
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
+	movl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+
+	/* If no STPCPY just save end ahead of time.  */
+# ifndef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+
+
+	cmpq	$(CHAR_PER_VEC), %rdx
+
+	/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
+	   <= CHAR_PER_VEC with masked instructions (which have
+	   potential for dramatically bad perf if dst splits a page and
+	   is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	/* `jae` because length rdx is now length - 1.  */
+	jae	L(more_1x_vec)
+
+	/* If there were multiple zero-CHAR matches in the first VEC,
+	   VRCX will be overset but that's fine since any oversets were
+	   at zero-positions anyway.  */
+
+#  ifdef USE_AS_STPCPY
+	tzcnt	%VRCX, %VRAX
+	cmpl	%eax, %edx
+	cmovb	%edx, %eax
+#   ifdef USE_AS_WCSCPY
+	adcl	$0, %eax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#   else
+	adcq	%rdi, %rax
+#   endif
+#  endif
+	dec	%VRCX
+
+	/* Zero out all non-zero CHARs after the first zero match.  */
+	KMOV	%VRCX, %k1
+
+	/* Use VZERO as destination so this can be reused for
+	   L(zfill_less_vec) (which if jumped to by subsequent logic
+	   will have zeroed out VZERO).  */
+	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
+L(zfill_less_vec):
+	/* Get mask for what we need to set.  */
+	incl	%edx
+	mov	$-1, %VRCX
+	bzhi	%VRDX, %VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	VMOVU_MASK %VZERO, (%rdi){%k1}
+	ret
+
+	.p2align 4,, 4
+L(zero_len):
+	cmpq	$-1, %rdx
+	jne	L(best_effort_strncpy)
+	movq	%rdi, %rax
+	ret
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# else
+	/* `jb` because length rdx is now length - 1.  */
+	jb	L(less_1x_vec)
+# endif
+
+
+	/* This may overset but that's fine because we still need to zero
+	   fill.  */
+	VMOVU	%VMM(0), (%rdi)
+
+
+	/* Length must be >= CHAR_PER_VEC so match here means we must
+	   zero-fill.  */
+	test	%VRCX, %VRCX
+	jnz	L(zfill)
+
+
+	/* We are going to align rsi here so we will need to be able to
+	   re-adjust rdi/rdx afterwards.  NB: We filtered out huge lengths
+	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
+	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+
+L(loop_last_4x_vec):
+	addq	%rsi, %rdi
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+
+	/* -1 because of the `dec %rdx` earlier.  */
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	/* This will need to be computed no matter what.  We do it
+	   ahead of time for CHAR_PER_VEC == 64 because we can't adjust
+	   the value of `tzcnt` with a shift.  */
+# if CHAR_PER_VEC == 64
+	tzcntq	%rcx, %rcx
+# endif
+
+	cmpl	$(CHAR_PER_VEC), %edx
+	jb	L(ret_vec_x1_len)
+
+	/* Separate logic for CHAR_PER_VEC == 64 because we already did
+	   `tzcnt` on VRCX.  */
+# if CHAR_PER_VEC == 64
+	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
+	cmpb	$CHAR_PER_VEC, %cl
+	jnz	L(ret_vec_x1_no_bsf)
+# else
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+# endif
+
+
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %VZERO, %k0
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	KMOV	%k0, %VRCX
+
+# if CHAR_PER_VEC < 64
+	/* This essentially adds CHAR_PER_VEC to the computed result.  */
+	shlq	$CHAR_PER_VEC, %rcx
+# else
+	tzcntq	%rcx, %rcx
+	addl	$CHAR_PER_VEC, %ecx
+# endif
+
+	.p2align 4,, 4
+L(ret_vec_x1_len):
+	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
+	   already been done.  */
+# if CHAR_PER_VEC < 64
+	tzcntq	%rcx, %rcx
+# endif
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x1_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+	VMOVU	((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+	.p2align 4,, 10
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX
+L(ret_vec_x1_no_bsf):
+	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	subl	%ecx, %edx
+	cmpl	$CHAR_PER_VEC, %edx
+	jb	L(ret_vec_x1_len_no_zfill_mov)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
+	   $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
+	   using `movzbl`.  */
+# if CHAR_PER_VEC == 64
+	movzbl	%dl, %edx
+# else
+	andl	$(CHAR_PER_VEC * 4 - 1), %edx
+# endif
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+	subq	$-(VEC_SIZE * 4), %rsi
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2 - 1), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	test	%VRCX, %VRCX
+	/* Must fill at least 2x VEC.  */
+	jnz	L(zfill_vec1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	/* Must fill at least 1x VEC.  */
+	jnz	L(zfill_vec2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRCX
+
+	/* Check if len is more than 4x VEC.  -1 because rdx is len - 1.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rdx
+	ja	L(more_4x_vec)
+
+	subl	$(CHAR_PER_VEC * 3), %edx
+	jb	L(ret_vec_x3_len)
+
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rsi), %VZERO, %k0
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	KMOV	%k0, %VRCX
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x4_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	movl	%ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 4 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+L(ret_vec_x3_len):
+	addl	$(CHAR_PER_VEC * 1), %edx
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x3_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+	.p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 3 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsf	%VRCX, %VRCX
+	VMOVU	%VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
+	subl	%ecx, %edx
+	jl	L(ret_vec_x3_len_no_zfill_mov)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec4)
 
-#define USE_AS_STRNCPY
-#define STRCPY	STRNCPY
-#include "strcpy-evex.S"
+	/* Recheck length before aligning.  */
+	cmpq	$(CHAR_PER_VEC * 8 - 1), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
+# ifdef USE_AS_WCSCPY
+	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rsi, %rdx
+# endif
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+
+
+	/* Offset rsi by VEC_SIZE so that we can jump to
+	   L(loop_last_4x_vec).  */
+	addq	$-(VEC_SIZE), %rsi
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	/* Store loop end in r9.  */
+	leaq	-(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	/* Restore rdx (length).  */
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec1)
+
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec2)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec3)
+
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
+	KMOV	%k4, %VRCX
+	/* Fall through to zfill the remainder.  */
+
+	.p2align 4,, 4
+L(zfill_vec4):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -2), %rdx
+L(zfill_vec2):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -1), %rdx
+L(zfill):
+	/* VRCX must be non-zero.  */
+	bsf	%VRCX, %VRCX
+
+	/* Adjust length / dst for zfill.  */
+	subq	%rcx, %rdx
+# ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+	addq	%rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_from_page_cross):
+
+	/* From here on out its just memset(rdi, 0, rdx).  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jb	L(zfill_less_vec)
+
+L(zfill_more_1x_vec):
+	VMOVU	%VZERO, (%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
+	ja	L(zfill_more_2x_vec)
+L(zfill_done0):
+	ret
+
+	/* Coming from vec1/vec2 we must be able to zfill at least 2x
+	   VEC.  */
+	.p2align 4,, 8
+L(zfill_vec3):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -2), %rdx
+	.p2align 4,, 2
+L(zfill_vec1):
+	bsfq	%rcx, %rcx
+	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
+	 */
+	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+
+
+	VMOVU	%VZERO, (%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jb	L(zfill_done0)
+L(zfill_more_2x_vec):
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VZERO, (VEC_SIZE)(%rdi)
+	subq	$(CHAR_PER_VEC * 4 - 1), %rdx
+	jbe	L(zfill_done)
+
+# ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rdi, %rdx
+# endif
+
+	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi)
+
+
+	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	jbe	L(zfill_done)
+
+	/* Align rdi and zfill loop.  */
+	andq	$-(VEC_SIZE), %rdi
+	.p2align 4,, 12
+L(zfill_loop_4x_vec):
+	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	ja	L(zfill_loop_4x_vec)
+L(zfill_done):
+	ret
+
+
+	/* Less than 1x VEC case if we are not using evex masked store.  */
+# if !USE_EVEX_MASKED_STORE
+	.p2align 4,, 8
+L(copy_1x):
+	/* Special case for copy 1x. It can be handled quickly and many
+	   buffer sizes have convenient alignment.  */
+	VMOVU	%VMM(0), (%rdi)
+	/* If no zeros then we are done.  */
+	testl	%ecx, %ecx
+	jz	L(ret_1x_1x)
+
+	/* Need to zfill; note we know that length <= CHAR_PER_VEC so we
+	   only handle the small case here.  */
+	bsf	%VRCX, %VRCX
+L(zfill_less_vec_no_bsf):
+	/* Adjust length / dst then just zfill less_vec.  */
+	subq	%rcx, %rdx
+#  ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+
+L(zfill_less_vec):
+	cmpl	$((VEC_SIZE / 2) / CHAR_SIZE), %edx
+	jb	L(zfill_less_half)
+
+	VMOVU	%VZERO_HALF, (%rdi)
+	VMOVU	%VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	ret
+#  ifdef USE_AS_STPCPY
+L(ret_1x_1x):
+	leaq	CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
+	ret
+#  endif
+
+
+#  if VEC_SIZE == 64
+	.p2align 4,, 4
+L(copy_32_63):
+	/* Overfill to avoid branches.  */
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+
+	/* We are taking advantage of the fact that to be here we must
+	   be writing the null-term at (%rdi, %rcx), so we have a byte
+	   of leeway for overwriting.  */
+	cmpl	%ecx, %edx
+	ja	L(zfill_less_vec_no_bsf)
+#   ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+#   else
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  endif
+
+	.p2align 4,, 4
+L(copy_16_31):
+	/* Overfill to avoid branches.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpl	%ecx, %edx
+
+	/* Separate logic depending on VEC_SIZE.  If VEC_SIZE == 64 then
+	   we have a larger copy block for 32-63 so this just falls
+	   through to zfill 16-31.  If VEC_SIZE == 32 then we check for
+	   full zfill of less than 1x VEC.  */
+#  if VEC_SIZE == 64
+	jbe	L(ret_16_31)
+	subl	%ecx, %edx
+#   ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#   else
+	addq	%rcx, %rdi
+#   endif
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+L(zfill_less_half):
+L(zfill_less_32):
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jb	L(zfill_less_16)
+	VMOVU	%VZERO_128, (%rdi)
+	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+#   ifdef USE_AS_STPCPY
+	ret
+#   endif
+L(ret_16_31):
+#   ifdef USE_AS_STPCPY
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  else
+	/* VEC_SIZE == 32 begins.  */
+	ja	L(zfill_less_vec_no_bsf)
+#   ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+#   else
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  endif
+
+
+	.p2align 4,, 4
+L(copy_8_15):
+	/* Overfill to avoid branches.  */
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpl	%ecx, %edx
+	jbe	L(ret_8_15)
+	subl	%ecx, %edx
+#  ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+	.p2align 4,, 8
+#  if VEC_SIZE == 32
+L(zfill_less_half):
+#  endif
+L(zfill_less_16):
+	xorl	%ecx, %ecx
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jb	L(zfill_less_8)
+	movq	%rcx, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+#  ifndef USE_AS_STPCPY
+L(ret_8_15):
+#  endif
+	ret
+
+	.p2align 4,, 8
+L(less_1x_vec):
+	je	L(copy_1x)
+
+	/* We will need `tzcnt` result for all other copy sizes.  */
+	tzcnt	%VRCX, %VRCX
+#  if VEC_SIZE == 64
+	cmpl	$(32 / CHAR_SIZE), %edx
+	jae	L(copy_32_63)
+#  endif
+
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(copy_16_31)
+
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jae	L(copy_8_15)
+#  ifdef USE_AS_WCSCPY
+	testl	%ecx, %ecx
+	jz	L(zfill_less_8_set_ret)
+
+	movl	(%rsi, %rdx, CHAR_SIZE), %esi
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%esi, (%rdi, %rdx, CHAR_SIZE)
+#   ifdef USE_AS_STPCPY
+	cmpl	%ecx, %edx
+L(ret_8_15):
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#   endif
+	ret
+L(zfill_less_8_set_ret):
+	xorl	%ecx, %ecx
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+L(zfill_less_8):
+	movl	%ecx, (%rdi)
+	movl	%ecx, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  else
+	cmpl	$3, %edx
+	jb	L(copy_0_3)
+	/* Overfill to avoid branches.  */
+	movl	-3(%rsi, %rdx), %esi
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%esi, -3(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_4_7)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+	xorl	%ecx, %ecx
+	.p2align 4,, 8
+L(zfill_less_8):
+	cmpl	$3, %edx
+	jb	L(zfill_less_3)
+	movl	%ecx, (%rdi)
+	movl	%ecx, -3(%rdi, %rdx)
+#   ifdef USE_AS_STPCPY
+	ret
+#   endif
+
+L(ret_4_7):
+#   ifdef USE_AS_STPCPY
+L(ret_8_15):
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#   endif
+	ret
+
+	.p2align 4,, 4
+L(zfill_less_3):
+	testl	%edx, %edx
+	jz	L(zfill_1)
+	movw	%cx, (%rdi)
+L(zfill_1):
+	movb	%cl, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_0_3):
+	vmovd	%VMM_128(0), %r8d
+	testl	%edx, %edx
+	jz	L(copy_1)
+	movw	%r8w, (%rdi)
+	cmpl	%ecx, %edx
+	ja	L(zfill_from_1)
+	movzbl	(%rsi, %rdx), %r8d
+#   ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+	movb	%r8b, (%rdi, %rdx)
+	ret
+#   endif
+
+L(copy_1):
+#   ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	cmpl	%ecx, %edx
+	adcq	%rdi, %rax
+#   endif
+#   ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+#   else
+	movb	%r8b, (%rdi, %rdx)
+#   endif
+	ret
+#  endif
+
+
+#  ifndef USE_AS_WCSCPY
+	.p2align 4,, 8
+L(zfill_from_1):
+#   ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+#   endif
+	movw	$0, -1(%rdi, %rdx)
+	ret
+#  endif
+
+	.p2align 4,, 4
+L(zero_len):
+	incq	%rdx
+	jne	L(best_effort_strncpy)
+	movq	%rdi, %rax
+	ret
+# endif
+
+
+	.p2align 4,, 4
+	.p2align 6,, 8
+L(page_cross):
+	movq	%rsi, %rax
+	andq	$(VEC_SIZE * -1), %rax
+	VPCMPEQ	(%rax), %VZERO, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSCPY
+	movl	%esi, %r8d
+	shrl	$2, %r8d
+	andl	$(CHAR_PER_VEC - 1), %r8d
+	shrx	%VR8, %VRCX, %VRCX
+# else
+	shrx	%VRSI, %VRCX, %VRCX
+# endif
+
+	/* Compute the number of bytes we checked.  */
+	subl	%esi, %eax
+	andl	$(VEC_SIZE - 1), %eax
+# ifdef USE_AS_WCSCPY
+	shrl	$2, %eax
+# endif
+
+	/* If rax > rdx then we are finishing the copy at the end of the
+	   page.  */
+	cmpq	%rax, %rdx
+	jb	L(page_cross_small)
+
+
+	/* If rcx is non-zero then continue.  */
+	test	%VRCX, %VRCX
+	jz	L(page_cross_continue)
+
+	/* We found zero-CHAR so need to copy then zfill (we know we
+	   didn't cover all of length here).  */
+	bsf	%VRCX, %VRCX
+L(movsb_and_zfill):
+	incl	%ecx
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	leaq	-CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
+# else
+	movq	%rdi, %rax
+# endif
+
+	REP_MOVS
+# ifdef USE_AS_WCSCPY
+	movl	$0, (%rdi)
+# else
+	movb	$0, (%rdi)
+# endif
+	jmp	L(zfill_from_page_cross)
+
+L(page_cross_small):
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(page_cross_copy_only)
+
+	/* Do a zfill of the tail before copying.  */
+	movq	%rdi, %r9
+	xorl	%eax, %eax
+
+	movl	%ecx, %r8d
+
+	subl	%ecx, %edx
+	leaq	CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+	movl	%edx, %ecx
+	REP_STOS
+	movq	%r9, %rdi
+	movl	%r8d, %edx
+L(page_cross_copy_only):
+	leal	1(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcl	$0, %edx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# else
+	movq	%rdi, %rax
+# endif
+	REP_MOVS
+	ret
+
+
+L(best_effort_strncpy):
+	movq	%rdx, %rcx
+	xorl	%eax, %eax
+	movq	%rdi, %r8
+	/* The length is >= 2^63.  We very much expect to segfault at
+	   rep stos. If that doesn't happen then just strcpy to finish.
+	 */
+	REP_STOS
+	movq	%r8, %rdi
+	jmp	OVERFLOW_STRCPY
+END(STRNCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
new file mode 100644
index 0000000000..d4f4d6c82b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
@@ -0,0 +1,80 @@
+/* Helper for getting proper name of overflow fallback function for
+   {wc|st}{p|r|s}n{cat|cpy}
+
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
+#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
+
+#if defined USE_MULTIARCH && IS_IN(libc)
+#  define UNDERSCORES __
+#  ifdef USE_WITH_SSE2
+#    define ISA_EXT _sse2
+#  elif defined USE_WITH_AVX2
+#    ifdef USE_WITH_RTM
+#      define ISA_EXT _avx2_rtm
+#    else
+#      define ISA_EXT _avx2
+#    endif
+
+#  elif defined USE_WITH_EVEX256
+#    define ISA_EXT _evex
+#  elif defined USE_WITH_EVEX512
+#    define ISA_EXT _evex512
+#  endif
+#else
+#  define UNDERSCORES
+#  define ISA_EXT
+#endif
+
+#ifdef USE_AS_WCSCPY
+#  define STRCPY_PREFIX wc
+#  define STRCAT_PREFIX wcs
+#  ifdef USE_AS_STPCPY
+#    define STRCPY_POSTFIX pcpy
+#  else
+#    define STRCPY_POSTFIX scpy
+#  endif
+#else
+#  define STRCPY_PREFIX st
+#  define STRCAT_PREFIX str
+#  ifdef USE_AS_STPCPY
+#    define STRCPY_POSTFIX pcpy
+#  else
+#    define STRCPY_POSTFIX rcpy
+#  endif
+#endif
+#define STRCAT_POSTFIX cat
+
+#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
+  underscores##prefix##postfix##ext
+
+#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
+
+#ifndef OVERFLOW_STRCPY
+#  define OVERFLOW_STRCPY                                                     \
+    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
+#endif
+
+#ifndef OVERFLOW_STRCAT
+#  define OVERFLOW_STRCAT                                                     \
+    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
+#endif
+
+#endif
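
As a standalone illustration (not part of the patch) of how the token-pasting
above composes a fallback name: assuming the multiarch EVEX256 build (so
UNDERSCORES is __ and ISA_EXT is _evex) with neither USE_AS_WCSCPY nor
USE_AS_STPCPY defined, OVERFLOW_STRCPY composes to __strcpy_evex.  The
STR/STR_ helpers below exist only so the example can print the name; they are
not in the header.

    #include <stdio.h>

    /* Same composition scheme as in the header above.  */
    #define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \
      underscores##prefix##postfix##ext
    #define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)

    /* Stringification helpers, only for printing the composed name.  */
    #define STR_(x) #x
    #define STR(x) STR_ (x)

    int
    main (void)
    {
      /* __ ## st ## rcpy ## _evex -> __strcpy_evex  */
      puts (STR (OF_NAMER (__, st, rcpy, _evex)));
      return 0;
    }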
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v5 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions
  2022-11-09  1:38   ` [PATCH v5 " Noah Goldstein
@ 2022-11-09  1:38     ` Noah Goldstein
  2022-11-09  3:00       ` H.J. Lu
  2022-11-09  1:38     ` [PATCH v5 3/4] x86: Add evex optimized functions for the wchar_t strcpy family Noah Goldstein
                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 42+ messages in thread
From: Noah Goldstein @ 2022-11-09  1:38 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
    1. Use more overlapping stores to avoid branches (see the sketch
       below).
    2. Reduce how unrolled the aligning copies are (this is more of a
       code-size save; it's a negative for some sizes in terms of
       perf).
    3. For st{r|p}n{cat|cpy} re-order the branches to minimize the
       number that are taken.
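
    As a rough illustration of (1), here is a minimal, hypothetical C
    sketch (not part of this patch) of copying an 8..16 byte tail with
    two possibly-overlapping 8-byte stores instead of branching on the
    exact size; the assembly applies the same idea with 16/32-byte
    vector loads/stores (e.g. in L(copy_16_31)).  The helper name is
    illustrative only.

        #include <stdint.h>
        #include <string.h>

        /* Copy n bytes (8 <= n <= 16) with two fixed-size stores.  The
           second store starts at dst + n - 8, so for n < 16 it overlaps
           the first; the overlapping bytes are identical, so no branch
           on the exact size is needed.  */
        static void
        copy_tail_8_16 (char *dst, const char *src, size_t n)
        {
          uint64_t head, tail;
          memcpy (&head, src, 8);
          memcpy (&tail, src + n - 8, 8);
          memcpy (dst, &head, 8);
          memcpy (dst + n - 8, &tail, 8);
        }

        int
        main (void)
        {
          char buf[17] = { 0 };
          copy_tail_8_16 (buf, "overlapping copy", 16);
          return 0;
        }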

Performance Changes:

    Times are from N = 10 runs of the benchmark suite and are
    reported as geometric mean of all ratios of
    New Implementation / Old Implementation.

    strcat-avx2      -> 0.998
    strcpy-avx2      -> 0.937
    stpcpy-avx2      -> 0.971

    strncpy-avx2     -> 0.793
    stpncpy-avx2     -> 0.775

    strncat-avx2     -> 0.962
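
    As a small standalone sketch of how the geometric-mean aggregation
    above works (placeholder ratios, not the actual benchmark data):

        #include <math.h>
        #include <stdio.h>

        int
        main (void)
        {
          /* Placeholder new/old timing ratios, not measured values.  */
          double ratios[] = { 0.91, 1.02, 0.85, 0.97 };
          size_t n = sizeof (ratios) / sizeof (ratios[0]);
          double log_sum = 0.0;
          for (size_t i = 0; i < n; i++)
            log_sum += log (ratios[i]);
          /* Geometric mean = exp of the mean of the logs.  */
          printf ("geomean = %.3f\n", exp (log_sum / n));
          return 0;
        }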

Code Size Changes:
    function         -> Bytes New / Bytes Old -> Ratio

    strcat-avx2      ->  685 / 1639 -> 0.418
    strcpy-avx2      ->  560 /  903 -> 0.620
    stpcpy-avx2      ->  592 /  939 -> 0.630

    strncpy-avx2     -> 1176 / 2390 -> 0.492
    stpncpy-avx2     -> 1268 / 2438 -> 0.520

    strncat-avx2     -> 1042 / 2563 -> 0.407

Notes:
    1. Because of the significant difference between the
       implementations they are split into three files.

           strcpy-avx2.S    -> strcpy, stpcpy, strcat
           strncpy-avx2.S   -> strncpy
           strncat-avx2.S   -> strncat

       I couldn't find a way to merge them without making the
       ifdefs incredibly difficult to follow.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
 sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S    |    6 +-
 sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S   |    7 +-
 sysdeps/x86_64/multiarch/stpncpy-avx2.S       |    5 +-
 sysdeps/x86_64/multiarch/strcat-avx2-rtm.S    |   13 +-
 sysdeps/x86_64/multiarch/strcat-avx2.S        |  268 +---
 .../x86_64/multiarch/strcat-strlen-avx2.h.S   |  101 ++
 sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S    |   13 +-
 sysdeps/x86_64/multiarch/strcpy-avx2.S        | 1236 +++++------------
 sysdeps/x86_64/multiarch/strncat-avx2-rtm.S   |    6 +-
 sysdeps/x86_64/multiarch/strncat-avx2.S       |  424 +++++-
 sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S   |    6 +-
 sysdeps/x86_64/multiarch/strncpy-avx2.S       |  740 +++++++++-
 sysdeps/x86_64/multiarch/x86-avx-vecs.h       |    3 +-
 13 files changed, 1594 insertions(+), 1234 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S

diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
index 2b9c07a59f..90e532dbe8 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPCPY	__stpcpy_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
+#include "stpcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
index 60a2ccfe53..46ee07be36 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
@@ -1,4 +1,3 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPNCPY	__stpncpy_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
+#include "stpncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
index b2f8c19143..a46a8edbe2 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
@@ -3,6 +3,5 @@
 #endif
 
 #define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY	STPNCPY
-#include "strcpy-avx2.S"
+#define STRNCPY	STPNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
index 637fb557c4..e84f4f1fef 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
@@ -1,12 +1,3 @@
-#ifndef STRCAT
-# define STRCAT __strcat_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCAT	__strcat_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
 #include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
index d9b7fb2a43..3f914fa342 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
@@ -16,266 +16,10 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (3)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-#  define STRCAT  __strcat_avx2
-# endif
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE	32
-
-# ifndef SECTION
-#  define SECTION(p)	p##.avx
-# endif
-
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCAT)
-	mov	%rdi, %r9
-# ifdef USE_AS_STRNCAT
-	mov	%rdx, %r8
-# endif
-
-	xor	%eax, %eax
-	mov	%edi, %ecx
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	vpxor	%xmm6, %xmm6, %xmm6
-	cmp	$(VEC_SIZE * 3), %ecx
-	ja	L(fourth_vector_boundary)
-	vpcmpeqb (%rdi), %ymm6, %ymm0
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_first_vector)
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	jmp	L(align_vec_size_start)
-L(fourth_vector_boundary):
-	mov	%rdi, %rax
-	and	$-VEC_SIZE, %rax
-	vpcmpeqb	(%rax), %ymm6, %ymm0
-	mov	$-1, %r10d
-	sub	%rax, %rcx
-	shl	%cl, %r10d
-	vpmovmskb %ymm0, %edx
-	and	%r10d, %edx
-	jnz	L(exit)
-
-L(align_vec_size_start):
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 4), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fifth_vector)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
-	add	$(VEC_SIZE * 5), %rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$((VEC_SIZE * 4) - 1), %rax
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
-	add	$VEC_SIZE, %rax
-	vpmovmskb %ymm3, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	add	$VEC_SIZE, %rax
-
-	.p2align 4
-L(align_four_vec_loop):
-	vmovaps	(%rax),	%ymm4
-	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
-	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
-	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
-	add	$(VEC_SIZE * 4),	%rax
-	vpminub	%ymm4,	%ymm5, %ymm5
-	vpcmpeqb %ymm5,	%ymm6, %ymm5
-	vpmovmskb %ymm5,	%edx
-	test	%edx,	%edx
-	jz	L(align_four_vec_loop)
-
-	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
-	sub	$(VEC_SIZE * 5),	%rax
-	vpmovmskb %ymm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_second_vector)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
-	vpmovmskb %ymm1, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_third_vector)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
-	vpmovmskb %ymm2, %edx
-	test	%edx, %edx
-	jnz	L(exit_null_on_fourth_vector)
-
-	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
-	vpmovmskb %ymm3, %edx
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit):
-	sub	%rdi, %rax
-L(exit_null_on_first_vector):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_second_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$VEC_SIZE, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_third_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 2), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fourth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 3), %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_null_on_fifth_vector):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$(VEC_SIZE * 4), %rax
-
-	.p2align 4
-L(StartStrcpyPart):
-	lea	(%r9, %rax), %rdi
-	mov	%rsi, %rcx
-	mov	%r9, %rax      /* save result */
-
-# ifdef USE_AS_STRNCAT
-	test	%r8, %r8
-	jz	L(ExitZero)
-#  define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-avx2.S"
+#ifndef STRCAT
+# define STRCAT	__strcat_avx2
 #endif
+
+#define USE_AS_STRCAT
+#define STRCPY	STRCAT
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S
new file mode 100644
index 0000000000..f50514e07c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S
@@ -0,0 +1,101 @@
+/* strlen used for beginning of str{n}cat using AVX2.
+   Copyright (C) 2011-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+/* NOTE: This file is meant to be included by strcat-avx2 or
+   strncat-avx2 and does not stand alone.  Before including it,
+   %rdi must be saved in %rax.  */
+
+
+/* Simple strlen implementation that ends at
+   L(strcat_strlen_done).  */
+	movq	%rdi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	VEC_SIZE(%r8), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	leaq	(VEC_SIZE)(%r8), %rdi
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v1)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v2)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v3)
+
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	.p2align 4,, 8
+L(loop_2x_vec):
+	VMOVA	(VEC_SIZE * 0 + 1)(%rdi), %VMM(0)
+	VPMIN	(VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rdi), %VMM(2)
+	VPMIN	(VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3)
+	VPMIN	%VMM(1), %VMM(3), %VMM(3)
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(3)
+	vpmovmskb %VMM(3), %r8d
+	subq	$(VEC_SIZE * -4), %rdi
+	testl	%r8d, %r8d
+	jz	L(loop_2x_vec)
+
+	addq	$(VEC_SIZE * -4 + 1), %rdi
+
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(0)
+	vpmovmskb %VMM(0), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v0)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(1)
+	vpmovmskb %VMM(1), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v1)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(2)
+	vpmovmskb %VMM(2), %ecx
+	testl	%ecx, %ecx
+	jnz	L(bsf_and_done_v2)
+
+	movl	%r8d, %ecx
+L(bsf_and_done_v3):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v2):
+	bsfl	%ecx, %ecx
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rdi
+	jmp	L(strcat_strlen_done)
+
+	.p2align 4,, 4
+L(bsf_and_done_v1):
+	addq	$VEC_SIZE, %rdi
+L(bsf_and_done_v0):
+	bsfl	%ecx, %ecx
+	addq	%rcx, %rdi
+L(strcat_strlen_done):
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
index c2c581ecf7..3ae2de8ea9 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
@@ -1,12 +1,3 @@
-#ifndef STRCPY
-# define STRCPY __strcpy_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCPY	__strcpy_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
 #include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index c725834929..32f86baa4c 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -20,984 +20,378 @@
 
 #if ISA_SHOULD_BUILD (3)
 
+# include <sysdep.h>
 
-# ifndef USE_AS_STRCAT
-#  include <sysdep.h>
-
-#  ifndef STRCPY
-#   define STRCPY  __strcpy_avx2
-#  endif
-
-# endif
-
-/* Number of bytes in a vector register */
 # ifndef VEC_SIZE
-#  define VEC_SIZE	32
-# endif
-
-# ifndef VZEROUPPER
-#  define VZEROUPPER	vzeroupper
-# endif
-
-# ifndef SECTION
-#  define SECTION(p)	p##.avx
-# endif
-
-/* zero register */
-#define xmmZ	xmm0
-#define ymmZ	ymm0
-
-/* mask register */
-#define ymmM	ymm1
-
-# ifndef USE_AS_STRCAT
-
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCPY)
-#  ifdef USE_AS_STRNCPY
-	mov	%RDX_LP, %R8_LP
-	test	%R8_LP, %R8_LP
-	jz	L(ExitZero)
-#  endif
-	mov	%rsi, %rcx
-#  ifndef USE_AS_STPCPY
-	mov	%rdi, %rax      /* save result */
-#  endif
-
+#  include "x86-avx-vecs.h"
 # endif
 
-	vpxor	%xmmZ, %xmmZ, %xmmZ
-
-	and	$((VEC_SIZE * 4) - 1), %ecx
-	cmp	$(VEC_SIZE * 2), %ecx
-	jbe	L(SourceStringAlignmentLessTwoVecSize)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-
-	vpcmpeqb (%rsi), %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	shr	%cl, %rdx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	mov	$VEC_SIZE, %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  else
-	mov	$(VEC_SIZE + 1), %r10
-	sub	%rcx, %r10
-	cmp	%r10, %r8
-#  endif
-	jbe	L(CopyVecSizeTailCase2OrCase3)
+# ifndef STRCPY
+#  define STRCPY	__strcpy_avx2
 # endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail)
 
-	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
-	vpmovmskb %ymm2, %edx
+	/* Use movsb in page cross case to save code size.  */
+# define USE_MOVSB_IN_PAGE_CROSS	1
 
-# ifdef USE_AS_STRNCPY
-	add	$VEC_SIZE, %r10
-	cmp	%r10, %r8
-	jbe	L(CopyTwoVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize)
-
-	vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
-	vmovdqu %ymm2, (%rdi)
-
-/* If source address alignment != destination address alignment */
-	.p2align 4
-L(UnalignVecSizeBoth):
-	sub	%rcx, %rdi
-# ifdef USE_AS_STRNCPY
-	add	%rcx, %r8
-	sbb	%rcx, %rcx
-	or	%rcx, %r8
-# endif
-	mov	$VEC_SIZE, %rcx
-	vmovdqa (%rsi, %rcx), %ymm2
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 3), %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_WCSCPY
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
-	jnz	L(CopyVecSize)
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define PAGE_SIZE	4096
 
-	vmovdqu %ymm3, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
+# ifdef USE_AS_STPCPY
+#  define END_REG	rax
 # else
-	jnz	L(CopyVecSize)
+#  define END_REG	rdi, %rdx
 # endif
 
-	vmovdqu %ymm4, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STRCAT
+#  define PAGE_ALIGN_REG	ecx
 # else
-	jnz	L(CopyVecSize)
+#  define PAGE_ALIGN_REG	eax
 # endif
 
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec2)
-# else
-	jnz	L(CopyVecSize)
-# endif
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
 
-	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
-	vmovdqu %ymm2, (%rdi, %rcx)
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-# endif
-	test	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec3)
-# else
-	jnz	L(CopyVecSize)
-# endif
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
 
-	vmovdqu %ymm3, (%rdi, %rcx)
-	mov	%rsi, %rdx
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	and	$-(VEC_SIZE * 4), %rsi
-	sub	%rsi, %rdx
-	sub	%rdx, %rdi
-# ifdef USE_AS_STRNCPY
-	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
-	vmovdqa (%rsi), %ymm4
-	vmovdqa VEC_SIZE(%rsi), %ymm5
-	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
-	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
-	vpminub %ymm5, %ymm4, %ymm2
-	vpminub %ymm7, %ymm6, %ymm3
-	vpminub %ymm2, %ymm3, %ymm3
-	vpcmpeqb %ymmM, %ymm3, %ymm3
-	vpmovmskb %ymm3, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
-	add	$(VEC_SIZE * 4), %rdi
-	add	$(VEC_SIZE * 4), %rsi
-	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
-	vmovdqa (%rsi), %ymm4
-	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
-	vmovdqa VEC_SIZE(%rsi), %ymm5
-	vpminub %ymm5, %ymm4, %ymm2
-	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
-	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
-	vmovdqu %ymm7, -VEC_SIZE(%rdi)
-	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
-	vpminub %ymm7, %ymm6, %ymm3
-	vpminub %ymm2, %ymm3, %ymm3
-	vpcmpeqb %ymmM, %ymm3, %ymm3
-	vpmovmskb %ymm3, %edx
-# ifdef USE_AS_STRNCPY
-	sub	$(VEC_SIZE * 4), %r8
-	jbe	L(UnalignedLeaveCase2OrCase3)
-# endif
-	test	%edx, %edx
-	jz	L(UnalignedFourVecSizeLoop_start)
-
-L(UnalignedFourVecSizeLeave):
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_0)
-
-	vpcmpeqb %ymm5, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %ecx
-	test	%ecx, %ecx
-	jnz	L(CopyVecSizeUnaligned_16)
-
-	vpcmpeqb %ymm6, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	test	%edx, %edx
-	jnz	L(CopyVecSizeUnaligned_32)
-
-	vpcmpeqb %ymm7, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %ecx
-	bsf	%ecx, %edx
-	vmovdqu %ymm4, (%rdi)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	add	$(VEC_SIZE * 3), %rsi
-	add	$(VEC_SIZE * 3), %rdi
-	jmp	L(CopyVecSizeExit)
+# ifdef USE_AS_STRCAT
+	movq	%rdi, %rax
+#  include "strcat-strlen-avx2.h.S"
 # endif
 
-/* If source address alignment == destination address alignment */
-
-L(SourceStringAlignmentLessTwoVecSize):
-	vmovdqu (%rsi), %ymm3
-	vmovdqu VEC_SIZE(%rsi), %ymm2
-	vpcmpeqb %ymm3, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$VEC_SIZE, %r8
-#  else
-	cmp	$(VEC_SIZE + 1), %r8
-#  endif
-	jbe	L(CopyVecSizeTail1Case2OrCase3)
+	movl	%esi, %PAGE_ALIGN_REG
+	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
+	ja	L(page_cross)
+L(page_cross_continue):
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 # endif
-	test	%edx, %edx
-	jnz	L(CopyVecSizeTail1)
-
-	vmovdqu %ymm3, (%rdi)
-	vpcmpeqb %ymm2, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-
-# ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
-	cmp	$(VEC_SIZE * 2), %r8
-#  else
-	cmp	$((VEC_SIZE * 2) + 1), %r8
-#  endif
-	jbe	L(CopyTwoVecSize1Case2OrCase3)
-# endif
-	test	%edx, %edx
-	jnz	L(CopyTwoVecSize1)
-
-	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %ecx
-	jmp	L(UnalignVecSizeBoth)
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
 
-/*------End of main part with loops---------------------*/
+	testl	%ecx, %ecx
+	jz	L(more_1x_vec)
 
-/* Case1 */
+	/* No longer need ymm registers, so vzeroupper now so that it
+	   doesn't need to be duplicated at each return statement.  */
+	COND_VZEROUPPER
 
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
-	.p2align 4
-L(CopyVecSize):
-	add	%rcx, %rdi
-# endif
-L(CopyVecSizeTail):
-	add	%rcx, %rsi
-L(CopyVecSizeTail1):
-	bsf	%edx, %edx
-L(CopyVecSizeExit):
-	cmp	$32, %edx
-	jae	L(Exit32_63)
-	cmp	$16, %edx
-	jae	L(Exit16_31)
-	cmp	$8, %edx
-	jae	L(Exit8_15)
-	cmp	$4, %edx
-	jae	L(Exit4_7)
-	cmp	$3, %edx
-	je	L(Exit3)
-	cmp	$1, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	movb	$0, (%rdi)
+	xorl	%edx, %edx
+	bsfl	%ecx, %edx
 # ifdef USE_AS_STPCPY
-	lea	(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$1, %r8
-	lea	1(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
-
-	.p2align 4
-L(CopyTwoVecSize1):
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$VEC_SIZE, %r8
-# endif
-	jmp	L(CopyVecSizeTail1)
-
-	.p2align 4
-L(CopyTwoVecSize):
-	bsf	%edx, %edx
-	add	%rcx, %rsi
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	jmp	L(CopyVecSizeExit)
-
-	.p2align 4
-L(CopyVecSizeUnaligned_0):
-	bsf	%edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm4, (%rdi)
-	add	$((VEC_SIZE * 4) - 1), %r8
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
-# else
-	jmp	L(CopyVecSizeExit)
-# endif
-
-	.p2align 4
-L(CopyVecSizeUnaligned_16):
-	bsf	%ecx, %edx
-	vmovdqu %ymm4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	VEC_SIZE(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	add	$((VEC_SIZE * 3) - 1), %r8
-	sub	%rdx, %r8
-	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
+	leaq	(%rdi, %rdx), %rax
+# endif
+
+	/* Use mask bits in rcx to detect which copy we need. If the low
+	   mask is zero then there must be a bit set in the upper half.
+	   I.e. if ecx != 0 and cx == 0, then the match must be in the
+	   upper 16 bits so we use L(copy_16_31).  */
+	testw	%cx, %cx
+	jz	L(copy_16_31)
+
+	testb	%cl, %cl
+	jz	L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+	vmovd	%xmm0, (%rdi)
+	movl	$0, (%END_REG)
+	ret
 # else
-	add	$VEC_SIZE, %rsi
-	add	$VEC_SIZE, %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
-
-	.p2align 4
-L(CopyVecSizeUnaligned_32):
-	bsf	%edx, %edx
-	vmovdqu %ymm4, (%rdi)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
-# endif
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	add	$((VEC_SIZE * 2) - 1), %r8
-	sub	%rdx, %r8
-	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
-	jmp	L(StrncpyFillTailWithZero)
+	testb	$0x7, %cl
+	jz	L(copy_4_7)
+
+	testl	%edx, %edx
+	jz	L(set_null_term)
+	vmovd	%xmm0, %ecx
+	movw	%cx, (%rdi)
+
+	.p2align 4,, 2
+L(set_null_term):
+	movb	$0, (%END_REG)
+	ret
+
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-3(%rsi, %rdx), %ecx
+	vmovd	%xmm0, (%rdi)
+	movl	%ecx, -3(%END_REG)
+	ret
+# endif
+
+	.p2align 4,, 10
+L(copy_16_31):
+	VMOVU	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
+	ret
+
+	.p2align 4,, 10
+L(copy_8_15):
+# ifdef USE_AS_WCSCPY
+	movl	-(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
 # else
-	add	$(VEC_SIZE * 2), %rsi
-	add	$(VEC_SIZE * 2), %rdi
-	jmp	L(CopyVecSizeExit)
-# endif
-
-# ifdef USE_AS_STRNCPY
-#  ifndef USE_AS_STRCAT
-	.p2align 4
-L(CopyVecSizeUnalignedVec6):
-	vmovdqu %ymm6, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec5):
-	vmovdqu %ymm5, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec4):
-	vmovdqu %ymm4, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec3):
-	vmovdqu %ymm3, (%rdi, %rcx)
-	jmp	L(CopyVecSizeVecExit)
-#  endif
-
-/* Case2 */
-
-	.p2align 4
-L(CopyVecSizeCase2):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	add	$VEC_SIZE, %edx
-	sub	%ecx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTailCase2):
-	add	%rcx, %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-L(CopyVecSizeTail1Case2):
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-	jmp	L(StrncpyExit)
-
-/* Case2 or Case3,  Case3 */
-
-	.p2align 4
-L(CopyVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeCase2)
-L(CopyVecSizeCase3):
-	add	$VEC_SIZE, %r8
-	add	%rcx, %rdi
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSizeCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyTwoVecSizeCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyVecSizeTailCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTailCase2)
-	add	%rcx, %rsi
-	jmp	L(StrncpyExit)
-
-	.p2align 4
-L(CopyTwoVecSize1Case2OrCase3):
-	add	$VEC_SIZE, %rdi
-	add	$VEC_SIZE, %rsi
-	sub	$VEC_SIZE, %r8
-L(CopyVecSizeTail1Case2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(CopyVecSizeTail1Case2)
-	jmp	L(StrncpyExit)
-# endif
-
-/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
-
-	.p2align 4
-L(Exit1):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$2, %r8
-	lea	2(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit2):
-	movzwl	(%rsi), %ecx
-	mov	%cx, (%rdi)
-	movb	$0, 2(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$3, %r8
-	lea	3(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit3):
-	mov	(%rsi), %edx
-	mov	%edx, (%rdi)
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
+# endif
+	vmovq	%xmm0, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
+	ret
+
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rdi)
+# endif
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	addq	%rsi, %rdi
+	VMOVA	1(%rsi), %VMM(1)
+
+	/* Try and order stores after as many loads as is reasonable to
+	   avoid potential false dependencies.  */
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	VMOVU	%VMM(0), (%rax)
+# endif
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE + 1)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), 1(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE + 1)(%rdi)
+
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %edx
+	testl	%edx, %edx
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 3 + 1)(%rdi)
+
+	/* Subtract rsi from rdi before aligning. Adding back rsi will
+	   get proper rdi (dst) for new src.  */
+	subq	%rsi, %rdi
+	incq	%rsi
+	orq	$(VEC_SIZE * 4 - 1), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %edx
+	addq	%rsi, %rdi
+
+	testl	%edx, %edx
+	jnz	L(loop_4x_done)
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 1)(%rdi)
+
+
+	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %edx
+	subq	$(VEC_SIZE * -4), %rdi
+	testl	%edx, %edx
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
+L(ret_vec_x4):
+	bsfl	%edx, %edx
+	VMOVU	((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
 # ifdef USE_AS_STPCPY
-	lea	3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$4, %r8
-	lea	4(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
 # endif
+L(return_end):
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(Exit4_7):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	-3(%rsi, %rdx), %ecx
-	mov	%ecx, -3(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x1):
+	bsfl	%ecx, %ecx
+	VMOVU	(1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	1(%rcx, %rdi), %rax
 # endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Exit8_15):
-	mov	(%rsi), %rcx
-	mov	-7(%rsi, %rdx), %r9
-	mov	%rcx, (%rdi)
-	mov	%r9, -7(%rdi, %rdx)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(Exit16_31):
-	vmovdqu (%rsi), %xmm2
-	vmovdqu -15(%rsi, %rdx), %xmm3
-	vmovdqu %xmm2, (%rdi)
-	vmovdqu %xmm3, -15(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x2):
+	bsfl	%ecx, %ecx
+	VMOVU	((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub %rdx, %r8
-	sub $1, %r8
-	lea 1(%rdi, %rdx), %rdi
-	jnz L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
 # endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(Exit32_63):
-	vmovdqu (%rsi), %ymm2
-	vmovdqu -31(%rsi, %rdx), %ymm3
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, -31(%rdi, %rdx)
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsfl	%ecx, %ecx
+	VMOVU	((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
 # ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	%rdx, %r8
-	sub	$1, %r8
-	lea	1(%rdi, %rdx), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	leaq	(VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
 # endif
 	VZEROUPPER_RETURN
 
-# ifdef USE_AS_STRNCPY
 
-	.p2align 4
-L(StrncpyExit1):
-	movzbl	(%rsi), %edx
-	mov	%dl, (%rdi)
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %rcx
+	andq	$(VEC_SIZE * -1), %rcx
+
+	VPCMPEQ	(%rcx), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+# if USE_MOVSB_IN_PAGE_CROSS
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* This adds CHAR_SIZE to the later bsf result, which gives the
+	   correct copy bounds. NB: this can never zero-out a non-zero
+	   RCX because to be in the page cross case rsi cannot be
+	   aligned and we already right-shift rcx by the misalignment.  */
+	shll	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsfl	%ecx, %ecx
+#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+	movq	%rdi, %rax
+#  endif
+	rep	movsb
 #  ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 1(%rdi)
+	leaq	-CHAR_SIZE(%rdi), %rax
 #  endif
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(StrncpyExit2):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 2(%rdi)
-#  endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(StrncpyExit3_4):
-	movzwl	(%rsi), %ecx
-	movzwl	-2(%rsi, %r8), %edx
-	mov	%cx, (%rdi)
-	mov	%dx, -2(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit5_8):
-	mov	(%rsi), %ecx
-	mov	-4(%rsi, %r8), %edx
-	mov	%ecx, (%rdi)
-	mov	%edx, -4(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit9_16):
-	mov	(%rsi), %rcx
-	mov	-8(%rsi, %r8), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit17_32):
-	vmovdqu (%rsi), %xmm2
-	vmovdqu -16(%rsi, %r8), %xmm3
-	vmovdqu %xmm2, (%rdi)
-	vmovdqu %xmm3, -16(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit33_64):
-	/*  0/32, 31/16 */
-	vmovdqu (%rsi), %ymm2
-	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdi, %r8), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi, %r8)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(StrncpyExit65):
-	/* 0/32, 32/32, 64/1 */
-	vmovdqu (%rsi), %ymm2
-	vmovdqu 32(%rsi), %ymm3
-	mov	64(%rsi), %cl
-	vmovdqu %ymm2, (%rdi)
-	vmovdqu %ymm3, 32(%rdi)
-	mov	%cl, 64(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	65(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 65(%rdi)
-#  endif
-	VZEROUPPER_RETURN
+# else
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
 
+	/* Traditional copy case, essentially same as used in non-page-
+	   cross case but since we can't reuse VMM(0) we need twice as
+	   many loads from rsi.  */
 #  ifndef USE_AS_STRCAT
-
-	.p2align 4
-L(Fill1):
-	mov	%dl, (%rdi)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill2):
-	mov	%dx, (%rdi)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill3_4):
-	mov	%dx, (%rdi)
-	mov     %dx, -2(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill5_8):
-	mov	%edx, (%rdi)
-	mov     %edx, -4(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill9_16):
-	mov	%rdx, (%rdi)
-	mov	%rdx, -8(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(Fill17_32):
-	vmovdqu %xmmZ, (%rdi)
-	vmovdqu %xmmZ, -16(%rdi, %r8)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(CopyVecSizeUnalignedVec2):
-	vmovdqu %ymm2, (%rdi, %rcx)
-
-	.p2align 4
-L(CopyVecSizeVecExit):
-	bsf	%edx, %edx
-	add	$(VEC_SIZE - 1), %r8
-	add	%rcx, %rdi
-#   ifdef USE_AS_STPCPY
-	lea	(%rdi, %rdx), %rax
-#   endif
-	sub	%rdx, %r8
-	lea	1(%rdi, %rdx), %rdi
-
-	.p2align 4
-L(StrncpyFillTailWithZero):
-	xor	%edx, %edx
-	sub	$VEC_SIZE, %r8
-	jbe	L(StrncpyFillExit)
-
-	vmovdqu %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-
-	mov	%rdi, %rsi
-	and	$(VEC_SIZE - 1), %esi
-	sub	%rsi, %rdi
-	add	%rsi, %r8
-	sub	$(VEC_SIZE * 4), %r8
-	jb	L(StrncpyFillLessFourVecSize)
-
-L(StrncpyFillLoopVmovdqa):
-	vmovdqa %ymmZ, (%rdi)
-	vmovdqa %ymmZ, VEC_SIZE(%rdi)
-	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
-	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
-	add	$(VEC_SIZE * 4), %rdi
-	sub	$(VEC_SIZE * 4), %r8
-	jae	L(StrncpyFillLoopVmovdqa)
-
-L(StrncpyFillLessFourVecSize):
-	add	$(VEC_SIZE * 2), %r8
-	jl	L(StrncpyFillLessTwoVecSize)
-	vmovdqa %ymmZ, (%rdi)
-	vmovdqa %ymmZ, VEC_SIZE(%rdi)
-	add	$(VEC_SIZE * 2), %rdi
-	sub	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	vmovdqa %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillLessTwoVecSize):
-	add	$VEC_SIZE, %r8
-	jl	L(StrncpyFillExit)
-	vmovdqa %ymmZ, (%rdi)
-	add	$VEC_SIZE, %rdi
-	jmp	L(Fill)
-
-	.p2align 4
-L(StrncpyFillExit):
-	add	$VEC_SIZE, %r8
-L(Fill):
-	cmp	$17, %r8d
-	jae	L(Fill17_32)
-	cmp	$9, %r8d
-	jae	L(Fill9_16)
-	cmp	$5, %r8d
-	jae	L(Fill5_8)
-	cmp	$3, %r8d
-	jae	L(Fill3_4)
-	cmp	$1, %r8d
-	ja	L(Fill2)
-	je	L(Fill1)
-	VZEROUPPER_RETURN
-
-/* end of ifndef USE_AS_STRCAT */
+	xorl	%edx, %edx
 #  endif
-
-	.p2align 4
-L(UnalignedLeaveCase2OrCase3):
-	test	%rdx, %rdx
-	jnz	L(UnalignedFourVecSizeLeaveCase2)
-L(UnalignedFourVecSizeLeaveCase3):
-	lea	(VEC_SIZE * 4)(%r8), %rcx
-	and	$-VEC_SIZE, %rcx
-	add	$(VEC_SIZE * 3), %r8
-	jl	L(CopyVecSizeCase3)
-	vmovdqu %ymm4, (%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	sub	$VEC_SIZE, %r8
-	jb	L(CopyVecSizeCase3)
-	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+	bsfl	%ecx, %edx
 #  ifdef USE_AS_STPCPY
-	lea	(VEC_SIZE * 4)(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (VEC_SIZE * 4)(%rdi)
+	leaq	(%rdi, %rdx), %rax
+#  elif !defined USE_AS_STRCAT
+	movq	%rdi, %rax
 #  endif
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(UnalignedFourVecSizeLeaveCase2):
-	xor	%ecx, %ecx
-	vpcmpeqb %ymm4, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	add	$(VEC_SIZE * 3), %r8
-	jle	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec4)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
-	vpcmpeqb %ymm5, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm4, (%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec5)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
+	/* vzeroupper early to avoid duplicating at each return.  */
+	COND_VZEROUPPER
 
-	vpcmpeqb %ymm6, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm5, VEC_SIZE(%rdi)
-	add	$VEC_SIZE, %rcx
-	sub	$VEC_SIZE, %r8
-	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%edx, %edx
-#  ifndef USE_AS_STRCAT
-	jnz	L(CopyVecSizeUnalignedVec6)
-#  else
-	jnz	L(CopyVecSize)
-#  endif
+	testw	%cx, %cx
+	jz	L(page_cross_copy_16_31)
 
-	vpcmpeqb %ymm7, %ymmZ, %ymmM
-	vpmovmskb %ymmM, %edx
-	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-	lea	VEC_SIZE(%rdi, %rcx), %rdi
-	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	bsf	%edx, %edx
-	cmp	%r8d, %edx
-	jb	L(CopyVecSizeExit)
-L(StrncpyExit):
-	cmp	$65, %r8d
-	je	L(StrncpyExit65)
-	cmp	$33, %r8d
-	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8d
-	jae	L(StrncpyExit17_32)
-	cmp	$9, %r8d
-	jae	L(StrncpyExit9_16)
-	cmp	$5, %r8d
-	jae	L(StrncpyExit5_8)
-	cmp	$3, %r8d
-	jae	L(StrncpyExit3_4)
-	cmp	$1, %r8d
-	ja	L(StrncpyExit2)
-	je	L(StrncpyExit1)
-#  ifdef USE_AS_STPCPY
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, (%rdi)
-#  endif
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(ExitZero):
-#  ifndef USE_AS_STRCAT
-	mov	%rdi, %rax
-#  endif
-	VZEROUPPER_RETURN
+	testb	%cl, %cl
+	jz	L(page_cross_copy_8_15)
 
-# endif
+	testl	$0x7, %cl
+	jz	L(page_cross_copy_4_7)
 
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
-# endif
+	testl	%edx, %edx
+	jz	L(page_cross_set_null_term)
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+L(page_cross_set_null_term):
+	movb	$0, (%END_REG)
+	ret
+
+	.p2align 4,, 4
+L(page_cross_copy_4_7):
+	movl	(%rsi), %ecx
+	movl	-3(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, -3(%END_REG)
+	ret
+
+	.p2align 4,, 4
+L(page_cross_copy_8_15):
+	movq	(%rsi), %rcx
+	movq	-7(%rsi, %rdx), %rsi
+	movq	%rcx, (%rdi)
+	movq	%rsi, -7(%END_REG)
+	ret
+
+
+	.p2align 4,, 3
+L(page_cross_copy_16_31):
+	VMOVU	(%rsi), %xmm0
+	VMOVU	-15(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -15(%END_REG)
+	ret
+# endif
+
+END(STRCPY)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
index 0dcea18dbb..7272deef2c 100644
--- a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_avx2_rtm
-#include "strcat-avx2-rtm.S"
+#define STRNCAT	__strncat_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
+#include "strncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
index 52ecbca943..ffa58bd0de 100644
--- a/sysdeps/x86_64/multiarch/strncat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
@@ -1,7 +1,419 @@
-#ifndef STRNCAT
-# define STRNCAT	__strncat_avx2
-#endif
+/* strncat with AVX2
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-avx-vecs.h"
+# endif
+
+# ifndef STRNCAT
+#  define STRNCAT	__strncat_avx2
+# endif
+
+# ifdef USE_AS_WCSCPY
+#  define MOVCHAR	movl
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define MOVCHAR	movb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+	/* Filter zero length strings and very long strings.  Zero
+	   length strings just return, very long strings are handled by
+	   using the non-length variant {wcs|str}cat.  */
+	movq	%rdi, %rax
+# ifdef USE_AS_WCSCPY
+	leaq	-1(%rdx), %rcx
+	shr	$56, %rcx
+	jnz	L(zero_len)
+	salq	$2, %rdx
+# else
+	test	%rdx, %rdx
+	jl	L(zero_len)
+# endif
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
+
+# include "strcat-strlen-avx2.h.S"
+
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	tzcnt	%ecx, %r8d
+	cmpq	%r8, %rdx
+	jbe	L(less_1x_vec)
+
+	testl	%ecx, %ecx
+	jz	L(more_1x_vec)
+
+	/* Hoist this to save code size.  */
+
+	movl	%r8d, %edx
+
+L(less_1x_vec):
+	COND_VZEROUPPER
+
+	cmpl	$16, %edx
+	jae	L(copy_16_31)
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
+
+
+# ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	MOVCHAR	$0, (%rdi, %rdx)
+	ret
+# else
+	cmpl	$4, %edx
+	jae	L(copy_4_7)
+
+	movzbl	(%rsi), %ecx
+	cmpl	$1, %edx
+	jbe	L(set_null_term)
+
+	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
+	 */
+	movzwl	1(%rsi), %esi
+	movw	%si, 1(%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	movb	%cl, (%rdi)
+	MOVCHAR	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 11
+L(copy_4_7):
+	movl	-(4)(%rsi, %rdx), %ecx
+	vmovd	%xmm0, (%rdi)
+	movl	%ecx, -(4)(%rdi, %rdx)
+	MOVCHAR	$0, (%rdi, %rdx)
+	ret
+# endif
+
+
+	.p2align 4,, 10
+L(copy_16_31):
+	VMOVU	-(16)(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -(16)(%rdi, %rdx)
+	MOVCHAR	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 10
+L(copy_8_15):
+	movq	-(8)(%rsi, %rdx), %rcx
+	vmovq	%xmm0, (%rdi)
+	movq	%rcx, -(8)(%rdi, %rdx)
+	MOVCHAR	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+	.p2align 6,, 14
+L(more_1x_vec):
+	VMOVU	%VMM(0), (%rdi)
+
+	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	incq	%rsi
+	addq	%rsi, %rdi
+L(loop_last_4x_vec):
+	subq	%rsi, %rdx
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	tzcnt	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len)
+
+	cmpl	$VEC_SIZE, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	addl	$-VEC_SIZE, %edx
+	bzhil	%edx, %ecx, %r8d
+	jz	L(ret_vec_x2_len)
+L(ret_vec_x2):
+	bsfl	%ecx, %edx
+L(ret_vec_x2_len):
+	VMOVU	(%rsi, %rdx), %VMM(0)
+	MOVCHAR	$0, (VEC_SIZE)(%rdi, %rdx)
+	VMOVU	%VMM(0), (%rdi, %rdx)
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+
+	.p2align 4,, 12
+L(ret_vec_x1_len):
+	movl	%edx, %ecx
+L(ret_vec_x1):
+	VMOVU	-(VEC_SIZE)(%rsi, %rcx), %VMM(1)
+	MOVCHAR	$0, (%rdi, %rcx)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rcx)
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	subq	$-(VEC_SIZE * 4), %rsi
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	addl	$-(VEC_SIZE * 4), %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+	/* L(ret_vec_x1) expects ecx to have position of first match so
+	   test with bsf.  */
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
 
-#define USE_AS_STRNCAT
-#define STRCAT	STRNCAT
-#include "strcat-avx2.S"
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 1)(%rdi)
+
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* Check if length is greater than 4x VEC.  */
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	addl	$(VEC_SIZE * -2), %edx
+
+	tzcnt	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len)
+
+	cmpl	$VEC_SIZE, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	addl	$-VEC_SIZE, %edx
+	bzhil	%edx, %ecx, %r8d
+	jz	L(ret_vec_x4_len)
+L(ret_vec_x4):
+	bsfl	%ecx, %edx
+L(ret_vec_x4_len):
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
+	MOVCHAR	$0, (VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 4
+L(ret_vec_x3_len):
+	movl	%edx, %ecx
+L(ret_vec_x3):
+	VMOVU	(VEC_SIZE)(%rsi, %rcx), %VMM(0)
+	MOVCHAR	$0, (VEC_SIZE * 2)(%rdi, %rcx)
+	VMOVU	%VMM(0), (VEC_SIZE)(%rdi, %rcx)
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
+
+
+	/* Recheck length before aligning.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %r8d
+	addq	%rsi, %rdi
+	testl	%r8d, %r8d
+	jnz	L(loop_4x_done)
+
+	/* Use r9 for end of region before handling last 4x VEC
+	   specially.  */
+	leaq	-(VEC_SIZE * 4)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %r8d
+
+	testl	%r8d, %r8d
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	/* L(ret_vec_x1) expects ecx to have position of first match so
+	   test with bsf.  */
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	bsfl	%r8d, %r8d
+	VMOVU	(VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
+	VZEROUPPER_RETURN
+
+
+
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+
+	VPCMPEQ	(%r8), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+
+	subl	%esi, %r8d
+	andl	$(VEC_SIZE - 1), %r8d
+	cmpq	%r8, %rdx
+	jb	L(page_cross_small)
+
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* This adds CHAR_SIZE to the later bsf result, which gives the
+	   correct copy bounds. NB: this can never zero-out a non-zero
+	   RCX because to be in the page cross case rsi cannot be
+	   aligned and we already right-shift rcx by the misalignment.  */
+	shll	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsfl	%ecx, %ecx
+	rep	movsb
+	VZEROUPPER_RETURN
+
+L(page_cross_small):
+	tzcntl	%ecx, %ecx
+	jz	L(page_cross_setz)
+	cmpl	%edx, %ecx
+	cmova	%edx, %ecx
+	rep	movsb
+L(page_cross_setz):
+	MOVCHAR	$0, (%rdi)
+	VZEROUPPER_RETURN
+L(zero_len):
+# ifdef USE_AS_WCSCPY
+	test	%rdx, %rdx
+# endif
+	jnz	OVERFLOW_STRCAT
+	ret
+
+
+END(STRNCAT)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
index 79e7083299..d42ad88b3d 100644
--- a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STRNCPY	__strncpy_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
index ce634e94fa..e9afd8fbed 100644
--- a/sysdeps/x86_64/multiarch/strncpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
@@ -1,7 +1,735 @@
-#ifndef STRNCPY
-# define STRNCPY	__strncpy_avx2
-#endif
+/* strncpy with AVX2
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+
+# ifndef VEC_SIZE
+#  include "x86-avx-vecs.h"
+# endif
+
+# ifndef STRNCPY
+#  define STRNCPY	__strncpy_avx2
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+	/* Filter zero length strings and very long strings.  Zero
+	   length strings just return; very long strings are handled by
+	   running rep stos{b|l} to zero-fill the destination (which
+	   will almost certainly segfault) and, if that succeeds, then
+	   calling OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
+# ifdef USE_AS_WCSCPY
+	decq	%rdx
+	movq	%rdx, %rax
+	/* 2^56 is the end of the max supported address space.  */
+	shr	$56, %rax
+	jnz	L(zero_len)
+	salq	$2, %rdx
+# else
+	decq	%rdx
+	/* `dec` can macrofuse with `jl`. If the branch needs to become
+	   `jb`, replace `dec` with `sub`.  */
+	jl	L(zero_len)
+# endif
+
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
+	movl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* If not STPCPY, just set the return value ahead of time.  */
+# ifndef USE_AS_STPCPY
+	movq	%rdi, %rax
+# elif defined USE_AS_WCSCPY
+	/* Clear dependency as nearly all return code for wcpncpy uses
+	   `setc %al`.  */
+	xorl	%eax, %eax
+# endif
+
+	cmpq	$(VEC_SIZE - CHAR_SIZE), %rdx
+	/* `jb` because length rdx is now length - CHAR_SIZE.  */
+	jbe	L(less_1x_vec)
+
+	/* This may store past the needed length but that's fine because
+	   we still need to zero fill.  */
+	VMOVU	%VMM(0), (%rdi)
+
+	testl	%ecx, %ecx
+	jnz	L(zfill)
+
+	/* Align.  */
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	incq	%rsi
+L(last_4x_vec):
+	addq	%rsi, %rdi
+L(loop_last_4x_vec):
+	subq	%rsi, %rdx
+
+
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jae	L(more_2x_vec)
+
+	cmpl	$(VEC_SIZE), %edx
+	jb	L(ret_vec_x1_len)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
+	VPCMPEQ	VEC_SIZE(%rsi), %VZERO, %VMM(6)
+	VMOVU	%VMM(1), (%rdi)
+	vpmovmskb %VMM(6), %ecx
+	shlq	$VEC_SIZE, %rcx
+L(ret_vec_x1_len):
+	tzcntq	%rcx, %rcx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x1_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+	VMOVU	((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	.p2align 4,, 6
+L(ret_vec_x1):
+	bsfl	%ecx, %ecx
+	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+	subl	%ecx, %edx
+	/* Check if we need to reload/store.  */
+	cmpl	$VEC_SIZE, %edx
+	jb	L(ret_vec_x1_len_no_zfill_mov)
+	/* Otherwise safe to just store directly.  */
+	VMOVU	%VMM(1), (%rdi)
+	VMOVU	%VZERO, (%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 12
+L(more_2x_vec):
+	VMOVU	%VMM(1), (%rdi)
+	testl	%ecx, %ecx
+	/* Must fill at least 2x VEC.  */
+	jnz	L(zfill_vec1)
+
+	VMOVA	VEC_SIZE(%rsi), %VMM(2)
+	VMOVU	%VMM(2), VEC_SIZE(%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	/* Must fill at least 1x VEC.  */
+	jnz	L(zfill_vec2)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* Check if len is more than 4x VEC. -CHAR_SIZE because rdx is
+	   len - CHAR_SIZE.  */
+	cmpq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+	ja	L(more_4x_vec)
+
+	subl	$(VEC_SIZE * 3), %edx
+	jb	L(ret_vec_x3_len)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	vpmovmskb %VMM(6), %ecx
+	tzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x4_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+	movl	%ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 3 + 0)(%edx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+
+L(ret_vec_x3_len):
+	addl	$(VEC_SIZE * 1), %edx
+	tzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x3_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+	.p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 2 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsfl	%ecx, %ecx
+	VMOVU	%VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
+	subl	%ecx, %edx
+	jl	L(ret_vec_x3_len_no_zfill_mov)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(more_4x_vec):
+
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec3)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec4)
+
+	movq	%rdx, %rcx
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	/* Recheck length before aligning.  */
+	cmpq	$(VEC_SIZE * 8 - CHAR_SIZE), %rcx
+	jbe	L(last_4x_vec)
+
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %r8d
+	addq	%rsi, %rdi
+	testl	%r8d, %r8d
+	jnz	L(loop_4x_done)
+
+	/* Use r9 as end register.  */
+	leaq	-(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
 
-#define USE_AS_STRNCPY
-#define STRCPY	STRNCPY
-#include "strcpy-avx2.S"
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %r8d
+
+	testl	%r8d, %r8d
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	subq	%rsi, %rdx
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec1)
+
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec2)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec3)
+
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+	movl	%r8d, %ecx
+
+	/* Fall through into the zero-fill code below.  */
+
+	.p2align 4,, 4
+L(zfill_vec4):
+	addq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2), %rdx
+L(zfill_vec2):
+	shlq	$VEC_SIZE, %rcx
+L(zfill):
+	bsfq	%rcx, %rcx
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_from_page_cross):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(zfill_less_vec_vzeroupper)
+
+L(zfill_more_1x_vec):
+	VMOVU	%VZERO, CHAR_SIZE(%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jae	L(zfill_more_2x_vec)
+L(zfill_done0):
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(zfill_vec3):
+	addq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2), %rdx
+	.p2align 4,, 2
+L(zfill_vec1):
+	bsfl	%ecx, %ecx
+	addq	%rcx, %rdi
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+	/* zfill from vec1/vec3 must set at least 2x VECs.  */
+
+	VMOVU	%VZERO, CHAR_SIZE(%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jb	L(zfill_done0)
+L(zfill_more_2x_vec):
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
+	subq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+	jbe	L(zfill_done)
+
+	addq	%rdi, %rdx
+	VMOVU	%VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
+
+
+	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+	subq	$-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
+	cmpq	%rdi, %rdx
+	jbe	L(zfill_done)
+
+	andq	$-(VEC_SIZE), %rdi
+	.p2align 4,, 12
+L(zfill_loop_4x_vec):
+	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	ja	L(zfill_loop_4x_vec)
+L(zfill_done):
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(copy_1x):
+	VMOVU	%VMM(0), (%rdi)
+	testl	%ecx, %ecx
+	jz	L(ret_32_32)
+L(zfill_less_vec):
+	bsfl	%ecx, %ecx
+L(zfill_less_vec_no_bsf):
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_less_vec_vzeroupper):
+	COND_VZEROUPPER
+	/* We are taking advantage of the fact that to be here we must
+	   be writing the null terminator at (%rdi, %rcx), so we have a
+	   byte of leeway for overwriting.  */
+	cmpl	$16, %edx
+	jb	L(zfill_less_16)
+	VMOVU	%VZERO_128, (%rdi)
+	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
+	ret
+# ifdef USE_AS_STPCPY
+L(ret_32_32):
+	leaq	CHAR_SIZE(%rdi, %rdx), %rax
+	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 4
+L(copy_16_31):
+	/* Overfill to avoid branches.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	ja	L(zfill_less_vec_no_bsf)
+# ifndef USE_AS_STPCPY
+L(ret_32_32):
+# else
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 4
+L(copy_8_15):
+	/* Overfill to avoid branches.  */
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
+	vmovq	%xmm0, (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_8_15)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+	.p2align 4,, 8
+L(zfill_less_16):
+	xorl	%ecx, %ecx
+	cmpl	$8, %edx
+	jb	L(zfill_less_8)
+	movq	%rcx, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
+# ifndef USE_AS_STPCPY
+L(ret_8_15):
+# endif
+	ret
+
+
+	.p2align 4,, 8
+L(less_1x_vec):
+	/* Reuse the flag from `cmp $VEC_SIZE, %rdx`. The idea is that
+	   many buffer sizes are conventionally aligned.  */
+	je	L(copy_1x)
+
+	tzcntl	%ecx, %ecx
+	cmpl	$16, %edx
+	jae	L(copy_16_31)
+
+	COND_VZEROUPPER
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+	testl	%ecx, %ecx
+	jz	L(zfill_less_8_set_ret)
+
+	movl	(%rsi, %rdx), %esi
+	vmovd	%xmm0, (%rdi)
+	movl	%esi, (%rdi, %rdx)
+
+#  ifdef USE_AS_STPCPY
+	cmpl	%ecx, %edx
+L(ret_8_15):
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  endif
+	ret
+L(zfill_less_8_set_ret):
+	xorl	%ecx, %ecx
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+L(zfill_less_8):
+	movl	%ecx, (%rdi)
+	movl	%ecx, (%rdi, %rdx)
+	ret
+
+# else
+	cmpl	$3, %edx
+	jb	L(copy_0_3)
+	/* Overfill to avoid branches.  */
+	movl	-3(%rsi, %rdx), %esi
+	vmovd	%xmm0, (%rdi)
+	movl	%esi, -3(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_4_7)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+	xorl	%ecx, %ecx
+	.p2align 4,, 8
+L(zfill_less_8):
+	cmpl	$3, %edx
+	jb	L(zfill_less_3)
+	movl	%ecx, (%rdi)
+	movl	%ecx, -3(%rdi, %rdx)
+#  ifdef USE_AS_STPCPY
+	ret
+#  endif
+
+L(ret_4_7):
+#  ifdef USE_AS_STPCPY
+L(ret_8_15):
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+	ret
+
+	.p2align 4,, 4
+L(zfill_less_3):
+	testl	%edx, %edx
+	jz	L(zfill_1)
+	movw	%cx, (%rdi)
+L(zfill_1):
+	movb	%cl, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_0_3):
+	vmovd	%xmm0, %r8d
+	testl	%edx, %edx
+	jz	L(copy_1)
+	movw	%r8w, (%rdi)
+	cmpl	%ecx, %edx
+	ja	L(zfill_from_1)
+	movzbl	(%rsi, %rdx), %r8d
+#  ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+	movb	%r8b, (%rdi, %rdx)
+	ret
+#  endif
+
+L(copy_1):
+#  ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	cmpl	%ecx, %edx
+	adcq	%rdi, %rax
+#  endif
+#  ifdef USE_AS_WCSCPY
+	vmovd	%xmm0, (%rdi)
+#  else
+	movb	%r8b, (%rdi, %rdx)
+#  endif
+	ret
+# endif
+
+	.p2align 4,, 2
+L(zero_len):
+	movq	%rdi, %rax
+	ret
+# ifndef USE_AS_WCSCPY
+	.p2align 4,, 8
+L(zfill_from_1):
+#  ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+#  endif
+	movw	$0, -1(%rdi, %rdx)
+	ret
+# endif
+
+	.p2align 4,, 4
+	.p2align 6,, 8
+L(page_cross):
+	movq	%rsi, %rax
+	andq	$(VEC_SIZE * -1), %rax
+
+	VPCMPEQ	(%rax), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+
+	subl	%esi, %eax
+	andl	$(VEC_SIZE - 1), %eax
+	cmpq	%rax, %rdx
+	jb	L(page_cross_small)
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* If rcx is zero (no null-term found) then continue.  */
+	shl	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsf	%ecx, %ecx
+
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	leaq	-CHAR_SIZE(%rdi, %rcx), %rax
+# else
+	movq	%rdi, %rax
+# endif
+
+	rep	movsb
+# ifdef USE_AS_WCSCPY
+	movl	$0, (%rdi)
+# else
+	movb	$0, (%rdi)
+# endif
+	jmp	L(zfill_from_page_cross)
+
+L(page_cross_small):
+	tzcntl	%ecx, %ecx
+	xorl	%eax, %eax
+	cmpl	%ecx, %edx
+	jbe	L(page_cross_copy_only)
+
+	/* Do a zfill of the tail before copying.  */
+	movq	%rdi, %r9
+	movl	%ecx, %r8d
+
+	subl	%ecx, %edx
+	leaq	CHAR_SIZE(%rdi, %rcx), %rdi
+	movl	%edx, %ecx
+	rep	stosb
+	movq	%r9, %rdi
+	movl	%r8d, %edx
+L(page_cross_copy_only):
+	leal	CHAR_SIZE(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdi, %rdx
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# else
+	movq	%rdi, %rax
+# endif
+	rep	movsb
+	ret
+
+
+L(best_effort_strncpy):
+	movq	%rdx, %rcx
+	xorl	%eax, %eax
+	movq	%rdi, %r8
+	/* The length is >= 2^63. We very much expect to segfault at
+	   rep stos. If that doesn't happen then just strcpy to finish.
+	 */
+# ifdef USE_AS_WCSCPY
+	rep	stosl
+# else
+	rep	stosb
+# endif
+	movq	%r8, %rdi
+	jmp	OVERFLOW_STRCPY
+END(STRNCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
index dca1089060..275af7560a 100644
--- a/sysdeps/x86_64/multiarch/x86-avx-vecs.h
+++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
@@ -27,7 +27,8 @@
 #define VEC_SIZE			32
 #include "x86-vec-macros.h"
 
-#define USE_WITH_AVX		1
+#define USE_WITH_AVX2		1
+
 #define SECTION(p)			p##.avx
 
 /* 4-byte mov instructions with AVX2.  */
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v5 3/4] x86: Add evex optimized functions for the wchar_t strcpy family
  2022-11-09  1:38   ` [PATCH v5 " Noah Goldstein
  2022-11-09  1:38     ` [PATCH v5 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
@ 2022-11-09  1:38     ` Noah Goldstein
  2022-11-09  3:01       ` H.J. Lu
  2022-11-09  1:38     ` [PATCH v5 4/4] x86: Add avx2 " Noah Goldstein
  2022-11-09  3:00     ` [PATCH v5 1/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions H.J. Lu
  3 siblings, 1 reply; 42+ messages in thread
From: Noah Goldstein @ 2022-11-09  1:38 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Implemented:
    wcscat-evex  (+ 905 bytes)
    wcscpy-evex  (+ 674 bytes)
    wcpcpy-evex  (+ 709 bytes)
    wcsncpy-evex (+1358 bytes)
    wcpncpy-evex (+1467 bytes)
    wcsncat-evex (+1213 bytes)

Performance Changes:
    Times are from N = 10 runs of the benchmark suite and are reported
    as geometric mean of all ratios of New Implementation / Best Old
    Implementation. Best Old Implementation was determined with the
    highest ISA implementation.

    wcscat-evex     -> 0.991
    wcscpy-evex     -> 0.587
    wcpcpy-evex     -> 0.695
    wcsncpy-evex    -> 0.719
    wcpncpy-evex    -> 0.694
    wcsncat-evex    -> 0.979
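
    For reference, the per-function numbers above are plain geometric
    means of the per-case timing ratios. A minimal sketch of that
    reduction (the input lists and their layout are illustrative
    assumptions, not the exact benchtests JSON schema):

        # Illustrative only: geometric mean of new/old timing ratios;
        # values < 1.0 mean the new implementation is faster on average.
        import math

        def geomean_ratio(new_times, old_times):
            ratios = [n / o for n, o in zip(new_times, old_times)]
            return math.exp(sum(math.log(r) for r in ratios) / len(ratios))

        # Example: three cases, each already averaged over the N = 10 runs.
        print(geomean_ratio([0.9, 1.1, 0.5], [1.0, 1.0, 1.0]))  # ~0.791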

Code Size Changes:
    This change increases the size of libc.so by ~6.3kb. For
    reference, the patch optimizing the normal strcpy family functions
    decreases libc.so by ~5.7kb.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
 sysdeps/x86_64/Makefile                    |  5 ++
 sysdeps/x86_64/multiarch/Makefile          | 14 ++++-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 63 ++++++++++++++++++++--
 sysdeps/x86_64/multiarch/ifunc-wcs.h       | 48 +++++++++++++++++
 sysdeps/x86_64/multiarch/wcpcpy-evex.S     |  8 +++
 sysdeps/x86_64/multiarch/wcpcpy-generic.c  | 27 ++++++++++
 sysdeps/x86_64/multiarch/wcpcpy.c          | 37 +++++++++++++
 sysdeps/x86_64/multiarch/wcpncpy-evex.S    |  8 +++
 sysdeps/x86_64/multiarch/wcpncpy-generic.c | 27 ++++++++++
 sysdeps/x86_64/multiarch/wcpncpy.c         | 37 +++++++++++++
 sysdeps/x86_64/multiarch/wcscat-evex.S     |  9 ++++
 sysdeps/x86_64/multiarch/wcscat-generic.c  | 27 ++++++++++
 sysdeps/x86_64/multiarch/wcscat.c          | 37 +++++++++++++
 sysdeps/x86_64/multiarch/wcscpy-evex.S     |  7 +++
 sysdeps/x86_64/multiarch/wcscpy-generic.c  |  3 +-
 sysdeps/x86_64/multiarch/wcscpy.c          | 11 ++++
 sysdeps/x86_64/multiarch/wcsncat-evex.S    |  9 ++++
 sysdeps/x86_64/multiarch/wcsncat-generic.c | 27 ++++++++++
 sysdeps/x86_64/multiarch/wcsncat.c         | 34 ++++++++++++
 sysdeps/x86_64/multiarch/wcsncpy-evex.S    |  7 +++
 sysdeps/x86_64/multiarch/wcsncpy-generic.c | 27 ++++++++++
 sysdeps/x86_64/multiarch/wcsncpy.c         | 37 +++++++++++++
 sysdeps/x86_64/wcpcpy-generic.c            | 31 +++++++++++
 sysdeps/x86_64/wcpcpy.S                    | 40 ++++++++++++++
 sysdeps/x86_64/wcpncpy-generic.c           | 31 +++++++++++
 sysdeps/x86_64/wcpncpy.S                   | 40 ++++++++++++++
 sysdeps/x86_64/wcscat-generic.c            | 31 +++++++++++
 sysdeps/x86_64/wcscat.S                    | 40 ++++++++++++++
 sysdeps/x86_64/wcscpy.S                    |  3 +-
 sysdeps/x86_64/wcsncat-generic.c           | 31 +++++++++++
 sysdeps/x86_64/wcsncat.S                   | 38 +++++++++++++
 sysdeps/x86_64/wcsncpy-generic.c           | 31 +++++++++++
 sysdeps/x86_64/wcsncpy.S                   | 40 ++++++++++++++
 33 files changed, 858 insertions(+), 7 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcs.h
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscat.c
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy.c
 create mode 100644 sysdeps/x86_64/wcpcpy-generic.c
 create mode 100644 sysdeps/x86_64/wcpcpy.S
 create mode 100644 sysdeps/x86_64/wcpncpy-generic.c
 create mode 100644 sysdeps/x86_64/wcpncpy.S
 create mode 100644 sysdeps/x86_64/wcscat-generic.c
 create mode 100644 sysdeps/x86_64/wcscat.S
 create mode 100644 sysdeps/x86_64/wcsncat-generic.c
 create mode 100644 sysdeps/x86_64/wcsncat.S
 create mode 100644 sysdeps/x86_64/wcsncpy-generic.c
 create mode 100644 sysdeps/x86_64/wcsncpy.S

diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 3627c5659f..688eb2d7c4 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -188,8 +188,13 @@ endif
 ifeq ($(subdir),wcsmbs)
 
 sysdep_routines += \
+  wcpcpy-generic \
+  wcpncpy-generic \
+  wcscat-generic \
   wcscpy-generic \
+  wcsncat-generic \
   wcsncmp-generic \
+  wcsncpy-generic \
   wcsnlen-generic \
 # sysdep_routines
 
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 066bfa48d9..d6e01940c3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -131,6 +131,12 @@ endif
 
 ifeq ($(subdir),wcsmbs)
 sysdep_routines += \
+  wcpcpy-evex \
+  wcpcpy-generic \
+  wcpncpy-evex \
+  wcpncpy-generic \
+  wcscat-evex \
+  wcscat-generic \
   wcschr-avx2 \
   wcschr-avx2-rtm \
   wcschr-evex \
@@ -140,6 +146,8 @@ sysdep_routines += \
   wcscmp-avx2-rtm \
   wcscmp-evex \
   wcscmp-sse2 \
+  wcscpy-evex \
+  wcscpy-generic \
   wcscpy-ssse3 \
   wcslen-avx2 \
   wcslen-avx2-rtm \
@@ -147,9 +155,13 @@ sysdep_routines += \
   wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
+  wcsncat-evex \
+  wcsncat-generic \
   wcsncmp-avx2 \
   wcsncmp-avx2-rtm \
   wcsncmp-evex \
+  wcsncpy-evex \
+  wcsncpy-generic \
   wcsnlen-avx2 \
   wcsnlen-avx2-rtm \
   wcsnlen-evex \
@@ -163,8 +175,8 @@ sysdep_routines += \
   wmemchr-avx2 \
   wmemchr-avx2-rtm \
   wmemchr-evex \
-  wmemchr-evex512 \
   wmemchr-evex-rtm \
+  wmemchr-evex512 \
   wmemchr-sse2 \
   wmemcmp-avx2-movbe \
   wmemcmp-avx2-movbe-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7cebee7ec7..c908d6c158 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -901,16 +901,73 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/wcscpy.c.  */
   IFUNC_IMPL (i, name, wcscpy,
-	      /* ISA V4 wrapper for SSSE3 implementation because
-	         the SSSE3 implementation is also used at ISA
-	         level 3/4.  */
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __wcscpy_ssse3)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
 				     1,
 				     __wcscpy_generic))
 
+  /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
+  IFUNC_IMPL (i, name, wcsncpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
+				     1,
+				     __wcsncpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcpcpy.c.  */
+  IFUNC_IMPL (i, name, wcpcpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpcpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
+				     1,
+				     __wcpcpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcpncpy.c.  */
+  IFUNC_IMPL (i, name, wcpncpy,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpncpy_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
+				     1,
+				     __wcpncpy_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcscat.c.  */
+  IFUNC_IMPL (i, name, wcscat,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscat_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
+				     1,
+				     __wcscat_generic))
+
+  /* Support sysdeps/x86_64/multiarch/wcsncat.c.  */
+  IFUNC_IMPL (i, name, wcsncat,
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncat_evex)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
+				     1,
+				     __wcsncat_generic))
+
   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
   IFUNC_IMPL (i, name, wcslen,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcslen,
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
new file mode 100644
index 0000000000..1d2a63458b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
@@ -0,0 +1,48 @@
+/* Common definition for ifunc selections optimized wide-character
+   string copy functions.
+
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+#ifndef GENERIC
+# define GENERIC generic
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				      AVX_Fast_Unaligned_Load, ))
+    {
+      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+    }
+
+  return OPTIMIZE (GENERIC);
+}
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-evex.S b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
new file mode 100644
index 0000000000..ac6429cc07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
@@ -0,0 +1,8 @@
+#ifndef WCPCPY
+# define WCPCPY	__wcpcpy_evex
+#endif
+
+#define USE_AS_STPCPY
+#define USE_AS_WCSCPY
+#define STRCPY	WCPCPY
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
new file mode 100644
index 0000000000..6039196a3e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
@@ -0,0 +1,27 @@
+/* wcpcpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as strspn-sse4 needs to
+   be able to fallback to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCPCPY __wcpcpy_generic
+# include <wcsmbs/wcpcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpcpy.c b/sysdeps/x86_64/multiarch/wcpcpy.c
new file mode 100644
index 0000000000..8f96ddbc99
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcpcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcpcpy __redirect_wcpcpy
+# include <wchar.h>
+# undef __wcpcpy
+
+# define SYMBOL_NAME wcpcpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcpcpy, __wcpcpy, IFUNC_SELECTOR ());
+weak_alias (__wcpcpy, wcpcpy)
+# ifdef SHARED
+__hidden_ver1 (__wcpcpy, __GI___wcpcpy, __redirect_wcpcpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpcpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-evex.S b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
new file mode 100644
index 0000000000..62ddb694fe
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
@@ -0,0 +1,8 @@
+#ifndef WCPNCPY
+# define WCPNCPY	__wcpncpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STPCPY
+#define STRNCPY	WCPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
new file mode 100644
index 0000000000..de8d34320e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
@@ -0,0 +1,27 @@
+/* wcpncpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as strspn-sse4 needs to
+   be able to fallback to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCPNCPY __wcpncpy_generic
+# include <wcsmbs/wcpncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcpncpy.c b/sysdeps/x86_64/multiarch/wcpncpy.c
new file mode 100644
index 0000000000..ed8f307e07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcpncpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcpncpy __redirect_wcpncpy
+# include <wchar.h>
+# undef __wcpncpy
+
+# define SYMBOL_NAME wcpncpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcpncpy, __wcpncpy, IFUNC_SELECTOR ());
+weak_alias (__wcpncpy, wcpncpy)
+# ifdef SHARED
+__hidden_ver1 (__wcpncpy, __GI___wcpncpy, __redirect_wcpncpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpncpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscat-evex.S b/sysdeps/x86_64/multiarch/wcscat-evex.S
new file mode 100644
index 0000000000..1d017e4899
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-evex.S
@@ -0,0 +1,9 @@
+#ifndef WCSCAT
+# define WCSCAT	__wcscat_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRCPY	WCSCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
new file mode 100644
index 0000000000..d86b4d5c00
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
@@ -0,0 +1,27 @@
+/* wcscat.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as strspn-sse4 needs to
+   be able to fallback to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCSCAT __wcscat_generic
+# include <wcsmbs/wcscat.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscat.c b/sysdeps/x86_64/multiarch/wcscat.c
new file mode 100644
index 0000000000..3277c44561
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcscat.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcscat __redirect_wcscat
+# include <wchar.h>
+# undef __wcscat
+
+# define SYMBOL_NAME wcscat
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcscat, __wcscat, IFUNC_SELECTOR ());
+weak_alias (__wcscat, wcscat)
+# ifdef SHARED
+__hidden_ver1 (__wcscat, __GI___wcscat, __redirect_wcscat)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcscat);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcscpy-evex.S b/sysdeps/x86_64/multiarch/wcscpy-evex.S
new file mode 100644
index 0000000000..1069a8e224
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-evex.S
@@ -0,0 +1,7 @@
+#ifndef WCSCPY
+# define WCSCPY	__wcscpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define STRCPY	WCSCPY
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
index 93d314aaad..4a1fffae4b 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
@@ -18,8 +18,7 @@
 
 
 #include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (1)
+#if ISA_SHOULD_BUILD (3)
 
 # define WCSCPY  __wcscpy_generic
 # include <wcsmbs/wcscpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
index 92c917b6b4..9ad77da8ac 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.c
+++ b/sysdeps/x86_64/multiarch/wcscpy.c
@@ -26,6 +26,8 @@
 # define SYMBOL_NAME wcscpy
 # include <init-arch.h>
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
@@ -35,6 +37,15 @@ IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load, ))
+    {
+      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+    }
+
   if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
     return OPTIMIZE (ssse3);
 
diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S
new file mode 100644
index 0000000000..392215950a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S
@@ -0,0 +1,9 @@
+#ifndef WCSCAT
+# define WCSCAT	__wcsncat_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRNCAT	WCSCAT
+#include "strncat-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
new file mode 100644
index 0000000000..4b55cb40bc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
@@ -0,0 +1,27 @@
+/* wcsncat.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as strspn-sse4 needs to
+   be able to fallback to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCSNCAT __wcsncat_generic
+# include <wcsmbs/wcsncat.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncat.c b/sysdeps/x86_64/multiarch/wcsncat.c
new file mode 100644
index 0000000000..49c46aef08
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat.c
@@ -0,0 +1,34 @@
+/* Multiple versions of wcsncat.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define wcsncat __redirect_wcsncat
+# include <wchar.h>
+# undef wcsncat
+
+# define SYMBOL_NAME wcsncat
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcsncat, wcsncat, IFUNC_SELECTOR ());
+# ifdef SHARED
+__hidden_ver1 (wcsncat, __GI_wcsncat, __redirect_wcsncat)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncat);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-evex.S b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
new file mode 100644
index 0000000000..2debb8fd6b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
@@ -0,0 +1,7 @@
+#ifndef WCSNCPY
+# define WCSNCPY	__wcsncpy_evex
+#endif
+
+#define USE_AS_WCSCPY
+#define STRNCPY	WCSNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
new file mode 100644
index 0000000000..d0e8a86605
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
@@ -0,0 +1,27 @@
+/* wcsncpy.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We always need to build this implementation as strspn-sse4 needs to
+   be able to fallback to it.  */
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (3)
+
+# define WCSNCPY __wcsncpy_generic
+# include <wcsmbs/wcsncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncpy.c b/sysdeps/x86_64/multiarch/wcsncpy.c
new file mode 100644
index 0000000000..5b89dd4d27
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcsncpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcsncpy __redirect_wcsncpy
+# include <wchar.h>
+# undef __wcsncpy
+
+# define SYMBOL_NAME wcsncpy
+# include <init-arch.h>
+
+# include "ifunc-wcs.h"
+
+libc_ifunc_redirected (__redirect_wcsncpy, __wcsncpy, IFUNC_SELECTOR ());
+weak_alias (__wcsncpy, wcsncpy)
+# ifdef SHARED
+__hidden_ver1 (__wcsncpy, __GI___wcsncpy, __redirect_wcsncpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
new file mode 100644
index 0000000000..3ddc98872f
--- /dev/null
+++ b/sysdeps/x86_64/wcpcpy-generic.c
@@ -0,0 +1,31 @@
+/* ISA level static dispatch for wcpcpy .c files.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpcpy non-multiarch build is split into two files,
+   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcpcpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcpcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
new file mode 100644
index 0000000000..4e4fca71eb
--- /dev/null
+++ b/sysdeps/x86_64/wcpcpy.S
@@ -0,0 +1,40 @@
+/* ISA level static dispatch for wcpcpy .S files.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpcpy non-multiarch build is split into two files,
+   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcpcpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCPCPY	__wcpcpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcpcpy-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcpcpy, wcpcpy)
+libc_hidden_def (__wcpcpy)
+#endif
diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
new file mode 100644
index 0000000000..0c76e5614c
--- /dev/null
+++ b/sysdeps/x86_64/wcpncpy-generic.c
@@ -0,0 +1,31 @@
+/* ISA level static dispatch for wcpncpy .c files.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpncpy non-multiarch build is split into two files,
+   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcpncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcpncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
new file mode 100644
index 0000000000..b4e531473e
--- /dev/null
+++ b/sysdeps/x86_64/wcpncpy.S
@@ -0,0 +1,40 @@
+/* ISA level static dispatch for wcpncpy .S files.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcpncpy non-multiarch build is split into two files,
+   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcpncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCPNCPY	__wcpncpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcpncpy-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcpncpy, wcpncpy)
+libc_hidden_def (__wcpncpy)
+#endif
diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
new file mode 100644
index 0000000000..512d0e4d43
--- /dev/null
+++ b/sysdeps/x86_64/wcscat-generic.c
@@ -0,0 +1,31 @@
+/* ISA level static dispatch for wcscat .c files.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcscat non-multiarch build is split into two files,
+   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcscat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcscat.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
new file mode 100644
index 0000000000..ee8360b6e8
--- /dev/null
+++ b/sysdeps/x86_64/wcscat.S
@@ -0,0 +1,40 @@
+/* ISA level static dispatch for wcscat .S files.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcscat non-multiarch build is split into two files,
+   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcscat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSCAT	__wcscat
+
+# define DEFAULT_IMPL_V4	"multiarch/wcscat-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcscat, wcscat)
+libc_hidden_def (__wcscat)
+#endif
diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
index 11d0bb4bab..e403579961 100644
--- a/sysdeps/x86_64/wcscpy.S
+++ b/sysdeps/x86_64/wcscpy.S
@@ -1,4 +1,4 @@
-/* wcscpy dispatch for RTLD and non-multiarch .c files
+/* ISA level static dispatch for wcscpy .S files.
    Copyright (C) 2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -28,6 +28,7 @@
 
 # define WCSCPY	__wcscpy
 
+# define DEFAULT_IMPL_V4	"multiarch/wcscpy-evex.S"
 # define DEFAULT_IMPL_V2	"multiarch/wcscpy-ssse3.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
new file mode 100644
index 0000000000..86e20d9028
--- /dev/null
+++ b/sysdeps/x86_64/wcsncat-generic.c
@@ -0,0 +1,31 @@
+/* ISA level static dispatch for wcsncat .c files.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncat non-multiarch build is split into two files,
+   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcsncat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcsncat.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
new file mode 100644
index 0000000000..090055a1b8
--- /dev/null
+++ b/sysdeps/x86_64/wcsncat.S
@@ -0,0 +1,38 @@
+/* ISA level static dispatch for wcsncat .S files.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncat non-multiarch build is split into two files,
+   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcsncat-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSNCAT	wcsncat
+
+# define DEFAULT_IMPL_V4	"multiarch/wcsncat-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+#endif
diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
new file mode 100644
index 0000000000..0f0ee65b65
--- /dev/null
+++ b/sysdeps/x86_64/wcsncpy-generic.c
@@ -0,0 +1,31 @@
+/* ISA level static dispatch for wcsncpy .c files.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncpy non-multiarch build is split into two files,
+   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcsncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL <= 3
+
+# include <wcsmbs/wcsncpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
new file mode 100644
index 0000000000..32eaf1163b
--- /dev/null
+++ b/sysdeps/x86_64/wcsncpy.S
@@ -0,0 +1,40 @@
+/* ISA level static dispatch for wcsncpy .S files.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* wcsncpy non-multiarch build is split into two files,
+   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
+   for ISA level <= 1 and just uses multiarch/wcsncpy-generic.c.
+   This must be split into two files because we cannot include C
+   code from assembly or vice versa.  */
+
+#include <isa-level.h>
+
+#if MINIMUM_X86_ISA_LEVEL >= 4
+
+# define WCSNCPY	__wcsncpy
+
+# define DEFAULT_IMPL_V4	"multiarch/wcsncpy-evex.S"
+/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
+   should never be used from here.  */
+# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
+
+# include "isa-default-impl.h"
+
+weak_alias (__wcsncpy, wcsncpy)
+libc_hidden_def (__wcsncpy)
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v5 4/4] x86: Add avx2 optimized functions for the wchar_t strcpy family
  2022-11-09  1:38   ` [PATCH v5 " Noah Goldstein
  2022-11-09  1:38     ` [PATCH v5 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
  2022-11-09  1:38     ` [PATCH v5 3/4] x86: Add evex optimized functions for the wchar_t strcpy family Noah Goldstein
@ 2022-11-09  1:38     ` Noah Goldstein
  2022-11-09  3:01       ` H.J. Lu
  2022-11-09  3:00     ` [PATCH v5 1/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions H.J. Lu
  3 siblings, 1 reply; 42+ messages in thread
From: Noah Goldstein @ 2022-11-09  1:38 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Implemented:
    wcscat-avx2  (+ 744 bytes)
    wcscpy-avx2  (+ 539 bytes)
    wcpcpy-avx2  (+ 577 bytes)
    wcsncpy-avx2 (+1108 bytes)
    wcpncpy-avx2 (+1214 bytes)
    wcsncat-avx2 (+1085 bytes)

Performance Changes:
    Times are from N = 10 runs of the benchmark suite and are reported
    as the geometric mean of all ratios of New Implementation / Best
    Old Implementation, where the Best Old Implementation is the
    existing implementation at the highest usable ISA level (a short
    sketch of this computation follows the list below).

    wcscat-avx2     -> 0.975
    wcscpy-avx2     -> 0.591
    wcpcpy-avx2     -> 0.698
    wcsncpy-avx2    -> 0.730
    wcpncpy-avx2    -> 0.711
    wcsncat-avx2    -> 0.954
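
    A minimal sketch of how such a summary number can be computed is
    below (plain C; the helper and its inputs are purely illustrative
    and not part of the glibc benchmark suite):

        /* ratios[i] = time (New Implementation) / time (Best Old
           Implementation) for benchmark configuration i.  */
        #include <math.h>
        #include <stddef.h>

        static double
        geometric_mean (const double *ratios, size_t n)
        {
          double log_sum = 0.0;
          for (size_t i = 0; i < n; i++)
            log_sum += log (ratios[i]);
          /* The exponential of the mean log-ratio is the geometric
             mean; values below 1.0 mean the new implementation is
             faster on average.  */
          return exp (log_sum / n);
        }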

Code Size Changes:
    This change increases the size of libc.so by ~5.5kb. For
    reference, the patch optimizing the normal strcpy family functions
    decreases libc.so by ~5.2kb.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
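
The runtime selection order that ifunc-wcs.h ends up with after this
patch can be summarized by the rough sketch below (simplified and
illustrative only; the real code uses the X86_ISA_CPU_FEATURE_USABLE_P
and X86_ISA_CPU_FEATURES_ARCH_P macros shown in the hunks that follow,
and wcscpy additionally keeps its SSSE3 variant ahead of generic):

    enum wcs_impl { WCS_GENERIC, WCS_AVX2, WCS_EVEX };

    /* Stand-in flags model the CPU feature checks from ifunc-wcs.h.  */
    static enum wcs_impl
    select_wcs_impl (int avx2, int bmi2, int avx_fast_unaligned_load,
                     int avx512vl, int avx512bw, int prefer_no_vzeroupper)
    {
      if (avx2 && bmi2 && avx_fast_unaligned_load)
        {
          if (avx512vl && avx512bw)
            return WCS_EVEX;
          if (!prefer_no_vzeroupper)
            return WCS_AVX2;
        }
      return WCS_GENERIC;
    }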
---
 sysdeps/x86_64/multiarch/Makefile          |  6 +++++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 ++++++++++++++++++++--
 sysdeps/x86_64/multiarch/ifunc-wcs.h       |  7 ++++++
 sysdeps/x86_64/multiarch/wcpcpy-avx2.S     |  8 +++++++
 sysdeps/x86_64/multiarch/wcpcpy-generic.c  |  2 +-
 sysdeps/x86_64/multiarch/wcpncpy-avx2.S    |  8 +++++++
 sysdeps/x86_64/multiarch/wcpncpy-generic.c |  2 +-
 sysdeps/x86_64/multiarch/wcscat-avx2.S     | 10 ++++++++
 sysdeps/x86_64/multiarch/wcscat-generic.c  |  2 +-
 sysdeps/x86_64/multiarch/wcscpy-avx2.S     |  7 ++++++
 sysdeps/x86_64/multiarch/wcscpy-generic.c  |  2 +-
 sysdeps/x86_64/multiarch/wcscpy.c          |  5 ++++
 sysdeps/x86_64/multiarch/wcsncat-avx2.S    |  9 +++++++
 sysdeps/x86_64/multiarch/wcsncat-generic.c |  2 +-
 sysdeps/x86_64/multiarch/wcsncpy-avx2.S    |  7 ++++++
 sysdeps/x86_64/multiarch/wcsncpy-generic.c |  2 +-
 sysdeps/x86_64/wcpcpy-generic.c            |  2 +-
 sysdeps/x86_64/wcpcpy.S                    |  3 ++-
 sysdeps/x86_64/wcpncpy-generic.c           |  2 +-
 sysdeps/x86_64/wcpncpy.S                   |  3 ++-
 sysdeps/x86_64/wcscat-generic.c            |  2 +-
 sysdeps/x86_64/wcscat.S                    |  3 ++-
 sysdeps/x86_64/wcscpy.S                    |  1 +
 sysdeps/x86_64/wcsncat-generic.c           |  2 +-
 sysdeps/x86_64/wcsncat.S                   |  3 ++-
 sysdeps/x86_64/wcsncpy-generic.c           |  2 +-
 sysdeps/x86_64/wcsncpy.S                   |  3 ++-
 27 files changed, 115 insertions(+), 18 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d6e01940c3..e1e894c963 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -131,10 +131,13 @@ endif
 
 ifeq ($(subdir),wcsmbs)
 sysdep_routines += \
+  wcpcpy-avx2 \
   wcpcpy-evex \
   wcpcpy-generic \
+  wcpncpy-avx2 \
   wcpncpy-evex \
   wcpncpy-generic \
+  wcscat-avx2 \
   wcscat-evex \
   wcscat-generic \
   wcschr-avx2 \
@@ -146,6 +149,7 @@ sysdep_routines += \
   wcscmp-avx2-rtm \
   wcscmp-evex \
   wcscmp-sse2 \
+  wcscpy-avx2 \
   wcscpy-evex \
   wcscpy-generic \
   wcscpy-ssse3 \
@@ -155,11 +159,13 @@ sysdep_routines += \
   wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
+  wcsncat-avx2 \
   wcsncat-evex \
   wcsncat-generic \
   wcsncmp-avx2 \
   wcsncmp-avx2-rtm \
   wcsncmp-evex \
+  wcsncpy-avx2 \
   wcsncpy-evex \
   wcsncpy-generic \
   wcsnlen-avx2 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c908d6c158..0c15dfebfd 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -907,6 +907,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcscpy_evex)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcscpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __wcscpy_ssse3)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
@@ -920,7 +924,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcsncpy_evex)
-	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy,
 				     1,
 				     __wcsncpy_generic))
 
@@ -932,6 +940,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcpcpy_evex)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpcpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcpcpy,
 				     1,
 				     __wcpcpy_generic))
 
@@ -942,7 +954,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcpncpy_evex)
-	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpncpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy,
 				     1,
 				     __wcpncpy_generic))
 
@@ -954,6 +970,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcscat_evex)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscat_avx2)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcscat,
 				     1,
 				     __wcscat_generic))
 
@@ -965,6 +985,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcsncat_evex)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncat_avx2)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncat,
 				     1,
 				     __wcsncat_generic))
 
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
index 1d2a63458b..51194e620e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wcs.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
@@ -27,6 +27,8 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+
 extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
 
 static inline void *
@@ -42,6 +44,11 @@ IFUNC_SELECTOR (void)
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 	return OPTIMIZE (evex);
+
+      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				       Prefer_No_VZEROUPPER, !))
+	return OPTIMIZE (avx2);
+
     }
 
   return OPTIMIZE (GENERIC);
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
new file mode 100644
index 0000000000..0fffd912d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
@@ -0,0 +1,8 @@
+#ifndef WCPCPY
+# define WCPCPY	__wcpcpy_avx2
+#endif
+
+#define USE_AS_STPCPY
+#define USE_AS_WCSCPY
+#define STRCPY	WCPCPY
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
index 6039196a3e..0ba29b081f 100644
--- a/sysdeps/x86_64/multiarch/wcpcpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCPCPY __wcpcpy_generic
 # include <wcsmbs/wcpcpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
new file mode 100644
index 0000000000..b7e594f7b7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
@@ -0,0 +1,8 @@
+#ifndef WCPNCPY
+# define WCPNCPY	__wcpncpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STPCPY
+#define STRNCPY	WCPNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
index de8d34320e..4aab4ecdd2 100644
--- a/sysdeps/x86_64/multiarch/wcpncpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCPNCPY __wcpncpy_generic
 # include <wcsmbs/wcpncpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2.S b/sysdeps/x86_64/multiarch/wcscat-avx2.S
new file mode 100644
index 0000000000..a20f23c09d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-avx2.S
@@ -0,0 +1,10 @@
+#ifndef WCSCAT
+# define WCSCAT	__wcscat_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRCPY	WCSCAT
+
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
index d86b4d5c00..6476f85bbb 100644
--- a/sysdeps/x86_64/multiarch/wcscat-generic.c
+++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSCAT __wcscat_generic
 # include <wcsmbs/wcscat.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2.S b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
new file mode 100644
index 0000000000..6bc509da07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
@@ -0,0 +1,7 @@
+#ifndef WCSCPY
+# define WCSCPY	__wcscpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define STRCPY	WCSCPY
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
index 4a1fffae4b..600d606c45 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
@@ -18,7 +18,7 @@
 
 
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSCPY  __wcscpy_generic
 # include <wcsmbs/wcscpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
index 9ad77da8ac..e204059873 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.c
+++ b/sysdeps/x86_64/multiarch/wcscpy.c
@@ -28,6 +28,8 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
@@ -44,6 +46,9 @@ IFUNC_SELECTOR (void)
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 	return OPTIMIZE (evex);
+
+      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !))
+	return OPTIMIZE (avx2);
     }
 
   if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2.S b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
new file mode 100644
index 0000000000..a72105b7e9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
@@ -0,0 +1,9 @@
+#ifndef WCSNCAT
+# define WCSNCAT	__wcsncat_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRNCAT	WCSNCAT
+#include "strncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
index 4b55cb40bc..9ced02b35e 100644
--- a/sysdeps/x86_64/multiarch/wcsncat-generic.c
+++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSNCAT __wcsncat_generic
 # include <wcsmbs/wcsncat.c>
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
new file mode 100644
index 0000000000..3a1a8a372c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
@@ -0,0 +1,7 @@
+#ifndef WCSNCPY
+# define WCSNCPY	__wcsncpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define STRNCPY	WCSNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
index d0e8a86605..693521713b 100644
--- a/sysdeps/x86_64/multiarch/wcsncpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
@@ -19,7 +19,7 @@
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSNCPY __wcsncpy_generic
 # include <wcsmbs/wcsncpy.c>
diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
index 3ddc98872f..4ab6182cd9 100644
--- a/sysdeps/x86_64/wcpcpy-generic.c
+++ b/sysdeps/x86_64/wcpcpy-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcpcpy.c>
 
diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
index 4e4fca71eb..e64af6977f 100644
--- a/sysdeps/x86_64/wcpcpy.S
+++ b/sysdeps/x86_64/wcpcpy.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCPCPY	__wcpcpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcpcpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcpcpy-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
index 0c76e5614c..18c0377d35 100644
--- a/sysdeps/x86_64/wcpncpy-generic.c
+++ b/sysdeps/x86_64/wcpncpy-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcpncpy.c>
 
diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
index b4e531473e..0e0f432fbb 100644
--- a/sysdeps/x86_64/wcpncpy.S
+++ b/sysdeps/x86_64/wcpncpy.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCPNCPY	__wcpncpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcpncpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcpncpy-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
index 512d0e4d43..639ceac523 100644
--- a/sysdeps/x86_64/wcscat-generic.c
+++ b/sysdeps/x86_64/wcscat-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcscat.c>
 
diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
index ee8360b6e8..06130f58f9 100644
--- a/sysdeps/x86_64/wcscat.S
+++ b/sysdeps/x86_64/wcscat.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCSCAT	__wcscat
 
 # define DEFAULT_IMPL_V4	"multiarch/wcscat-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcscat-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
index e403579961..4a859585a6 100644
--- a/sysdeps/x86_64/wcscpy.S
+++ b/sysdeps/x86_64/wcscpy.S
@@ -29,6 +29,7 @@
 # define WCSCPY	__wcscpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcscpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcscpy-avx2.S"
 # define DEFAULT_IMPL_V2	"multiarch/wcscpy-ssse3.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
index 86e20d9028..57bdd9b7cf 100644
--- a/sysdeps/x86_64/wcsncat-generic.c
+++ b/sysdeps/x86_64/wcsncat-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcsncat.c>
 
diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
index 090055a1b8..e1d8609651 100644
--- a/sysdeps/x86_64/wcsncat.S
+++ b/sysdeps/x86_64/wcsncat.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCSNCAT	wcsncat
 
 # define DEFAULT_IMPL_V4	"multiarch/wcsncat-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcsncat-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
index 0f0ee65b65..4dcbd8ac7f 100644
--- a/sysdeps/x86_64/wcsncpy-generic.c
+++ b/sysdeps/x86_64/wcsncpy-generic.c
@@ -24,7 +24,7 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcsncpy.c>
 
diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
index 32eaf1163b..f305b5eb9b 100644
--- a/sysdeps/x86_64/wcsncpy.S
+++ b/sysdeps/x86_64/wcsncpy.S
@@ -24,11 +24,12 @@
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCSNCPY	__wcsncpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcsncpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcsncpy-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v5 1/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
  2022-11-09  1:38   ` [PATCH v5 " Noah Goldstein
                       ` (2 preceding siblings ...)
  2022-11-09  1:38     ` [PATCH v5 4/4] x86: Add avx2 " Noah Goldstein
@ 2022-11-09  3:00     ` H.J. Lu
  3 siblings, 0 replies; 42+ messages in thread
From: H.J. Lu @ 2022-11-09  3:00 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Tue, Nov 08, 2022 at 05:38:38PM -0800, Noah Goldstein wrote:
> Optimizations are:
>     1. Use more overlapping stores to avoid branches.
>     2. Reduce how unrolled the aligning copies are (this is more of a
>        code-size save, its a negative for some sizes in terms of
>        perf).
>     3. Improve the loop a bit (similiar to what we do in strlen with
>        2x vpminu + kortest instead of 3x vpminu + kmov + test).
>     4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
>        number that are taken.
> 
> Performance Changes:
> 
>     Times are from N = 10 runs of the benchmark suite and are
>     reported as geometric mean of all ratios of
>     New Implementation / Old Implementation.
> 
>     stpcpy-evex      -> 0.922
>     strcat-evex      -> 0.985
>     strcpy-evex      -> 0.880
> 
>     strncpy-evex     -> 0.831
>     stpncpy-evex     -> 0.780
> 
>     strncat-evex     -> 0.958
> 
> Code Size Changes:
>     function         -> Bytes New / Bytes Old -> Ratio
> 
>     strcat-evex      ->  819 / 1874 -> 0.437
>     strcpy-evex      ->  700 / 1074 -> 0.652
>     stpcpy-evex      ->  735 / 1094 -> 0.672
> 
>     strncpy-evex     -> 1397 / 2611 -> 0.535
>     stpncpy-evex     -> 1489 / 2691 -> 0.553
> 
>     strncat-evex     -> 1184 / 2832 -> 0.418
> 
> Notes:
>     1. Because of the significant difference between the
>        implementations they are split into three files.
> 
>            strcpy-evex.S    -> strcpy, stpcpy, strcat
>            strncpy-evex.S   -> strncpy
>            strncat-evex.S   -> strncat
> 
>        I couldn't find a way to merge them without making the
>        ifdefs incredibly difficult to follow.
> 
>     2. All implementations can be made evex512 by including
>        "x86-evex512-vecs.h" at the top.
> 
>     3. All implementations have an optional define:
>         `USE_EVEX_MASKED_STORE`
>        Setting to one uses evex-masked stores for handling short
>        strings.  This saves code size and branches.  It's disabled
>        for all implementations at the moment as there are some
>        serious drawbacks to masked stores in certain cases, but
>        that may be fixed on future architectures.
> 
> Full check passes on x86-64 and build succeeds for all ISA levels w/
> and w/o multiarch.
> ---
>  sysdeps/x86_64/multiarch/stpncpy-evex.S       |    5 +-
>  sysdeps/x86_64/multiarch/strcat-evex.S        |  291 +---
>  .../x86_64/multiarch/strcat-strlen-evex.h.S   |  110 ++
>  sysdeps/x86_64/multiarch/strcpy-evex.S        | 1282 ++++++-----------
>  sysdeps/x86_64/multiarch/strncat-evex.S       |  525 ++++++-
>  sysdeps/x86_64/multiarch/strncpy-evex.S       |  995 ++++++++++++-
>  .../multiarch/strncpy-or-cat-overflow-def.h   |   80 +
>  7 files changed, 2115 insertions(+), 1173 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
>  create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> 
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> index 99ea76a372..3693491baa 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
> @@ -3,6 +3,5 @@
>  #endif
>  
>  #define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY	STPNCPY
> -#include "strcpy-evex.S"
> +#define STRNCPY	STPNCPY
> +#include "strncpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
> index 0e2df947e9..b4207b7889 100644
> --- a/sysdeps/x86_64/multiarch/strcat-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcat-evex.S
> @@ -1,286 +1,7 @@
> -/* strcat with 256-bit EVEX instructions.
> -   Copyright (C) 2021-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (4)
> -
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_evex
> -# endif
> -
> -# define VMOVU		vmovdqu64
> -# define VMOVA		vmovdqa64
> -
> -/* zero register */
> -# define XMMZERO	xmm16
> -# define YMMZERO	ymm16
> -# define YMM0		ymm17
> -# define YMM1		ymm18
> -
> -# define USE_AS_STRCAT
> -
> -/* Number of bytes in a vector register */
> -# define VEC_SIZE	32
> -
> -	.section .text.evex,"ax",@progbits
> -ENTRY (STRCAT)
> -	mov	%rdi, %r9
> -# ifdef USE_AS_STRNCAT
> -	mov	%rdx, %r8
> -# endif
> -
> -	xor	%eax, %eax
> -	mov	%edi, %ecx
> -	and	$((VEC_SIZE * 4) - 1), %ecx
> -	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
> -	cmp	$(VEC_SIZE * 3), %ecx
> -	ja	L(fourth_vector_boundary)
> -	vpcmpb	$0, (%rdi), %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_first_vector)
> -	mov	%rdi, %rax
> -	and	$-VEC_SIZE, %rax
> -	jmp	L(align_vec_size_start)
> -L(fourth_vector_boundary):
> -	mov	%rdi, %rax
> -	and	$-VEC_SIZE, %rax
> -	vpcmpb	$0, (%rax), %YMMZERO, %k0
> -	mov	$-1, %r10d
> -	sub	%rax, %rcx
> -	shl	%cl, %r10d
> -	kmovd	%k0, %edx
> -	and	%r10d, %edx
> -	jnz	L(exit)
> -
> -L(align_vec_size_start):
> -	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -	add	$(VEC_SIZE * 4), %rax
> -	kmovd	%k4, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -	kmovd	%k4, %edx
> -	add	$(VEC_SIZE * 4), %rax
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -	add	$(VEC_SIZE * 4), %rax
> -	kmovd	%k4, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
> -	add	$(VEC_SIZE * 5), %rax
> -	kmovd	%k4, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
> -	add	$VEC_SIZE, %rax
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
> -	add	$VEC_SIZE, %rax
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
> -	add	$VEC_SIZE, %rax
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	add	$VEC_SIZE, %rax
> -
> -	.p2align 4
> -L(align_four_vec_loop):
> -	VMOVA	(%rax), %YMM0
> -	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
> -	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
> -	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
> -	vpminub	%YMM0, %YMM1, %YMM0
> -	/* If K0 != 0, there is a null byte.  */
> -	vpcmpb	$0, %YMM0, %YMMZERO, %k0
> -	add	$(VEC_SIZE * 4), %rax
> -	ktestd	%k0, %k0
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
> -	sub	$(VEC_SIZE * 5), %rax
> -	kmovd	%k0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 4), %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit):
> -	sub	%rdi, %rax
> -L(exit_null_on_first_vector):
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_second_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$VEC_SIZE, %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_third_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 2), %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_fourth_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 3), %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_fifth_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 4), %rax
> -
> -	.p2align 4
> -L(StartStrcpyPart):
> -	lea	(%r9, %rax), %rdi
> -	mov	%rsi, %rcx
> -	mov	%r9, %rax      /* save result */
> -
> -# ifdef USE_AS_STRNCAT
> -	test	%r8, %r8
> -	jz	L(ExitZero)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-evex.S"
> +#ifndef STRCAT
> +# define STRCAT	__strcat_evex
>  #endif
> +
> +#define USE_AS_STRCAT
> +#define STRCPY	STRCAT
> +#include "strcpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
> new file mode 100644
> index 0000000000..9530d7b683
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
> @@ -0,0 +1,110 @@
> +/* strlen used for beginning of str{n}cat using EVEX 256/512.
> +   Copyright (C) 2011-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +/* NOTE: This file is meant to be included by strcat-evex or
> +   strncat-evex and is not standalone.  Before including it, %rdi
> +   must be saved in %rax.  */
> +
> +
> +/* Simple strlen implementation that ends at
> +   L(strcat_strlen_done).  */
> +	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
> +	movq	%rdi, %r8
> +	andq	$(VEC_SIZE * -1), %r8
> +	VPCMPEQ	(%r8), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +#ifdef USE_AS_WCSCPY
> +	subl	%r8d, %edi
> +	shrl	$2, %edi
> +#endif
> +	shrx	%VRDI, %VRCX, %VRCX
> +#ifdef USE_AS_WCSCPY
> +	movq	%rax, %rdi
> +#endif
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v0)
> +
> +
> +	VPCMPEQ	VEC_SIZE(%r8), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +	leaq	(VEC_SIZE)(%r8), %rdi
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v0)
> +
> +	VPCMPEQ	(VEC_SIZE * 2)(%r8), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v1)
> +
> +	VPCMPEQ	(VEC_SIZE * 3)(%r8), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v2)
> +
> +	VPCMPEQ	(VEC_SIZE * 4)(%r8), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v3)
> +
> +	andq	$-(VEC_SIZE * 4), %rdi
> +	.p2align 4,, 8
> +L(loop_2x_vec):
> +	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(0)
> +	VPMIN	(VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
> +	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(2)
> +	VPMIN	(VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
> +	VPTESTN	%VMM(1), %VMM(1), %k1
> +	VPTESTN	%VMM(3), %VMM(3), %k3
> +	subq	$(VEC_SIZE * -4), %rdi
> +	KORTEST	%k1, %k3
> +	jz	L(loop_2x_vec)
> +
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v0)
> +
> +	KMOV	%k1, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v1)
> +
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(bsf_and_done_v2)
> +
> +	KMOV	%k3, %VRCX
> +L(bsf_and_done_v3):
> +	addq	$VEC_SIZE, %rdi
> +L(bsf_and_done_v2):
> +	bsf	%VRCX, %VRCX
> +	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
> +	jmp	L(strcat_strlen_done)
> +
> +	.p2align 4,, 4
> +L(bsf_and_done_v1):
> +	addq	$VEC_SIZE, %rdi
> +L(bsf_and_done_v0):
> +	bsf	%VRCX, %VRCX
> +#ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +#else
> +	addq	%rcx, %rdi
> +#endif
> +L(strcat_strlen_done):
> diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
> index 82e45ac675..932129ab40 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
> @@ -1,4 +1,4 @@
> -/* strcpy with 256-bit EVEX instructions.
> +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
>     Copyright (C) 2021-2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>  
> @@ -17,990 +17,526 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  #include <isa-level.h>
> -
>  #if ISA_SHOULD_BUILD (4)
>  
>  
> -# ifndef USE_AS_STRCAT
> -#  include <sysdep.h>
> +	/* Use evex-masked stores for small sizes. Turned off at the
> +	   moment.  */
> +# define USE_EVEX_MASKED_STORE	0
> +	/* Use movsb in page cross case to save code size.  */
> +# define USE_MOVSB_IN_PAGE_CROSS	1
>  
> -#  ifndef STRCPY
> -#   define STRCPY  __strcpy_evex
> -#  endif
> +# include <sysdep.h>
>  
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
>  # endif
>  
> -# define VMOVU		vmovdqu64
> -# define VMOVA		vmovdqa64
> -
> -/* Number of bytes in a vector register */
> -# ifndef VEC_SIZE
> -#  define VEC_SIZE	32
> +# ifndef STRCPY
> +#  define STRCPY	__strcpy_evex
>  # endif
>  
> -# define XMM2		xmm18
> -# define XMM3		xmm19
>  
> -# define YMM2		ymm18
> -# define YMM3		ymm19
> -# define YMM4		ymm20
> -# define YMM5		ymm21
> -# define YMM6		ymm22
> -# define YMM7		ymm23
> +# ifdef USE_AS_WCSCPY
> +#  define VMOVU_MASK	vmovdqu32
> +#  define VPMIN	vpminud
> +#  define VPTESTN	vptestnmd
> +#  define VPTEST	vptestmd
> +#  define VPCMPEQ	vpcmpeqd
> +#  define CHAR_SIZE	4
>  
> -# ifndef USE_AS_STRCAT
> +#  define REP_MOVS	rep movsd
>  
> -/* zero register */
> -#  define XMMZERO	xmm16
> -#  define YMMZERO	ymm16
> -#  define YMM1		ymm17
> -
> -	.section .text.evex,"ax",@progbits
> -ENTRY (STRCPY)
> -#  ifdef USE_AS_STRNCPY
> -	mov	%RDX_LP, %R8_LP
> -	test	%R8_LP, %R8_LP
> -	jz	L(ExitZero)
> -#  endif
> -	mov	%rsi, %rcx
> -#  ifndef USE_AS_STPCPY
> -	mov	%rdi, %rax      /* save result */
> -#  endif
> +#  define USE_WIDE_CHAR
> +# else
> +#  define VMOVU_MASK	vmovdqu8
> +#  define VPMIN	vpminub
> +#  define VPTESTN	vptestnmb
> +#  define VPTEST	vptestmb
> +#  define VPCMPEQ	vpcmpeqb
> +#  define CHAR_SIZE	1
>  
> -	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
> +#  define REP_MOVS	rep movsb
>  # endif
>  
> -	and	$((VEC_SIZE * 4) - 1), %ecx
> -	cmp	$(VEC_SIZE * 2), %ecx
> -	jbe	L(SourceStringAlignmentLessTwoVecSize)
> -
> -	and	$-VEC_SIZE, %rsi
> -	and	$(VEC_SIZE - 1), %ecx
> -
> -	vpcmpb	$0, (%rsi), %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	shr	%cl, %rdx
> +# include "reg-macros.h"
>  
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -	mov	$VEC_SIZE, %r10
> -	sub	%rcx, %r10
> -	cmp	%r10, %r8
> -#  else
> -	mov	$(VEC_SIZE + 1), %r10
> -	sub	%rcx, %r10
> -	cmp	%r10, %r8
> -#  endif
> -	jbe	L(CopyVecSizeTailCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeTail)
> -
> -	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
> -	kmovd	%k1, %edx
>  
> -# ifdef USE_AS_STRNCPY
> -	add	$VEC_SIZE, %r10
> -	cmp	%r10, %r8
> -	jbe	L(CopyTwoVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyTwoVecSize)
> -
> -	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
> -	VMOVU	%YMM2, (%rdi)
> -
> -/* If source address alignment != destination address alignment */
> -	.p2align 4
> -L(UnalignVecSizeBoth):
> -	sub	%rcx, %rdi
> -# ifdef USE_AS_STRNCPY
> -	add	%rcx, %r8
> -	sbb	%rcx, %rcx
> -	or	%rcx, %r8
> -# endif
> -	mov	$VEC_SIZE, %rcx
> -	VMOVA	(%rsi, %rcx), %YMM2
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$(VEC_SIZE * 3), %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec2)
> +# ifdef USE_AS_STPCPY
> +#  define END_REG	rax
>  # else
> -	jnz	L(CopyVecSize)
> +#  define END_REG	rdi, %rdx, CHAR_SIZE
>  # endif
>  
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
> -	vpcmpb	$0, %YMM3, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec3)
> +# ifdef USE_AS_STRCAT
> +#  define PAGE_ALIGN_REG	edx
> +#  define PAGE_ALIGN_REG_64	rdx
>  # else
> -	jnz	L(CopyVecSize)
> +#  define PAGE_ALIGN_REG	eax
> +#  define PAGE_ALIGN_REG_64	rax
>  # endif
>  
> -	VMOVU	%YMM3, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
> -	vpcmpb	$0, %YMM4, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec4)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
> +# define VZERO	VMM(7)
> +# define VZERO_128	VMM_128(7)
>  
> -	VMOVU	%YMM4, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec2)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
>  
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec2)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
> +# define PAGE_SIZE	4096
> +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
>  
> -	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -	vpcmpb	$0, %YMM3, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec3)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
>  
> -	VMOVU	%YMM3, (%rdi, %rcx)
> -	mov	%rsi, %rdx
> -	lea	VEC_SIZE(%rsi, %rcx), %rsi
> -	and	$-(VEC_SIZE * 4), %rsi
> -	sub	%rsi, %rdx
> -	sub	%rdx, %rdi
> -# ifdef USE_AS_STRNCPY
> -	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
> -# endif
> -L(UnalignedFourVecSizeLoop):
> -	VMOVA	(%rsi), %YMM4
> -	VMOVA	VEC_SIZE(%rsi), %YMM5
> -	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
> -	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
> -	vpminub	%YMM5, %YMM4, %YMM2
> -	vpminub	%YMM7, %YMM6, %YMM3
> -	vpminub	%YMM2, %YMM3, %YMM2
> -	/* If K7 != 0, there is a null byte.  */
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k7
> -	kmovd	%k7, %edx
> -# ifdef USE_AS_STRNCPY
> -	sub	$(VEC_SIZE * 4), %r8
> -	jbe	L(UnalignedLeaveCase2OrCase3)
> +	.section SECTION(.text), "ax", @progbits
> +ENTRY(STRCPY)
> +# ifdef USE_AS_STRCAT
> +	movq	%rdi, %rax
> +#  include "strcat-strlen-evex.h.S"
>  # endif
> -	test	%edx, %edx
> -	jnz	L(UnalignedFourVecSizeLeave)
> -
> -L(UnalignedFourVecSizeLoop_start):
> -	add	$(VEC_SIZE * 4), %rdi
> -	add	$(VEC_SIZE * 4), %rsi
> -	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
> -	VMOVA	(%rsi), %YMM4
> -	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
> -	VMOVA	VEC_SIZE(%rsi), %YMM5
> -	vpminub	%YMM5, %YMM4, %YMM2
> -	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
> -	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
> -	VMOVU	%YMM7, -VEC_SIZE(%rdi)
> -	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
> -	vpminub	%YMM7, %YMM6, %YMM3
> -	vpminub	%YMM2, %YMM3, %YMM2
> -	/* If K7 != 0, there is a null byte.  */
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k7
> -	kmovd	%k7, %edx
> -# ifdef USE_AS_STRNCPY
> -	sub	$(VEC_SIZE * 4), %r8
> -	jbe	L(UnalignedLeaveCase2OrCase3)
> +
> +	movl	%esi, %PAGE_ALIGN_REG
> +	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> +	ja	L(page_cross)
> +L(page_cross_continue):
> +	VMOVU	(%rsi), %VMM(0)
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +	movq	%rdi, %rax
>  # endif
> -	test	%edx, %edx
> -	jz	L(UnalignedFourVecSizeLoop_start)
>  
> -L(UnalignedFourVecSizeLeave):
> -	vpcmpb	$0, %YMM4, %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeUnaligned_0)
>  
> -	vpcmpb	$0, %YMM5, %YMMZERO, %k2
> -	kmovd	%k2, %ecx
> -	test	%ecx, %ecx
> -	jnz	L(CopyVecSizeUnaligned_16)
> +	/* Two short string implementations.  One with a traditional
> +	   branching approach and one with masked instructions (which
> +	   have the potential for dramatically bad perf if dst splits
> +	   a page and is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +	VPTEST	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +#  ifdef USE_AS_WCSCPY
> +	subl	$((1 << CHAR_PER_VEC)- 1), %VRCX
> +#  else
> +	inc	%VRCX
> +#  endif
> +	jz	L(more_1x_vec)
> +	KMOV	%VRCX, %k1
> +	KXOR	%k0, %k1, %k1
>  
> -	vpcmpb	$0, %YMM6, %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeUnaligned_32)
> -
> -	vpcmpb	$0, %YMM7, %YMMZERO, %k4
> -	kmovd	%k4, %ecx
> -	bsf	%ecx, %edx
> -	VMOVU	%YMM4, (%rdi)
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
> -# endif
> -	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
> -	add	$(VEC_SIZE - 1), %r8
> -	sub	%rdx, %r8
> -	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	add	$(VEC_SIZE * 3), %rsi
> -	add	$(VEC_SIZE * 3), %rdi
> -	jmp	L(CopyVecSizeExit)
> -# endif
> +	VMOVU_MASK %VMM(0), (%rdi){%k1}
>  
> -/* If source address alignment == destination address alignment */
> +#  ifdef USE_AS_STPCPY
> +	bsf	%VRCX, %VRCX
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
> +#  endif
> +	ret
>  
> -L(SourceStringAlignmentLessTwoVecSize):
> -	VMOVU	(%rsi), %YMM3
> -	VMOVU	VEC_SIZE(%rsi), %YMM2
> -	vpcmpb	$0, %YMM3, %YMMZERO, %k0
> -	kmovd	%k0, %edx
> +# else
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jz	L(more_1x_vec)
>  
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -	cmp	$VEC_SIZE, %r8
> +	xorl	%edx, %edx
> +	bsf	%VRCX, %VRDX
> +#  ifdef USE_AS_STPCPY
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#  endif
> +
> +	/* Use mask bits in rcx to detect which copy we need.  If the
> +	   low mask is zero then there must be a bit set in the upper
> +	   half.  I.e. if rcx != 0 and ecx == 0, then the match must be
> +	   in the upper 32 bits, so we use L(copy_32_63).  */
> +#  if VEC_SIZE == 64
> +#   ifdef USE_AS_WCSCPY
> +	testb	%cl, %cl
> +#   else
> +	testl	%ecx, %ecx
> +#   endif
> +	jz	L(copy_32_63)
> +#  endif
> +
> +#  ifdef USE_AS_WCSCPY
> +	testb	$0xf, %cl
>  #  else
> -	cmp	$(VEC_SIZE + 1), %r8
> +	testw	%cx, %cx
>  #  endif
> -	jbe	L(CopyVecSizeTail1Case2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeTail1)
> +	jz	L(copy_16_31)
>  
> -	VMOVU	%YMM3, (%rdi)
> -	vpcmpb	$0, %YMM2, %YMMZERO, %k0
> -	kmovd	%k0, %edx
>  
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -	cmp	$(VEC_SIZE * 2), %r8
> +#  ifdef USE_AS_WCSCPY
> +	testb	$0x3, %cl
>  #  else
> -	cmp	$((VEC_SIZE * 2) + 1), %r8
> +	testb	%cl, %cl
>  #  endif
> -	jbe	L(CopyTwoVecSize1Case2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyTwoVecSize1)
> -
> -	and	$-VEC_SIZE, %rsi
> -	and	$(VEC_SIZE - 1), %ecx
> -	jmp	L(UnalignVecSizeBoth)
> +	jz	L(copy_8_15)
>  
> -/*------End of main part with loops---------------------*/
>  
> -/* Case1 */
> +#  ifdef USE_AS_WCSCPY
> +	vmovd	%VMM_128(0), (%rdi)
> +	/* No need to copy, we know it's zero.  */
> +	movl	$0, (%END_REG)
>  
> -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> -	.p2align 4
> -L(CopyVecSize):
> -	add	%rcx, %rdi
> -# endif
> -L(CopyVecSizeTail):
> -	add	%rcx, %rsi
> -L(CopyVecSizeTail1):
> -	bsf	%edx, %edx
> -L(CopyVecSizeExit):
> -	cmp	$32, %edx
> -	jae	L(Exit32_63)
> -	cmp	$16, %edx
> -	jae	L(Exit16_31)
> -	cmp	$8, %edx
> -	jae	L(Exit8_15)
> -	cmp	$4, %edx
> -	jae	L(Exit4_7)
> -	cmp	$3, %edx
> -	je	L(Exit3)
> -	cmp	$1, %edx
> -	ja	L(Exit2)
> -	je	L(Exit1)
> -	movb	$0, (%rdi)
> -# ifdef USE_AS_STPCPY
> -	lea	(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$1, %r8
> -	lea	1(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> -# endif
>  	ret
> +#  else
>  
> -	.p2align 4
> -L(CopyTwoVecSize1):
> -	add	$VEC_SIZE, %rsi
> -	add	$VEC_SIZE, %rdi
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$VEC_SIZE, %r8
> -# endif
> -	jmp	L(CopyVecSizeTail1)
> -
> -	.p2align 4
> -L(CopyTwoVecSize):
> -	bsf	%edx, %edx
> -	add	%rcx, %rsi
> -	add	$VEC_SIZE, %edx
> -	sub	%ecx, %edx
> -	jmp	L(CopyVecSizeExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnaligned_0):
> -	bsf	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -	VMOVU	%YMM4, (%rdi)
> -	add	$((VEC_SIZE * 4) - 1), %r8
> -	sub	%rdx, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	jmp	L(CopyVecSizeExit)
> -# endif
> +	testb	$0x7, %cl
> +	jz	L(copy_4_7)
>  
> -	.p2align 4
> -L(CopyVecSizeUnaligned_16):
> -	bsf	%ecx, %edx
> -	VMOVU	%YMM4, (%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	VEC_SIZE(%rdi, %rdx), %rax
> -# endif
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -	add	$((VEC_SIZE * 3) - 1), %r8
> -	sub	%rdx, %r8
> -	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	add	$VEC_SIZE, %rsi
> -	add	$VEC_SIZE, %rdi
> -	jmp	L(CopyVecSizeExit)
> -# endif
>  
> -	.p2align 4
> -L(CopyVecSizeUnaligned_32):
> -	bsf	%edx, %edx
> -	VMOVU	%YMM4, (%rdi)
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
> -# endif
> -	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
> -	add	$((VEC_SIZE * 2) - 1), %r8
> -	sub	%rdx, %r8
> -	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	add	$(VEC_SIZE * 2), %rsi
> -	add	$(VEC_SIZE * 2), %rdi
> -	jmp	L(CopyVecSizeExit)
> -# endif
> +	test	%edx, %edx
> +	jz	L(set_null_term)
>  
> -# ifdef USE_AS_STRNCPY
> -#  ifndef USE_AS_STRCAT
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec6):
> -	VMOVU	%YMM6, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec5):
> -	VMOVU	%YMM5, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec4):
> -	VMOVU	%YMM4, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec3):
> -	VMOVU	%YMM3, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> +	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
> +	 */
> +	vmovd	%VMM_128(0), %esi
> +	movw	%si, (%rdi)
> +
> +	.p2align 4,, 1
> +L(set_null_term):
> +	/* No need to copy, we know it's zero.  */
> +	movb	$0, (%END_REG)
> +	ret
>  #  endif
>  
> -/* Case2 */
> -
> -	.p2align 4
> -L(CopyVecSizeCase2):
> -	add	$VEC_SIZE, %r8
> -	add	%rcx, %rdi
> -	add	%rcx, %rsi
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyTwoVecSizeCase2):
> -	add	%rcx, %rsi
> -	bsf	%edx, %edx
> -	add	$VEC_SIZE, %edx
> -	sub	%ecx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -L(CopyVecSizeTailCase2):
> -	add	%rcx, %rsi
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -L(CopyVecSizeTail1Case2):
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -/* Case2 or Case3,  Case3 */
> -
> -	.p2align 4
> -L(CopyVecSizeCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyVecSizeCase2)
> -L(CopyVecSizeCase3):
> -	add	$VEC_SIZE, %r8
> -	add	%rcx, %rdi
> -	add	%rcx, %rsi
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyTwoVecSizeCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyTwoVecSizeCase2)
> -	add	%rcx, %rsi
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyVecSizeTailCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyVecSizeTailCase2)
> -	add	%rcx, %rsi
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyTwoVecSize1Case2OrCase3):
> -	add	$VEC_SIZE, %rdi
> -	add	$VEC_SIZE, %rsi
> -	sub	$VEC_SIZE, %r8
> -L(CopyVecSizeTail1Case2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyVecSizeTail1Case2)
> -	jmp	L(StrncpyExit)
> +#  if VEC_SIZE == 64
> +	.p2align 4,, 6
> +L(copy_32_63):
> +	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +	VMOVU	%VMM_256(0), (%rdi)
> +	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
> +	ret
> +#  endif
> +
> +
> +	.p2align 4,, 6
> +L(copy_16_31):
> +	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
> +	   and will save code size.  */
> +	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +	VMOVU	%VMM_128(0), (%rdi)
> +	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
> +	ret
> +
> +	.p2align 4,, 8
> +L(copy_8_15):
> +#  ifdef USE_AS_WCSCPY
> +	movl	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> +#  else
> +	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
> +#  endif
> +	vmovq	%VMM_128(0), (%rdi)
> +	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
> +	ret
>  # endif
>  
> -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
>  
> -	.p2align 4
> -L(Exit1):
> -	movzwl	(%rsi), %edx
> -	mov	%dx, (%rdi)
> -# ifdef USE_AS_STPCPY
> -	lea	1(%rdi), %rax
> +# ifndef USE_AS_WCSCPY
> +	.p2align 4,, 12
> +L(copy_4_7):
> +	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
> +	vmovd	%VMM_128(0), (%rdi)
> +	movl	%ecx, -(4 - CHAR_SIZE)(%END_REG)
> +	ret
>  # endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$2, %r8
> -	lea	2(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +
> +
> +	.p2align 4,, 8
> +L(more_1x_vec):
> +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> +	VMOVU	%VMM(0), (%rdi)
>  # endif
> -	ret
> +	subq	%rsi, %rdi
> +	andq	$-(VEC_SIZE), %rsi
> +	addq	%rsi, %rdi
> +	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
>  
> -	.p2align 4
> -L(Exit2):
> -	movzwl	(%rsi), %ecx
> -	mov	%cx, (%rdi)
> -	movb	$0, 2(%rdi)
> +	/* Ideally we store after moves to minimize impact of potential
> +	   false-dependencies.  */
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +	VMOVU	%VMM(0), (%rax)
> +# endif
> +
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x1)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
> +	VMOVU	%VMM(1), VEC_SIZE(%rdi)
> +
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x2)
> +
> +	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
> +
> +	VPTESTN	%VMM(3), %VMM(3), %k0
> +	KMOV	%k0, %VRDX
> +	test	%VRDX, %VRDX
> +	jnz	L(ret_vec_x3)
> +
> +	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	VPTESTN	%VMM(4), %VMM(4), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x4)
> +
> +	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
> +
> +
> +	/* Align for 4x loop.  */
> +	subq	%rsi, %rdi
> +
> +	/* + VEC_SIZE * 5 because we never added the original VEC_SIZE
> +	   we covered before aligning.  */
> +	subq	$-(VEC_SIZE * 5), %rsi
> +	andq	$-(VEC_SIZE * 4), %rsi
> +
> +
> +	/* Load first half of the loop before entry.  */
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +	KORTEST	%k2, %k4
> +	jnz	L(loop_4x_done)
> +
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +
> +	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +
> +	subq	$(VEC_SIZE * -4), %rsi
> +
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +	KORTEST	%k2, %k4
> +	jz	L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	/* Restore rdi (dst).  */
> +	addq	%rsi, %rdi
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x0_end)
> +	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +
> +	KMOV	%k2, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x1)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x2)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +	/* Place L(ret_vec_x4) here to save code size.  We get a
> +	   meaningful benefit doing this for stpcpy.  */
> +	KMOV	%k4, %VRDX
> +L(ret_vec_x3):
> +	bsf	%VRDX, %VRDX
> +	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -	lea	2(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$3, %r8
> -	lea	3(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	(VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
>  # endif
> +L(return_end):
>  	ret
>  
> -	.p2align 4
> -L(Exit3):
> -	mov	(%rsi), %edx
> -	mov	%edx, (%rdi)
> +	.p2align 4,, 6
> +L(ret_vec_x0_end):
> +	bsf	%VRCX, %VRCX
>  # ifdef USE_AS_STPCPY
> -	lea	3(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$4, %r8
> -	lea	4(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
> +	inc	%VRCX
> +	VMOVU	(-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  	ret
>  
> -	.p2align 4
> -L(Exit4_7):
> -	mov	(%rsi), %ecx
> -	mov	%ecx, (%rdi)
> -	mov	-3(%rsi, %rdx), %ecx
> -	mov	%ecx, -3(%rdi, %rdx)
> +	.p2align 4,, 8
> +L(ret_vec_x1):
> +	bsf	%VRCX, %VRCX
> +	VMOVU	(VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	%rdx, %r8
> -	sub	$1, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>  	ret
>  
> -	.p2align 4
> -L(Exit8_15):
> -	mov	(%rsi), %rcx
> -	mov	-7(%rsi, %rdx), %r9
> -	mov	%rcx, (%rdi)
> -	mov	%r9, -7(%rdi, %rdx)
> +	.p2align 4,, 4
> +L(ret_vec_x2):
> +	bsf	%VRCX, %VRCX
> +	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	%rdx, %r8
> -	sub	$1, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>  	ret
>  
> -	.p2align 4
> -L(Exit16_31):
> -	VMOVU	(%rsi), %XMM2
> -	VMOVU	-15(%rsi, %rdx), %XMM3
> -	VMOVU	%XMM2, (%rdi)
> -	VMOVU	%XMM3, -15(%rdi, %rdx)
> +	/* ret_vec_x3 reuses return code after the loop.  */
> +	.p2align 4,, 6
> +L(ret_vec_x4):
> +	bsf	%VRCX, %VRCX
> +	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
>  # ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub %rdx, %r8
> -	sub $1, %r8
> -	lea 1(%rdi, %rdx), %rdi
> -	jnz L(StrncpyFillTailWithZero)
> +	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
>  # endif
>  	ret
>  
> -	.p2align 4
> -L(Exit32_63):
> -	VMOVU	(%rsi), %YMM2
> -	VMOVU	-31(%rsi, %rdx), %YMM3
> -	VMOVU	%YMM2, (%rdi)
> -	VMOVU	%YMM3, -31(%rdi, %rdx)
> -# ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> +
> +	.p2align 4,, 4
> +L(page_cross):
> +# ifndef USE_AS_STRCAT
> +	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
>  # endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	%rdx, %r8
> -	sub	$1, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	movq	%rsi, %rcx
> +	andq	$(VEC_SIZE * -1), %rcx
> +
> +	VPCMPEQ	(%rcx), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +# ifdef USE_AS_WCSCPY
> +	andl	$(VEC_SIZE - 1), %PAGE_ALIGN_REG
> +	shrl	$2, %PAGE_ALIGN_REG
>  # endif
> -	ret
> +	shrx	%VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
>  
> -# ifdef USE_AS_STRNCPY
> +# if USE_MOVSB_IN_PAGE_CROSS
> +	/* Optimizing more aggressively for space as this is very cold
> +	   code. This saves 2x cache lines.  */
>  
> -	.p2align 4
> -L(StrncpyExit1):
> -	movzbl	(%rsi), %edx
> -	mov	%dl, (%rdi)
> -#  ifdef USE_AS_STPCPY
> -	lea	1(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, 1(%rdi)
> +	/* This adds one to the later result, which will give the
> +	   correct copy bounds.  NB: this can never zero-out a non-zero
> +	   RCX as, to be in the page cross case, rsi cannot be aligned
> +	   and we already right-shift rcx by the misalignment.  */
> +	shl	%VRCX
> +	jz	L(page_cross_continue)
> +#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +	movq	%rdi, %rax
>  #  endif
> -	ret
> +	bsf	%VRCX, %VRCX
> +	REP_MOVS
>  
> -	.p2align 4
> -L(StrncpyExit2):
> -	movzwl	(%rsi), %edx
> -	mov	%dx, (%rdi)
>  #  ifdef USE_AS_STPCPY
> -	lea	2(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, 2(%rdi)
> +	leaq	-CHAR_SIZE(%rdi), %rax
>  #  endif
>  	ret
>  
> -	.p2align 4
> -L(StrncpyExit3_4):
> -	movzwl	(%rsi), %ecx
> -	movzwl	-2(%rsi, %r8), %edx
> -	mov	%cx, (%rdi)
> -	mov	%dx, -2(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	ret
>  
> -	.p2align 4
> -L(StrncpyExit5_8):
> -	mov	(%rsi), %ecx
> -	mov	-4(%rsi, %r8), %edx
> -	mov	%ecx, (%rdi)
> -	mov	%edx, -4(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	ret
> +# else
> +	/* Check if we found zero-char before end of page.  */
> +	test	%VRCX, %VRCX
> +	jz	L(page_cross_continue)
>  
> -	.p2align 4
> -L(StrncpyExit9_16):
> -	mov	(%rsi), %rcx
> -	mov	-8(%rsi, %r8), %rdx
> -	mov	%rcx, (%rdi)
> -	mov	%rdx, -8(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	ret
> +	/* Traditional copy case, essentially same as used in non-page-
> +	   cross case but since we can't reuse VMM(0) we need twice as
> +	   many loads from rsi.  */
>  
> -	.p2align 4
> -L(StrncpyExit17_32):
> -	VMOVU	(%rsi), %XMM2
> -	VMOVU	-16(%rsi, %r8), %XMM3
> -	VMOVU	%XMM2, (%rdi)
> -	VMOVU	%XMM3, -16(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> +#  ifndef USE_AS_STRCAT
> +	xorl	%edx, %edx
>  #  endif
> -	ret
> -
> -	.p2align 4
> -L(StrncpyExit33_64):
> -	/*  0/32, 31/16 */
> -	VMOVU	(%rsi), %YMM2
> -	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
> -	VMOVU	%YMM2, (%rdi)
> -	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
> +	/* Dependency on rdi must already have been satisfied.  */
> +	bsf	%VRCX, %VRDX
>  #  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#  elif !defined USE_AS_STRCAT
> +	movq	%rdi, %rax
>  #  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	ret
>  
> -	.p2align 4
> -L(StrncpyExit65):
> -	/* 0/32, 32/32, 64/1 */
> -	VMOVU	(%rsi), %YMM2
> -	VMOVU	32(%rsi), %YMM3
> -	mov	64(%rsi), %cl
> -	VMOVU	%YMM2, (%rdi)
> -	VMOVU	%YMM3, 32(%rdi)
> -	mov	%cl, 64(%rdi)
> -#  ifdef USE_AS_STPCPY
> -	lea	65(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, 65(%rdi)
> +#  if VEC_SIZE == 64
> +#   ifdef USE_AS_WCSCPY
> +	testb	%cl, %cl
> +#   else
> +	test	%ecx, %ecx
> +#   endif
> +	jz	L(page_cross_copy_32_63)
>  #  endif
> -	ret
> -
> -#  ifndef USE_AS_STRCAT
>  
> -	.p2align 4
> -L(Fill1):
> -	mov	%dl, (%rdi)
> -	ret
> +#  ifdef USE_AS_WCSCPY
> +	testb	$0xf, %cl
> +#  else
> +	testw	%cx, %cx
> +#  endif
> +	jz	L(page_cross_copy_16_31)
>  
> -	.p2align 4
> -L(Fill2):
> -	mov	%dx, (%rdi)
> -	ret
> +#  ifdef USE_AS_WCSCPY
> +	testb	$0x3, %cl
> +#  else
> +	testb	%cl, %cl
> +#  endif
> +	jz	L(page_cross_copy_8_15)
>  
> -	.p2align 4
> -L(Fill3_4):
> -	mov	%dx, (%rdi)
> -	mov     %dx, -2(%rdi, %r8)
> +#  ifdef USE_AS_WCSCPY
> +	movl	(%rsi), %esi
> +	movl	%esi, (%rdi)
> +	movl	$0, (%END_REG)
>  	ret
> +#  else
>  
> -	.p2align 4
> -L(Fill5_8):
> -	mov	%edx, (%rdi)
> -	mov     %edx, -4(%rdi, %r8)
> -	ret
> +	testb	$0x7, %cl
> +	jz	L(page_cross_copy_4_7)
>  
> -	.p2align 4
> -L(Fill9_16):
> -	mov	%rdx, (%rdi)
> -	mov	%rdx, -8(%rdi, %r8)
> +	test	%edx, %edx
> +	jz	L(page_cross_set_null_term)
> +	movzwl	(%rsi), %ecx
> +	movw	%cx, (%rdi)
> +L(page_cross_set_null_term):
> +	movb	$0, (%END_REG)
>  	ret
>  
> -	.p2align 4
> -L(Fill17_32):
> -	VMOVU	%XMMZERO, (%rdi)
> -	VMOVU	%XMMZERO, -16(%rdi, %r8)
> -	ret
>  
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec2):
> -	VMOVU	%YMM2, (%rdi, %rcx)
> -
> -	.p2align 4
> -L(CopyVecSizeVecExit):
> -	bsf	%edx, %edx
> -	add	$(VEC_SIZE - 1), %r8
> -	add	%rcx, %rdi
> -#   ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -#   endif
> -	sub	%rdx, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -
> -	.p2align 4
> -L(StrncpyFillTailWithZero):
> -	xor	%edx, %edx
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(StrncpyFillExit)
> -
> -	VMOVU	%YMMZERO, (%rdi)
> -	add	$VEC_SIZE, %rdi
> -
> -	mov	%rdi, %rsi
> -	and	$(VEC_SIZE - 1), %esi
> -	sub	%rsi, %rdi
> -	add	%rsi, %r8
> -	sub	$(VEC_SIZE * 4), %r8
> -	jb	L(StrncpyFillLessFourVecSize)
> -
> -L(StrncpyFillLoopVmovdqa):
> -	VMOVA	%YMMZERO, (%rdi)
> -	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
> -	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
> -	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
> -	add	$(VEC_SIZE * 4), %rdi
> -	sub	$(VEC_SIZE * 4), %r8
> -	jae	L(StrncpyFillLoopVmovdqa)
> -
> -L(StrncpyFillLessFourVecSize):
> -	add	$(VEC_SIZE * 2), %r8
> -	jl	L(StrncpyFillLessTwoVecSize)
> -	VMOVA	%YMMZERO, (%rdi)
> -	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
> -	add	$(VEC_SIZE * 2), %rdi
> -	sub	$VEC_SIZE, %r8
> -	jl	L(StrncpyFillExit)
> -	VMOVA	%YMMZERO, (%rdi)
> -	add	$VEC_SIZE, %rdi
> -	jmp	L(Fill)
> -
> -	.p2align 4
> -L(StrncpyFillLessTwoVecSize):
> -	add	$VEC_SIZE, %r8
> -	jl	L(StrncpyFillExit)
> -	VMOVA	%YMMZERO, (%rdi)
> -	add	$VEC_SIZE, %rdi
> -	jmp	L(Fill)
> -
> -	.p2align 4
> -L(StrncpyFillExit):
> -	add	$VEC_SIZE, %r8
> -L(Fill):
> -	cmp	$17, %r8d
> -	jae	L(Fill17_32)
> -	cmp	$9, %r8d
> -	jae	L(Fill9_16)
> -	cmp	$5, %r8d
> -	jae	L(Fill5_8)
> -	cmp	$3, %r8d
> -	jae	L(Fill3_4)
> -	cmp	$1, %r8d
> -	ja	L(Fill2)
> -	je	L(Fill1)
> +	.p2align 4,, 4
> +L(page_cross_copy_4_7):
> +	movl	(%rsi), %ecx
> +	movl	-(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
> +	movl	%ecx, (%rdi)
> +	movl	%esi, -(4 - CHAR_SIZE)(%END_REG)
>  	ret
> -
> -/* end of ifndef USE_AS_STRCAT */
>  #  endif
>  
> -	.p2align 4
> -L(UnalignedLeaveCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(UnalignedFourVecSizeLeaveCase2)
> -L(UnalignedFourVecSizeLeaveCase3):
> -	lea	(VEC_SIZE * 4)(%r8), %rcx
> -	and	$-VEC_SIZE, %rcx
> -	add	$(VEC_SIZE * 3), %r8
> -	jl	L(CopyVecSizeCase3)
> -	VMOVU	%YMM4, (%rdi)
> -	sub	$VEC_SIZE, %r8
> -	jb	L(CopyVecSizeCase3)
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -	sub	$VEC_SIZE, %r8
> -	jb	L(CopyVecSizeCase3)
> -	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
> -	sub	$VEC_SIZE, %r8
> -	jb	L(CopyVecSizeCase3)
> -	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
> -#  ifdef USE_AS_STPCPY
> -	lea	(VEC_SIZE * 4)(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (VEC_SIZE * 4)(%rdi)
> -#  endif
> +#  if VEC_SIZE == 64
> +	.p2align 4,, 4
> +L(page_cross_copy_32_63):
> +	VMOVU	(%rsi), %VMM_256(0)
> +	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +	VMOVU	%VMM_256(0), (%rdi)
> +	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
>  	ret
> -
> -	.p2align 4
> -L(UnalignedFourVecSizeLeaveCase2):
> -	xor	%ecx, %ecx
> -	vpcmpb	$0, %YMM4, %YMMZERO, %k1
> -	kmovd	%k1, %edx
> -	add	$(VEC_SIZE * 3), %r8
> -	jle	L(CopyVecSizeCase2OrCase3)
> -	test	%edx, %edx
> -#  ifndef USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec4)
> -#  else
> -	jnz	L(CopyVecSize)
> -#  endif
> -	vpcmpb	$0, %YMM5, %YMMZERO, %k2
> -	kmovd	%k2, %edx
> -	VMOVU	%YMM4, (%rdi)
> -	add	$VEC_SIZE, %rcx
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -	test	%edx, %edx
> -#  ifndef USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec5)
> -#  else
> -	jnz	L(CopyVecSize)
>  #  endif
>  
> -	vpcmpb	$0, %YMM6, %YMMZERO, %k3
> -	kmovd	%k3, %edx
> -	VMOVU	%YMM5, VEC_SIZE(%rdi)
> -	add	$VEC_SIZE, %rcx
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -	test	%edx, %edx
> -#  ifndef USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec6)
> -#  else
> -	jnz	L(CopyVecSize)
> -#  endif
> -
> -	vpcmpb	$0, %YMM7, %YMMZERO, %k4
> -	kmovd	%k4, %edx
> -	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
> -	lea	VEC_SIZE(%rdi, %rcx), %rdi
> -	lea	VEC_SIZE(%rsi, %rcx), %rsi
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -L(StrncpyExit):
> -	cmp	$65, %r8d
> -	je	L(StrncpyExit65)
> -	cmp	$33, %r8d
> -	jae	L(StrncpyExit33_64)
> -	cmp	$17, %r8d
> -	jae	L(StrncpyExit17_32)
> -	cmp	$9, %r8d
> -	jae	L(StrncpyExit9_16)
> -	cmp	$5, %r8d
> -	jae	L(StrncpyExit5_8)
> -	cmp	$3, %r8d
> -	jae	L(StrncpyExit3_4)
> -	cmp	$1, %r8d
> -	ja	L(StrncpyExit2)
> -	je	L(StrncpyExit1)
> -#  ifdef USE_AS_STPCPY
> -	mov	%rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi)
> -#  endif
> +	.p2align 4,, 4
> +L(page_cross_copy_16_31):
> +	vmovdqu	(%rsi), %xmm0
> +	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +	vmovdqu	%xmm0, (%rdi)
> +	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
>  	ret
>  
> -	.p2align 4
> -L(ExitZero):
> -#  ifndef USE_AS_STRCAT
> -	mov	%rdi, %rax
> -#  endif
> +	.p2align 4,, 4
> +L(page_cross_copy_8_15):
> +	movq	(%rsi), %rcx
> +	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> +	movq	%rcx, (%rdi)
> +	movq	%rsi, -(8 - CHAR_SIZE)(%END_REG)
>  	ret
> -
> -# endif
> -
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# else
> -END (STRCAT)
>  # endif
> +END(STRCPY)
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
> index 203a19bf21..bced4e8944 100644
> --- a/sysdeps/x86_64/multiarch/strncat-evex.S
> +++ b/sysdeps/x86_64/multiarch/strncat-evex.S
> @@ -1,7 +1,520 @@
> -#ifndef STRNCAT
> -# define STRNCAT	__strncat_evex
> -#endif
> +/* {wcs|str}ncat  with 256/512-bit EVEX.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +	/* Use evex-masked stores for small sizes. Turned off at the
> +	   moment.  */
> +# define USE_EVEX_MASKED_STORE	0
> +
> +# include <sysdep.h>
> +
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +# ifndef STRNCAT
> +#  define STRNCAT	__strncat_evex
> +# endif
> +
> +
> +# ifdef USE_AS_WCSCPY
> +#  define MOVCHAR	movl
> +#  define VMOVU_MASK	vmovdqu32
> +#  define VPMIN	vpminud
> +#  define VPTESTN	vptestnmd
> +#  define VPTEST	vptestmd
> +#  define VPCMPEQ	vpcmpeqd
> +#  define CHAR_SIZE	4
> +
> +#  define REP_MOVS	rep movsd
> +
> +#  define VMASK_REG	VR10
> +#  define FIND_FIRST_ONE(src, dst)	movl $CHAR_PER_VEC, %dst; bsf %src, %dst
> +
> +#  define USE_WIDE_CHAR
> +# else
> +#  define MOVCHAR	movb
> +#  define VMOVU_MASK	vmovdqu8
> +#  define VPMIN	vpminub
> +#  define VPTESTN	vptestnmb
> +#  define VPTEST	vptestmb
> +#  define VPCMPEQ	vpcmpeqb
> +#  define CHAR_SIZE	1
> +
> +#  define REP_MOVS	rep movsb
> +
> +#  define VMASK_REG	VRCX
> +#  define FIND_FIRST_ONE(src, dst)	tzcnt %src, %dst
> +
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# include "reg-macros.h"
> +
> +
> +# define VZERO	VMM(7)
> +# define VZERO_128	VMM_128(7)
> +
> +# define PAGE_SIZE	4096
> +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
> +
> +	.section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCAT)
> +	movq	%rdi, %rax
> +
> +	/* NB: It's safe to filter out zero-length strings WITHOUT
> +	   setting null-term. Destination MUST be a null-terminated
> +	   string so essentially the work is already done.  */
> +# ifdef USE_AS_WCSCPY
> +	leaq	-1(%rdx), %rcx
> +	shrq	$56, %rcx
> +	jnz	L(zero_len)
> +# else
> +	test	%rdx, %rdx
> +	jle	L(zero_len)
> +# endif
> +
> +# include "strcat-strlen-evex.h.S"
> +
> +	movl	%esi, %ecx
> +	andl	$(PAGE_SIZE - 1), %ecx
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
> +	ja	L(page_cross)
> +L(page_cross_continue):
> +	VMOVU	(%rsi), %VMM(0)
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +
> +	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
> +	   length <= CHAR_PER_VEC with masked instructions (which have
> +	   the potential for dramatically bad perf if dst splits a page
> +	   and is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +	KMOV	%k0, %VRCX
> +	FIND_FIRST_ONE (VRCX, VR8)
> +	cmpq	%r8, %rdx
> +	jbe	L(less_1x_vec)
> +
> +	test	%VRCX, %VRCX
> +	jz	L(more_1x_vec)
> +
> +	blsmsk	%VRCX, %VRCX
> +	KMOV	%VRCX, %k1
> +	VMOVU_MASK %VMM(0), (%rdi){%k1}
> +	ret
> +
> +L(less_1x_vec):
> +	mov	$-1, %VRCX
> +	bzhi	%VRDX, %VRCX, %VRCX
> +	KMOV	%VRCX, %k1
> +	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
> +	VMOVU_MASK %VMM(0), (%rdi){%k1}
> +
> +	ret
> +# else
> +	KMOV	%k0, %VMASK_REG
> +	/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
> +	   %VMASK_REG, %VRCX` for wcsncat.  */
> +	FIND_FIRST_ONE (VMASK_REG, VRCX)
> +	cmpq	%rcx, %rdx
> +	jbe	L(less_1x_vec)
> +
> +	/* If there were no zero-CHARs (rcx was zero before
> +	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +	cmpl	$CHAR_PER_VEC, %ecx
> +	je	L(more_1x_vec)
> +
> +	movl	%ecx, %edx
> +
> +L(less_1x_vec):
> +#  if VEC_SIZE == 64
> +	cmpl	$(32 / CHAR_SIZE), %edx
> +	jae	L(copy_32_63)
> +#  endif
> +
> +	cmpl	$(16 / CHAR_SIZE), %edx
> +	jae	L(copy_16_31)
> +
> +
> +	cmpl	$(8 / CHAR_SIZE), %edx
> +	jae	L(copy_8_15)
> +
> +#  ifdef USE_AS_WCSCPY
> +	vmovd	%VMM_128(0), (%rdi)
> +	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  else
> +
> +	cmpl	$4, %edx
> +	jae	L(copy_4_7)
> +
> +	movzbl	(%rsi), %ecx
> +	cmpl	$1, %edx
> +	jbe	L(set_null_term)
> +
> +	movzwl	1(%rsi), %esi
> +	movw	%si, 1(%rdi)
> +
> +	.p2align 4,, 1
> +L(set_null_term):
> +	movb	%cl, (%rdi)
> +	MOVCHAR	$0, (%rdi, %rdx)
> +	ret
> +#  endif
> +
> +#  if VEC_SIZE == 64
> +	.p2align 4,, 6
> +L(copy_32_63):
> +	VMOVU	-(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +	VMOVU	%VMM_256(0), (%rdi)
> +	VMOVU	%VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
> +	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  endif
> +	.p2align 4,, 6
> +L(copy_16_31):
> +	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
> +	   and will save code size.  */
> +	vmovdqu	-(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +	VMOVU	%VMM_128(0), (%rdi)
> +	vmovdqu	%xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
> +	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +
> +	.p2align 4,, 2
> +L(copy_8_15):
> +	movq	-(8)(%rsi, %rdx, CHAR_SIZE), %rcx
> +	vmovq	%VMM_128(0), (%rdi)
> +	movq	%rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
> +	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +
> +#  ifndef USE_AS_WCSCPY
> +	.p2align 4,, 12
> +L(copy_4_7):
> +	movl	-(4)(%rsi, %rdx, CHAR_SIZE), %ecx
> +	vmovd	%VMM_128(0), (%rdi)
> +	movl	%ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
> +	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  endif
> +
> +# endif
> +	.p2align 4,, 4
> +L(zero_len):
> +# ifdef USE_AS_WCSCPY
> +	test	%rdx, %rdx
> +# endif
> +	jne	OVERFLOW_STRCAT
> +	ret
>  
> -#define USE_AS_STRNCAT
> -#define STRCAT	STRNCAT
> -#include "strcat-evex.S"
> +	.p2align 4,, 8
> +L(more_1x_vec):
> +	VMOVU	%VMM(0), (%rdi)
> +
> +	/* We are going to align rsi here so we will need to be able
> +	   to re-adjust rdi/rdx afterwards.  NB: We filtered out huge
> +	   lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
> +
> +	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> +	subq	%rsi, %rdi
> +	andq	$-(VEC_SIZE), %rsi
> +L(loop_last_4x_vec):
> +	addq	%rsi, %rdi
> +	subq	%rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +	shrq	$2, %rdx
> +# endif
> +
> +	/* Will need this regardless.  */
> +	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VMASK_REG
> +
> +	cmpq	$(CHAR_PER_VEC * 2), %rdx
> +	ja	L(more_2x_vec)
> +
> +L(last_2x_vec):
> +	FIND_FIRST_ONE (VMASK_REG, VRCX)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x1_len)
> +
> +	/* If there were no zero-CHARs (rcx was zero before
> +	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +	cmpl	$CHAR_PER_VEC, %ecx
> +	jne	L(ret_vec_x1)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	addl	$-CHAR_PER_VEC, %edx
> +	bzhi	%VRDX, %VRCX, %VR8
> +	jz	L(ret_vec_x2_len)
> +L(ret_vec_x2):
> +	bsf	%VRCX, %VRDX
> +L(ret_vec_x2_len):
> +	VMOVU	(VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	MOVCHAR	$0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> +	VMOVU	%VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +	ret
> +
> +	.p2align 4,, 4
> +L(ret_vec_x1_len):
> +	movl	%edx, %ecx
> +L(ret_vec_x1):
> +	VMOVU	(VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	MOVCHAR	$0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
> +	VMOVU	%VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +	VZEROUPPER_RETURN
> +
> +
> +	.p2align 4,, 8
> +L(last_4x_vec):
> +	addl	$-(CHAR_PER_VEC * 4), %edx
> +	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VMASK_REG
> +	subq	$-(VEC_SIZE * 4), %rsi
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpl	$(CHAR_PER_VEC * 2), %edx
> +	jbe	L(last_2x_vec)
> +	.p2align 4,, 8
> +L(more_2x_vec):
> +# ifdef USE_AS_WCSCPY
> +	xorl	%ecx, %ecx
> +# endif
> +	bsf	%VMASK_REG, %VRCX
> +	jnz	L(ret_vec_x1)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x2)
> +
> +	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
> +	VPTESTN	%VMM(3), %VMM(3), %k0
> +	KMOV	%k0, %VMASK_REG
> +
> +	cmpq	$(CHAR_PER_VEC * 4), %rdx
> +	ja	L(more_4x_vec)
> +
> +	/* Adjust length before going to L(ret_vec_x3_len) or
> +	   L(ret_vec_x3).  */
> +	addl	$(CHAR_PER_VEC * -2), %edx
> +
> +	FIND_FIRST_ONE (VMASK_REG, VRCX)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x3_len)
> +
> +	/* If there were no zero-CHARs (rcx was zero before
> +	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
> +	cmpl	$CHAR_PER_VEC, %ecx
> +	jne	L(ret_vec_x3)
> +
> +	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	VPTESTN	%VMM(4), %VMM(4), %k0
> +	KMOV	%k0, %VRCX
> +	addl	$-CHAR_PER_VEC, %edx
> +	bzhi	%VRDX, %VRCX, %VR8
> +	jz	L(ret_vec_x4_len)
> +L(ret_vec_x4):
> +	bsf	%VRCX, %VRDX
> +L(ret_vec_x4_len):
> +	VMOVU	(VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	MOVCHAR	$0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
> +	VMOVU	%VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +	ret
> +
> +	.p2align 4,, 4
> +L(ret_vec_x3_len):
> +	movl	%edx, %ecx
> +L(ret_vec_x3):
> +	VMOVU	(VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	MOVCHAR	$0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> +	VMOVU	%VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +	ret
> +
> +	.p2align 4,, 8
> +L(more_4x_vec):
> +# ifdef USE_AS_WCSCPY
> +	xorl	%ecx, %ecx
> +# endif
> +	bsf	%VMASK_REG, %VRCX
> +	jnz	L(ret_vec_x3)
> +
> +	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	VPTESTN	%VMM(4), %VMM(4), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x4)
> +
> +	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
> +
> +	/* Check if we are near the end before aligning.  */
> +	cmpq	$(CHAR_PER_VEC * 8), %rdx
> +	jbe	L(last_4x_vec)
> +
> +
> +	/* Add rsi to rdx (length) before aligning rsi. NB: Since we
> +	   filtered out huge lengths this cannot overflow.  */
> +# ifdef USE_AS_WCSCPY
> +	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
> +# else
> +	addq	%rsi, %rdx
> +# endif
> +
> +	/* Subtract rsi from rdi before aligning (add back will have
> +	   correct rdi for aligned rsi).  */
> +	subq	%rsi, %rdi
> +	subq	$-(VEC_SIZE * 5), %rsi
> +	andq	$(VEC_SIZE * -4), %rsi
> +
> +	/* Load first half of the loop before entry.  */
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +
> +	/* Offset rsi by VEC_SIZE so that we can jump to
> +	   L(loop_last_4x_vec).  */
> +	addq	$-(VEC_SIZE), %rsi
> +	KORTEST	%k2, %k4
> +	jnz	L(loop_4x_done)
> +
> +	/* Store loop end in r9.  */
> +	leaq	-(VEC_SIZE * 5)(%rdx), %r9
> +
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> +
> +	subq	$(VEC_SIZE * -4), %rsi
> +	cmpq	%rsi, %r9
> +	jbe	L(loop_last_4x_vec)
> +
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +	KORTEST	%k2, %k4
> +	jz	L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	/* Restore rdi (dst).  */
> +	addq	%rsi, %rdi
> +
> +	/* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
> +	   test with bsf.  */
> +	bsf	%VRCX, %VRCX
> +	jnz	L(ret_vec_x1)
> +	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
> +
> +	KMOV	%k2, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x2)
> +	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> +
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	bsf	%VRCX, %VRCX
> +	jnz	L(ret_vec_x3)
> +	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> +
> +	KMOV	%k4, %VRCX
> +	bsf	%VRCX, %VRCX
> +	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
> +	ret
> +
> +
> +	.p2align 4,, 4
> +L(page_cross):
> +	movq	%rsi, %r8
> +	andq	$(VEC_SIZE * -1), %r8
> +	VPCMPEQ	(%r8), %VZERO, %k0
> +
> +# ifdef USE_AS_WCSCPY
> +	KMOV	%k0, %VR9
> +	shrl	$2, %ecx
> +	andl	$(CHAR_PER_VEC - 1), %ecx
> +	shrx	%VRCX, %VR9, %VRCX
> +# else
> +	KMOV	%k0, %VRCX
> +	shrx	%VRSI, %VRCX, %VRCX
> +# endif
> +
> +	subl	%esi, %r8d
> +	andl	$(VEC_SIZE - 1), %r8d
> +# ifdef USE_AS_WCSCPY
> +	shrl	$2, %r8d
> +# endif
> +	cmpq	%r8, %rdx
> +	jbe	L(page_cross_small)
> +	/* Optimizing more for space as this is very cold code. This
> +	   saves 2x cache lines.  */
> +
> +	/* This adds once to the later result which will get correct
> +	   copy bounds. NB: this can never zero-out a non-zero RCX as
> +	   to be in the page cross case rsi cannot be aligned and we
> +	   already right-shift rcx by the misalignment.  */
> +	shl	%VRCX
> +	jz	L(page_cross_continue)
> +	bsf	%VRCX, %VRCX
> +	REP_MOVS
> +	ret
> +
> +L(page_cross_small):
> +	tzcnt	%VRCX, %VRCX
> +	jz	L(page_cross_setz)
> +	cmpl	%edx, %ecx
> +	cmova	%edx, %ecx
> +
> +# ifdef USE_AS_WCSCPY
> +	rep	movsd
> +# else
> +	rep	movsb
> +# endif
> +L(page_cross_setz):
> +	MOVCHAR	$0, (%rdi)
> +	ret
> +END(STRNCAT)
> +#endif
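
(Editor's illustration, not part of the quoted patch: for readers less
familiar with the assembly above, the contract the EVEX strncat code
implements is the standard one. A minimal C sketch; ref_strncat is a
hypothetical name used only for this illustration:)

    #include <string.h>

    /* Append at most n characters of src to dst and always
       null-terminate -- the behaviour the code above implements.  */
    static char *
    ref_strncat (char *dst, const char *src, size_t n)
    {
      size_t dlen = strlen (dst);      /* inline strlen of dst.  */
      size_t slen = strnlen (src, n);  /* bounded scan of src.  */
      memcpy (dst + dlen, src, slen);  /* the vector copy paths.  */
      dst[dlen + slen] = '\0';         /* the MOVCHAR $0 stores.  */
      return dst;
    }
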
> diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
> index 1b3426d511..49eaf4cbd9 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-evex.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
> @@ -1,7 +1,990 @@
> -#ifndef STRNCPY
> -# define STRNCPY	__strncpy_evex
> -#endif
> +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +	/* Use evex-masked stores for small sizes. Turned off at the
> +	   moment.  */
> +# define USE_EVEX_MASKED_STORE	0
> +
> +
> +# include <sysdep.h>
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +
> +# ifndef STRNCPY
> +#  define STRNCPY	__strncpy_evex
> +# endif
> +
> +# ifdef USE_AS_WCSCPY
> +#  define VMOVU_MASK	vmovdqu32
> +#  define VPCMPEQ	vpcmpeqd
> +#  define VPMIN	vpminud
> +#  define VPTESTN	vptestnmd
> +#  define VPTEST	vptestmd
> +#  define CHAR_SIZE	4
> +
> +#  define REP_MOVS	rep movsd
> +#  define REP_STOS	rep stosl
> +
> +#  define USE_WIDE_CHAR
> +
> +# else
> +#  define VMOVU_MASK	vmovdqu8
> +#  define VPCMPEQ	vpcmpeqb
> +#  define VPMIN	vpminub
> +#  define VPTESTN	vptestnmb
> +#  define VPTEST	vptestmb
> +#  define CHAR_SIZE	1
> +
> +#  define REP_MOVS	rep movsb
> +#  define REP_STOS	rep stosb
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# define PAGE_SIZE	4096
> +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
> +
> +# include "reg-macros.h"
> +
> +
> +# define VZERO	VMM(7)
> +# define VZERO_256	VMM_256(7)
> +# define VZERO_128	VMM_128(7)
> +
> +# if VEC_SIZE == 64
> +#  define VZERO_HALF	VZERO_256
> +# else
> +#  define VZERO_HALF	VZERO_128
> +# endif
> +
> +	.section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCPY)
> +	/* Filter zero-length strings and very long strings.  Zero-
> +	   length strings just return.  Very long strings are handled by
> +	   running rep stos{b|l} to zero-set the buffer (which will
> +	   almost certainly segfault); if that succeeds then we just
> +	   call OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
> +# ifdef USE_AS_WCSCPY
> +	decq	%rdx
> +	movq	%rdx, %rax
> +	/* Lengths needing bit 56 or above exceed the maximum supported
> +	   address space.  */
> +	shr	$56, %rax
> +	jnz	L(zero_len)
> +# else
> +	decq	%rdx
> +	/* If the branch below ever needs to become `jb`, replace `dec`
> +	   with `sub` (`dec` does not set the carry flag).  */
> +	jl	L(zero_len)
> +# endif
> +
> +	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
> +	movl	%esi, %eax
> +	andl	$(PAGE_SIZE - 1), %eax
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> +	ja	L(page_cross)
> +
> +L(page_cross_continue):
> +	VMOVU	(%rsi), %VMM(0)
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +
> +	/* If not STPCPY, just save the return value ahead of time.  */
> +# ifndef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# endif
> +
> +
> +	cmpq	$(CHAR_PER_VEC), %rdx
> +
> +	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
> +	   length <= CHAR_PER_VEC with masked instructions (which have
> +	   the potential for dramatically bad perf if dst splits a page
> +	   and is not in the TLB).  */
> +# if USE_EVEX_MASKED_STORE
> +	/* `jae` because length rdx is now length - 1.  */
> +	jae	L(more_1x_vec)
> +
> +	/* If there were multiple zero-CHAR matches in the first VEC,
> +	   VRCX will be overset, but that's fine since any overset bits
> +	   were at zero positions anyway.  */
> +
> +#  ifdef USE_AS_STPCPY
> +	tzcnt	%VRCX, %VRAX
> +	cmpl	%eax, %edx
> +	cmovb	%edx, %eax
> +#   ifdef USE_AS_WCSCPY
> +	adcl	$0, %eax
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +#   else
> +	adcq	%rdi, %rax
> +#   endif
> +#  endif
> +	dec	%VRCX
> +
> +	/* Zero out all non-zero CHARs after the first zero match.  */
> +	KMOV	%VRCX, %k1
> +
> +	/* Use VZERO as the destination so this can be reused for
> +	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
> +	   will have zeroed out VZERO).  */
> +	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
> +L(zfill_less_vec):
> +	/* Get mask for what we need to set.  */
> +	incl	%edx
> +	mov	$-1, %VRCX
> +	bzhi	%VRDX, %VRCX, %VRCX
> +	KMOV	%VRCX, %k1
> +	VMOVU_MASK %VZERO, (%rdi){%k1}
> +	ret
> +
> +	.p2align 4,, 4
> +L(zero_len):
> +	cmpq	$-1, %rdx
> +	jne	L(best_effort_strncpy)
> +	movq	%rdi, %rax
> +	ret
> +
> +	.p2align 4,, 8
> +L(more_1x_vec):
> +# else
> +	/* `jb` because length rdx is now length - 1.  */
> +	jb	L(less_1x_vec)
> +# endif
> +
> +
> +	/* This may store past the needed length, but that's fine
> +	   because we still need to zero fill.  */
> +	VMOVU	%VMM(0), (%rdi)
> +
> +
> +	/* Length must be >= CHAR_PER_VEC, so a match here means we must
> +	   zero-fill.  */
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill)
> +
> +
> +	/* We are going to align rsi here, so we will need to be able to
> +	   readjust rdi/rdx afterwards. NB: We filtered out huge lengths
> +	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
> +	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> +	subq	%rsi, %rdi
> +	andq	$-(VEC_SIZE), %rsi
> +
> +L(loop_last_4x_vec):
> +	addq	%rsi, %rdi
> +	subq	%rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +	shrq	$2, %rdx
> +# endif
> +
> +	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VRCX
> +
> +	/* -1 because of the `dec %rdx` earlier.  */
> +	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
> +	ja	L(more_2x_vec)
> +
> +L(last_2x_vec):
> +	/* This will need to be computed no matter what. We do it
> +	   ahead of time for CHAR_PER_VEC == 64 because we can't adjust
> +	   the value of `tzcnt` with a shift.  */
> +# if CHAR_PER_VEC == 64
> +	tzcntq	%rcx, %rcx
> +# endif
> +
> +	cmpl	$(CHAR_PER_VEC), %edx
> +	jb	L(ret_vec_x1_len)
> +
> +	/* Separate logic for CHAR_PER_VEC == 64 because we already did
> +	   `tzcnt` on VRCX.  */
> +# if CHAR_PER_VEC == 64
> +	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
> +	cmpb	$CHAR_PER_VEC, %cl
> +	jnz	L(ret_vec_x1_no_bsf)
> +# else
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x1)
> +# endif
> +
> +
> +
> +	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %VZERO, %k0
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	KMOV	%k0, %VRCX
> +
> +# if CHAR_PER_VEC < 64
> +	/* This essentially adds CHAR_PER_VEC to the computed result.  */
> +	shlq	$CHAR_PER_VEC, %rcx
> +# else
> +	tzcntq	%rcx, %rcx
> +	addl	$CHAR_PER_VEC, %ecx
> +# endif
> +
> +	.p2align 4,, 4
> +L(ret_vec_x1_len):
> +	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
> +	   already been done.  */
> +# if CHAR_PER_VEC < 64
> +	tzcntq	%rcx, %rcx
> +# endif
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x1_len_no_zfill)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +L(ret_vec_x1_len_no_zfill_mov):
> +	movl	%ecx, %edx
> +# ifdef USE_AS_STPCPY
> +	/* clear flags.  */
> +	xorl	%ecx, %ecx
> +# endif
> +L(ret_vec_x1_len_no_zfill):
> +	VMOVU	((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +	leal	(VEC_SIZE)(%rdx), %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# endif
> +	ret
> +
> +
> +	.p2align 4,, 10
> +L(ret_vec_x1):
> +	bsf	%VRCX, %VRCX
> +L(ret_vec_x1_no_bsf):
> +	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +	subl	%ecx, %edx
> +	cmpl	$CHAR_PER_VEC, %edx
> +	jb	L(ret_vec_x1_len_no_zfill_mov)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	VMOVU	%VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +	leaq	(VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
> +# endif
> +	ret
> +
> +	.p2align 4,, 8
> +L(last_4x_vec):
> +	/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
> +	   $(CHAR_PER_VEC * 4 - 1), %edx` with less code size by just
> +	   using `movzbl`.  */
> +# if CHAR_PER_VEC == 64
> +	movzbl	%dl, %edx
> +# else
> +	andl	$(CHAR_PER_VEC * 4 - 1), %edx
> +# endif
> +	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
> +	KMOV	%k0, %VRCX
> +	subq	$-(VEC_SIZE * 4), %rsi
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpl	$(CHAR_PER_VEC * 2 - 1), %edx
> +	jbe	L(last_2x_vec)
> +	.p2align 4,, 8
> +L(more_2x_vec):
> +	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
> +	test	%VRCX, %VRCX
> +	/* Must fill at least 2x VEC.  */
> +	jnz	L(zfill_vec1)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	/* Must fill at least 1x VEC.  */
> +	jnz	L(zfill_vec2)
> +
> +	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
> +	VPTESTN	%VMM(3), %VMM(3), %k0
> +	KMOV	%k0, %VRCX
> +
> +	/* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
> +	cmpq	$(CHAR_PER_VEC * 4 - 1), %rdx
> +	ja	L(more_4x_vec)
> +
> +	subl	$(CHAR_PER_VEC * 3), %edx
> +	jb	L(ret_vec_x3_len)
> +
> +	test	%VRCX, %VRCX
> +	jnz	L(ret_vec_x3)
> +
> +	VPCMPEQ	(VEC_SIZE * 4)(%rsi), %VZERO, %k0
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	KMOV	%k0, %VRCX
> +	tzcnt	%VRCX, %VRCX
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x4_len_no_zfill)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +	movl	%ecx, %edx
> +L(ret_vec_x4_len_no_zfill):
> +	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +	leal	(VEC_SIZE * 4 + 0)(%rdx), %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# endif
> +	ret
> +
> +
> +L(ret_vec_x3_len):
> +	addl	$(CHAR_PER_VEC * 1), %edx
> +	tzcnt	%VRCX, %VRCX
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x3_len_no_zfill)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +L(ret_vec_x3_len_no_zfill_mov):
> +	movl	%ecx, %edx
> +# ifdef USE_AS_STPCPY
> +	/* clear flags.  */
> +	xorl	%ecx, %ecx
> +# endif
> +	.p2align 4,, 4
> +L(ret_vec_x3_len_no_zfill):
> +	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
> +	VMOVU	%VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +	leal	(VEC_SIZE * 3 + 0)(%rdx), %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# endif
> +	ret
> +
> +
> +	.p2align 4,, 8
> +L(ret_vec_x3):
> +	bsf	%VRCX, %VRCX
> +	VMOVU	%VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
> +	subl	%ecx, %edx
> +	jl	L(ret_vec_x3_len_no_zfill_mov)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
> +# ifdef USE_AS_STPCPY
> +	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> +# endif
> +	ret
> +
> +	.p2align 4,, 8
> +L(more_4x_vec):
> +	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec3)
> +
> +	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
> +	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
> +	VPTESTN	%VMM(4), %VMM(4), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec4)
>  
> -#define USE_AS_STRNCPY
> -#define STRCPY	STRNCPY
> -#include "strcpy-evex.S"
> +	/* Recheck length before aligning.  */
> +	cmpq	$(CHAR_PER_VEC * 8 - 1), %rdx
> +	jbe	L(last_4x_vec)
> +
> +	/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
> +# ifdef USE_AS_WCSCPY
> +	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
> +# else
> +	addq	%rsi, %rdx
> +# endif
> +	subq	%rsi, %rdi
> +	subq	$-(VEC_SIZE * 5), %rsi
> +	andq	$(VEC_SIZE * -4), %rsi
> +
> +
> +	/* Load first half of the loop before entry.  */
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +
> +
> +	/* Offset rsi by VEC_SIZE so that we can jump to
> +	   L(loop_last_4x_vec).  */
> +	addq	$-(VEC_SIZE), %rsi
> +	KORTEST	%k2, %k4
> +	jnz	L(loop_4x_done)
> +
> +	/* Store loop end in r9.  */
> +	leaq	-(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
> +
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
> +	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
> +
> +	subq	$(VEC_SIZE * -4), %rsi
> +	cmpq	%rsi, %r9
> +	jbe	L(loop_last_4x_vec)
> +
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPTESTN	%VMM(4), %VMM(4), %k2
> +	VPTESTN	%VMM(6), %VMM(6), %k4
> +	KORTEST	%k2, %k4
> +	jz	L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +	/* Restore rdx (length).  */
> +	subq	%rsi, %rdx
> +# ifdef USE_AS_WCSCPY
> +	shrq	$2, %rdx
> +# endif
> +	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
> +	/* Restore rdi (dst).  */
> +	addq	%rsi, %rdi
> +	VPTESTN	%VMM(0), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec1)
> +
> +	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
> +	KMOV	%k2, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec2)
> +
> +	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
> +	jnz	L(zfill_vec3)
> +
> +	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
> +	KMOV	%k4, %VRCX
> +	/* Fall through to zero-fill the rest.  */
> +
> +	.p2align 4,, 4
> +L(zfill_vec4):
> +	subq	$(VEC_SIZE * -2), %rdi
> +	addq	$(CHAR_PER_VEC * -2), %rdx
> +L(zfill_vec2):
> +	subq	$(VEC_SIZE * -2), %rdi
> +	addq	$(CHAR_PER_VEC * -1), %rdx
> +L(zfill):
> +	/* VRCX must be non-zero.  */
> +	bsf	%VRCX, %VRCX
> +
> +	/* Adjust length / dst for zfill.  */
> +	subq	%rcx, %rdx
> +# ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +# else
> +	addq	%rcx, %rdi
> +# endif
> +# ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# endif
> +L(zfill_from_page_cross):
> +
> +	/* From here on out it's just memset(rdi, 0, rdx).  */
> +	cmpq	$CHAR_PER_VEC, %rdx
> +	jb	L(zfill_less_vec)
> +
> +L(zfill_more_1x_vec):
> +	VMOVU	%VZERO, (%rdi)
> +	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
> +	ja	L(zfill_more_2x_vec)
> +L(zfill_done0):
> +	ret
> +
> +	/* Coming from vec1/vec2 we must be able to zfill at least 2x
> +	   VEC.  */
> +	.p2align 4,, 8
> +L(zfill_vec3):
> +	subq	$(VEC_SIZE * -2), %rdi
> +	addq	$(CHAR_PER_VEC * -2), %rdx
> +	.p2align 4,, 2
> +L(zfill_vec1):
> +	bsfq	%rcx, %rcx
> +	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
> +	 */
> +	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> +	subq	%rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# endif
> +
> +
> +	VMOVU	%VZERO, (%rdi)
> +	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	cmpq	$(CHAR_PER_VEC * 2), %rdx
> +	jb	L(zfill_done0)
> +L(zfill_more_2x_vec):
> +	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
> +	VMOVU	%VZERO, (VEC_SIZE)(%rdi)
> +	subq	$(CHAR_PER_VEC * 4 - 1), %rdx
> +	jbe	L(zfill_done)
> +
> +# ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rdx
> +# else
> +	addq	%rdi, %rdx
> +# endif
> +
> +	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi)
> +	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi)
> +
> +
> +	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> +	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> +
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpq	%rdi, %rdx
> +	jbe	L(zfill_done)
> +
> +	/* Align rdi and zfill loop.  */
> +	andq	$-(VEC_SIZE), %rdi
> +	.p2align 4,, 12
> +L(zfill_loop_4x_vec):
> +	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
> +	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
> +	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
> +	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpq	%rdi, %rdx
> +	ja	L(zfill_loop_4x_vec)
> +L(zfill_done):
> +	ret
> +
> +
> +	/* Less than 1x VEC case if we are not using evex masked stores.  */
> +# if !USE_EVEX_MASKED_STORE
> +	.p2align 4,, 8
> +L(copy_1x):
> +	/* Special case for copy 1x. It can be handled quickly and many
> +	   buffer sizes have convenient alignment.  */
> +	VMOVU	%VMM(0), (%rdi)
> +	/* If no zeros then we are done.  */
> +	testl	%ecx, %ecx
> +	jz	L(ret_1x_1x)
> +
> +	/* Need to zfill. Now we know that length <= CHAR_PER_VEC so we
> +	   only handle the small case here.  */
> +	bsf	%VRCX, %VRCX
> +L(zfill_less_vec_no_bsf):
> +	/* Adjust length / dst then just zfill less_vec.  */
> +	subq	%rcx, %rdx
> +#  ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +#  else
> +	addq	%rcx, %rdi
> +#  endif
> +#  ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#  endif
> +
> +L(zfill_less_vec):
> +	cmpl	$((VEC_SIZE / 2) / CHAR_SIZE), %edx
> +	jb	L(zfill_less_half)
> +
> +	VMOVU	%VZERO_HALF, (%rdi)
> +	VMOVU	%VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  ifdef USE_AS_STPCPY
> +L(ret_1x_1x):
> +	leaq	CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
> +	ret
> +#  endif
> +
> +
> +#  if VEC_SIZE == 64
> +	.p2align 4,, 4
> +L(copy_32_63):
> +	/* Overfill to avoid branches.  */
> +	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
> +	VMOVU	%VMM_256(0), (%rdi)
> +	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +
> +	/* We are taking advantage of the fact that to get here we must
> +	   be writing the null terminator at (%rdi, %rcx), so we have a
> +	   byte of leeway for overwriting.  */
> +	cmpl	%ecx, %edx
> +	ja	L(zfill_less_vec_no_bsf)
> +#   ifndef USE_AS_STPCPY
> +L(ret_1x_1x):
> +#   else
> +#    ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#    endif
> +#   endif
> +	ret
> +#  endif
> +
> +	.p2align 4,, 4
> +L(copy_16_31):
> +	/* Overfill to avoid branches.  */
> +	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
> +	VMOVU	%VMM_128(0), (%rdi)
> +	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	cmpl	%ecx, %edx
> +
> +	/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
> +	   we have a larger copy block for 32-63 so this just falls
> +	   through to zfill 16-31. If VEC_SIZE == 32 then we check for
> +	   a full zfill of less than 1x VEC.  */
> +#  if VEC_SIZE == 64
> +	jbe	L(ret_16_31)
> +	subl	%ecx, %edx
> +#   ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +#   else
> +	addq	%rcx, %rdi
> +#   endif
> +#   ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#   endif
> +L(zfill_less_half):
> +L(zfill_less_32):
> +	cmpl	$(16 / CHAR_SIZE), %edx
> +	jb	L(zfill_less_16)
> +	VMOVU	%VZERO_128, (%rdi)
> +	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +#   ifdef USE_AS_STPCPY
> +	ret
> +#   endif
> +L(ret_16_31):
> +#   ifdef USE_AS_STPCPY
> +#    ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#    endif
> +#   endif
> +	ret
> +#  else
> +	/* VEC_SIZE == 32 begins.  */
> +	ja	L(zfill_less_vec_no_bsf)
> +#   ifndef USE_AS_STPCPY
> +L(ret_1x_1x):
> +#   else
> +#    ifdef USE_AS_WCSCPY
> +	adcq	$0, %rdx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#    else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#    endif
> +#   endif
> +	ret
> +#  endif
> +
> +
> +	.p2align 4,, 4
> +L(copy_8_15):
> +	/* Overfill to avoid branches.  */
> +	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
> +	vmovq	%VMM_128(0), (%rdi)
> +	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_8_15)
> +	subl	%ecx, %edx
> +#  ifdef USE_AS_WCSCPY
> +	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
> +#  else
> +	addq	%rcx, %rdi
> +#  endif
> +#  ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#  endif
> +	.p2align 4,, 8
> +#  if VEC_SIZE == 32
> +L(zfill_less_half):
> +#  endif
> +L(zfill_less_16):
> +	xorl	%ecx, %ecx
> +	cmpl	$(8 / CHAR_SIZE), %edx
> +	jb	L(zfill_less_8)
> +	movq	%rcx, (%rdi)
> +	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
> +#  ifndef USE_AS_STPCPY
> +L(ret_8_15):
> +#  endif
> +	ret
> +
> +	.p2align 4,, 8
> +L(less_1x_vec):
> +	je	L(copy_1x)
> +
> +	/* We will need `tzcnt` result for all other copy sizes.  */
> +	tzcnt	%VRCX, %VRCX
> +#  if VEC_SIZE == 64
> +	cmpl	$(32 / CHAR_SIZE), %edx
> +	jae	L(copy_32_63)
> +#  endif
> +
> +	cmpl	$(16 / CHAR_SIZE), %edx
> +	jae	L(copy_16_31)
> +
> +	cmpl	$(8 / CHAR_SIZE), %edx
> +	jae	L(copy_8_15)
> +#  ifdef USE_AS_WCSCPY
> +	testl	%ecx, %ecx
> +	jz	L(zfill_less_8_set_ret)
> +
> +	movl	(%rsi, %rdx, CHAR_SIZE), %esi
> +	vmovd	%VMM_128(0), (%rdi)
> +	movl	%esi, (%rdi, %rdx, CHAR_SIZE)
> +#   ifdef USE_AS_STPCPY
> +	cmpl	%ecx, %edx
> +L(ret_8_15):
> +	adcq	$0, %rdx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#   endif
> +	ret
> +L(zfill_less_8_set_ret):
> +	xorl	%ecx, %ecx
> +#   ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#   endif
> +L(zfill_less_8):
> +	movl	%ecx, (%rdi)
> +	movl	%ecx, (%rdi, %rdx, CHAR_SIZE)
> +	ret
> +#  else
> +	cmpl	$3, %edx
> +	jb	L(copy_0_3)
> +	/* Overfill to avoid branches.  */
> +	movl	-3(%rsi, %rdx), %esi
> +	vmovd	%VMM_128(0), (%rdi)
> +	movl	%esi, -3(%rdi, %rdx)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_4_7)
> +	subq	%rcx, %rdx
> +	addq	%rcx, %rdi
> +#   ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#   endif
> +	xorl	%ecx, %ecx
> +	.p2align 4,, 8
> +L(zfill_less_8):
> +	cmpl	$3, %edx
> +	jb	L(zfill_less_3)
> +	movl	%ecx, (%rdi)
> +	movl	%ecx, -3(%rdi, %rdx)
> +#   ifdef USE_AS_STPCPY
> +	ret
> +#   endif
> +
> +L(ret_4_7):
> +#   ifdef USE_AS_STPCPY
> +L(ret_8_15):
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#   endif
> +	ret
> +
> +	.p2align 4,, 4
> +L(zfill_less_3):
> +	testl	%edx, %edx
> +	jz	L(zfill_1)
> +	movw	%cx, (%rdi)
> +L(zfill_1):
> +	movb	%cl, (%rdi, %rdx)
> +	ret
> +
> +	.p2align 4,, 8
> +L(copy_0_3):
> +	vmovd	%VMM_128(0), %r8d
> +	testl	%edx, %edx
> +	jz	L(copy_1)
> +	movw	%r8w, (%rdi)
> +	cmpl	%ecx, %edx
> +	ja	L(zfill_from_1)
> +	movzbl	(%rsi, %rdx), %r8d
> +#   ifdef USE_AS_STPCPY
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +	movb	%r8b, (%rdi, %rdx)
> +	ret
> +#   endif
> +
> +L(copy_1):
> +#   ifdef USE_AS_STPCPY
> +	movl	%edx, %eax
> +	cmpl	%ecx, %edx
> +	adcq	%rdi, %rax
> +#   endif
> +#   ifdef USE_AS_WCSCPY
> +	vmovd	%VMM_128(0), (%rdi)
> +#   else
> +	movb	%r8b, (%rdi, %rdx)
> +#   endif
> +	ret
> +#  endif
> +
> +
> +#  ifndef USE_AS_WCSCPY
> +	.p2align 4,, 8
> +L(zfill_from_1):
> +#   ifdef USE_AS_STPCPY
> +	leaq	(%rdi, %rcx), %rax
> +#   endif
> +	movw	$0, -1(%rdi, %rdx)
> +	ret
> +#  endif
> +
> +	.p2align 4,, 4
> +L(zero_len):
> +	incq	%rdx
> +	jne	L(best_effort_strncpy)
> +	movq	%rdi, %rax
> +	ret
> +# endif
> +
> +
> +	.p2align 4,, 4
> +	.p2align 6,, 8
> +L(page_cross):
> +	movq	%rsi, %rax
> +	andq	$(VEC_SIZE * -1), %rax
> +	VPCMPEQ	(%rax), %VZERO, %k0
> +	KMOV	%k0, %VRCX
> +# ifdef USE_AS_WCSCPY
> +	movl	%esi, %r8d
> +	shrl	$2, %r8d
> +	andl	$(CHAR_PER_VEC - 1), %r8d
> +	shrx	%VR8, %VRCX, %VRCX
> +# else
> +	shrx	%VRSI, %VRCX, %VRCX
> +# endif
> +
> +	/* Compute the number of bytes we checked.  */
> +	subl	%esi, %eax
> +	andl	$(VEC_SIZE - 1), %eax
> +# ifdef USE_AS_WCSCPY
> +	shrl	$2, %eax
> +# endif
> +
> +	/* If rax > rdx then we are finishing the copy at the end of the
> +	   page.  */
> +	cmpq	%rax, %rdx
> +	jb	L(page_cross_small)
> +
> +
> +	/* If rcx is zero then no zero-CHAR was found, so jump back to
> +	   the main copy path.  */
> +	test	%VRCX, %VRCX
> +	jz	L(page_cross_continue)
> +
> +	/* We found a zero-CHAR, so we need to copy then zfill (we know
> +	   we didn't cover the full length here).  */
> +	bsf	%VRCX, %VRCX
> +L(movsb_and_zfill):
> +	incl	%ecx
> +	subq	%rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +	leaq	-CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
> +# else
> +	movq	%rdi, %rax
> +# endif
> +
> +	REP_MOVS
> +# ifdef USE_AS_WCSCPY
> +	movl	$0, (%rdi)
> +# else
> +	movb	$0, (%rdi)
> +# endif
> +	jmp	L(zfill_from_page_cross)
> +
> +L(page_cross_small):
> +	tzcnt	%VRCX, %VRCX
> +	cmpl	%ecx, %edx
> +	jbe	L(page_cross_copy_only)
> +
> +	/* Do a zfill of the tail before copying.  */
> +	movq	%rdi, %r9
> +	xorl	%eax, %eax
> +
> +	movl	%ecx, %r8d
> +
> +	subl	%ecx, %edx
> +	leaq	CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
> +	movl	%edx, %ecx
> +	REP_STOS
> +	movq	%r9, %rdi
> +	movl	%r8d, %edx
> +L(page_cross_copy_only):
> +	leal	1(%rdx), %ecx
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	adcl	$0, %edx
> +	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
> +#  else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# else
> +	movq	%rdi, %rax
> +# endif
> +	REP_MOVS
> +	ret
> +
> +
> +L(best_effort_strncpy):
> +	movq	%rdx, %rcx
> +	xorl	%eax, %eax
> +	movq	%rdi, %r8
> +	/* The length is >= 2^63. We fully expect to segfault at
> +	   rep stos. If that doesn't happen then just strcpy to
> +	   finish.  */
> +	REP_STOS
> +	movq	%r8, %rdi
> +	jmp	OVERFLOW_STRCPY
> +END(STRNCPY)
> +#endif
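
(Editor's illustration, not part of the quoted patch: the L(zero_len) /
L(best_effort_strncpy) path above boils down to roughly the following
C-level behaviour for lengths too large to be a real object, i.e.
>= 2^63 in the byte version; best_effort_strncpy is a hypothetical name
mirroring the label:)

    #include <string.h>

    /* Zero the (impossibly large) destination first -- the rep stos
       is expected to fault exactly where a real strncpy would -- and,
       if it somehow succeeds, finish with the plain strcpy fallback
       (OVERFLOW_STRCPY).  */
    static char *
    best_effort_strncpy (char *dst, const char *src, size_t n)
    {
      memset (dst, 0, n);      /* REP_STOS in the assembly.  */
      strcpy (dst, src);       /* jmp OVERFLOW_STRCPY.  */
      return dst;
    }
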
> diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> new file mode 100644
> index 0000000000..d4f4d6c82b
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
> @@ -0,0 +1,80 @@
> +/* Helper for getting the proper name of the overflow fallback
> +   function for {wc|st}{p|r|s}n{cat|cpy}.
> +
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
> +#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
> +
> +#if defined USE_MULTIARCH && IS_IN(libc)
> +#  define UNDERSCORES __
> +#  ifdef USE_WITH_SSE2
> +#    define ISA_EXT _sse2
> +#  elif defined USE_WITH_AVX2
> +#    ifdef USE_WITH_RTM
> +#      define ISA_EXT _avx2_rtm
> +#    else
> +#      define ISA_EXT _avx2
> +#    endif
> +
> +#  elif defined USE_WITH_EVEX256
> +#    define ISA_EXT _evex
> +#  elif defined USE_WITH_EVEX512
> +#    define ISA_EXT _evex512
> +#  endif
> +#else
> +#  define UNDERSCORES
> +#  define ISA_EXT
> +#endif
> +
> +#ifdef USE_AS_WCSCPY
> +#  define STRCPY_PREFIX wc
> +#  define STRCAT_PREFIX wcs
> +#  ifdef USE_AS_STPCPY
> +#    define STRCPY_POSTFIX pcpy
> +#  else
> +#    define STRCPY_POSTFIX scpy
> +#  endif
> +#else
> +#  define STRCPY_PREFIX st
> +#  define STRCAT_PREFIX str
> +#  ifdef USE_AS_STPCPY
> +#    define STRCPY_POSTFIX pcpy
> +#  else
> +#    define STRCPY_POSTFIX rcpy
> +#  endif
> +#endif
> +#define STRCAT_POSTFIX cat
> +
> +#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext)                 \
> +  underscores##prefix##postfix##ext
> +
> +#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
> +
> +#ifndef OVERFLOW_STRCPY
> +#  define OVERFLOW_STRCPY                                                     \
> +    OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
> +#endif
> +
> +#ifndef OVERFLOW_STRCAT
> +#  define OVERFLOW_STRCAT                                                     \
> +    OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
> +#endif
> +
> +#endif
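
(Editor's note, not part of the quoted patch: as a concrete example of
what this header produces -- assuming x86-evex256-vecs.h defines
USE_WITH_EVEX256 and the file is built with USE_MULTIARCH inside libc --
the strncpy-evex.S build above expands roughly as follows:)

    /* strncpy-evex.S (narrow chars, no USE_AS_STPCPY):
         UNDERSCORES      -> __
         ISA_EXT          -> _evex
         STRCPY_PREFIX    -> st     STRCPY_POSTFIX -> rcpy
         STRCAT_PREFIX    -> str    STRCAT_POSTFIX -> cat
         OVERFLOW_STRCPY  -> OF_NAMER (__, st, rcpy, _evex)
                          -> __strcpy_evex
         OVERFLOW_STRCAT  -> OF_NAMER (__, str, cat, _evex)
                          -> __strcat_evex  */
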
> -- 
> 2.34.1
> 

LGTM.

Thanks.

H.J.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v5 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions
  2022-11-09  1:38     ` [PATCH v5 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
@ 2022-11-09  3:00       ` H.J. Lu
  0 siblings, 0 replies; 42+ messages in thread
From: H.J. Lu @ 2022-11-09  3:00 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Tue, Nov 08, 2022 at 05:38:39PM -0800, Noah Goldstein wrote:
> Optimizations are:
>     1. Use more overlapping stores to avoid branches (see the sketch
>        after this list).
>     2. Reduce how unrolled the aligning copies are (this is more of a
>        code-size save, it's a negative for some sizes in terms of
>        perf).
>     3. For st{r|p}n{cat|cpy} re-order the branches to minimize the
>        number that are taken.
> 
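
(Editor's illustration, not part of the quoted patch: optimization (1)
is the classic overlapping-store trick. A minimal C sketch of the idea
using AVX2 intrinsics, for a caller-guaranteed 32 <= n <= 64; compare
the L(copy_16_31)-style blocks later in this patch:)

    #include <immintrin.h>
    #include <stddef.h>

    /* Copy n bytes (32 <= n <= 64) with two possibly-overlapping
       unaligned 32-byte stores instead of branching on the exact
       length.  */
    static void
    copy_32_to_64 (char *dst, const char *src, size_t n)
    {
      __m256i head = _mm256_loadu_si256 ((const __m256i *) src);
      __m256i tail = _mm256_loadu_si256 ((const __m256i *) (src + n - 32));
      _mm256_storeu_si256 ((__m256i *) dst, head);
      _mm256_storeu_si256 ((__m256i *) (dst + n - 32), tail);
    }
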
> Performance Changes:
> 
>     Times are from N = 10 runs of the benchmark suite and are
>     reported as geometric mean of all ratios of
>     New Implementation / Old Implementation.
> 
>     strcat-avx2      -> 0.998
>     strcpy-avx2      -> 0.937
>     stpcpy-avx2      -> 0.971
> 
>     strncpy-avx2     -> 0.793
>     stpncpy-avx2     -> 0.775
> 
>     strncat-avx2     -> 0.962
> 
> Code Size Changes:
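
(Editor's note, not part of the quoted patch: the ratios above are
geometric means of new/old timings, so values below 1.0 mean the new
code is faster, e.g. 0.775 for stpncpy-avx2 is about a 22% reduction in
time. A small C sketch of how such a summary can be computed, assuming
paired timing arrays:)

    #include <math.h>
    #include <stddef.h>

    /* Geometric mean of new/old timing ratios across n benchmark
       points.  */
    static double
    geomean_ratio (const double *new_t, const double *old_t, size_t n)
    {
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (new_t[i] / old_t[i]);
      return exp (log_sum / (double) n);
    }
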
>     function         -> Bytes New / Bytes Old -> Ratio
> 
>     strcat-avx2      ->  685 / 1639 -> 0.418
>     strcpy-avx2      ->  560 /  903 -> 0.620
>     stpcpy-avx2      ->  592 /  939 -> 0.630
> 
>     strncpy-avx2     -> 1176 / 2390 -> 0.492
>     stpncpy-avx2     -> 1268 / 2438 -> 0.520
> 
>     strncat-avx2     -> 1042 / 2563 -> 0.407
> 
> Notes:
>     1. Because of the significant difference between the
>        implementations they are split into three files.
> 
>            strcpy-avx2.S    -> strcpy, stpcpy, strcat
>            strncpy-avx2.S   -> strncpy
>            strncat-avx2.S   -> strncat
> 
>        I couldn't find a way to merge them without making the
>        ifdefs incredibly difficult to follow.
> 
> Full check passes on x86-64 and build succeeds for all ISA levels w/
> and w/o multiarch.
> ---
>  sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S    |    6 +-
>  sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S   |    7 +-
>  sysdeps/x86_64/multiarch/stpncpy-avx2.S       |    5 +-
>  sysdeps/x86_64/multiarch/strcat-avx2-rtm.S    |   13 +-
>  sysdeps/x86_64/multiarch/strcat-avx2.S        |  268 +---
>  .../x86_64/multiarch/strcat-strlen-avx2.h.S   |  101 ++
>  sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S    |   13 +-
>  sysdeps/x86_64/multiarch/strcpy-avx2.S        | 1236 +++++------------
>  sysdeps/x86_64/multiarch/strncat-avx2-rtm.S   |    6 +-
>  sysdeps/x86_64/multiarch/strncat-avx2.S       |  424 +++++-
>  sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S   |    6 +-
>  sysdeps/x86_64/multiarch/strncpy-avx2.S       |  740 +++++++++-
>  sysdeps/x86_64/multiarch/x86-avx-vecs.h       |    3 +-
>  13 files changed, 1594 insertions(+), 1234 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S
> 
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> index 2b9c07a59f..90e532dbe8 100644
> --- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> @@ -1,3 +1,3 @@
> -#define USE_AS_STPCPY
> -#define STRCPY __stpcpy_avx2_rtm
> -#include "strcpy-avx2-rtm.S"
> +#define STPCPY	__stpcpy_avx2_rtm
> +#include "x86-avx-rtm-vecs.h"
> +#include "stpcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> index 60a2ccfe53..46ee07be36 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> @@ -1,4 +1,3 @@
> -#define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY __stpncpy_avx2_rtm
> -#include "strcpy-avx2-rtm.S"
> +#define STPNCPY	__stpncpy_avx2_rtm
> +#include "x86-avx-rtm-vecs.h"
> +#include "stpncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> index b2f8c19143..a46a8edbe2 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> @@ -3,6 +3,5 @@
>  #endif
>  
>  #define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY	STPNCPY
> -#include "strcpy-avx2.S"
> +#define STRNCPY	STPNCPY
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> index 637fb557c4..e84f4f1fef 100644
> --- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> @@ -1,12 +1,3 @@
> -#ifndef STRCAT
> -# define STRCAT __strcat_avx2_rtm
> -#endif
> -
> -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> -  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> -
> -#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
> -
> -#define SECTION(p) p##.avx.rtm
> -
> +#define STRCAT	__strcat_avx2_rtm
> +#include "x86-avx-rtm-vecs.h"
>  #include "strcat-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
> index d9b7fb2a43..3f914fa342 100644
> --- a/sysdeps/x86_64/multiarch/strcat-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
> @@ -16,266 +16,10 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>  
> -#include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (3)
> -
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_avx2
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -/* Number of bytes in a vector register */
> -# define VEC_SIZE	32
> -
> -# ifndef SECTION
> -#  define SECTION(p)	p##.avx
> -# endif
> -
> -	.section SECTION(.text),"ax",@progbits
> -ENTRY (STRCAT)
> -	mov	%rdi, %r9
> -# ifdef USE_AS_STRNCAT
> -	mov	%rdx, %r8
> -# endif
> -
> -	xor	%eax, %eax
> -	mov	%edi, %ecx
> -	and	$((VEC_SIZE * 4) - 1), %ecx
> -	vpxor	%xmm6, %xmm6, %xmm6
> -	cmp	$(VEC_SIZE * 3), %ecx
> -	ja	L(fourth_vector_boundary)
> -	vpcmpeqb (%rdi), %ymm6, %ymm0
> -	vpmovmskb %ymm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_first_vector)
> -	mov	%rdi, %rax
> -	and	$-VEC_SIZE, %rax
> -	jmp	L(align_vec_size_start)
> -L(fourth_vector_boundary):
> -	mov	%rdi, %rax
> -	and	$-VEC_SIZE, %rax
> -	vpcmpeqb	(%rax), %ymm6, %ymm0
> -	mov	$-1, %r10d
> -	sub	%rax, %rcx
> -	shl	%cl, %r10d
> -	vpmovmskb %ymm0, %edx
> -	and	%r10d, %edx
> -	jnz	L(exit)
> -
> -L(align_vec_size_start):
> -	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
> -	vpmovmskb %ymm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -	vpmovmskb %ymm1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -	vpmovmskb %ymm2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -	vpmovmskb %ymm3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> -	add	$(VEC_SIZE * 4), %rax
> -	vpmovmskb %ymm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -	vpmovmskb %ymm1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -	vpmovmskb %ymm2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -	vpmovmskb %ymm3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> -	add	$(VEC_SIZE * 4), %rax
> -	vpmovmskb %ymm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -	vpmovmskb %ymm1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -	vpmovmskb %ymm2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -	vpmovmskb %ymm3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> -	add	$(VEC_SIZE * 4), %rax
> -	vpmovmskb %ymm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -	vpmovmskb %ymm1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -	vpmovmskb %ymm2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -	vpmovmskb %ymm3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fifth_vector)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> -	add	$(VEC_SIZE * 5), %rax
> -	vpmovmskb %ymm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
> -	add	$VEC_SIZE, %rax
> -	vpmovmskb %ymm1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
> -	add	$VEC_SIZE, %rax
> -	vpmovmskb %ymm2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$((VEC_SIZE * 4) - 1), %rax
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
> -	add	$VEC_SIZE, %rax
> -	vpmovmskb %ymm3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	add	$VEC_SIZE, %rax
> -
> -	.p2align 4
> -L(align_four_vec_loop):
> -	vmovaps	(%rax),	%ymm4
> -	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
> -	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
> -	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
> -	add	$(VEC_SIZE * 4),	%rax
> -	vpminub	%ymm4,	%ymm5, %ymm5
> -	vpcmpeqb %ymm5,	%ymm6, %ymm5
> -	vpmovmskb %ymm5,	%edx
> -	test	%edx,	%edx
> -	jz	L(align_four_vec_loop)
> -
> -	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
> -	sub	$(VEC_SIZE * 5),	%rax
> -	vpmovmskb %ymm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_second_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> -	vpmovmskb %ymm1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_third_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> -	vpmovmskb %ymm2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_null_on_fourth_vector)
> -
> -	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> -	vpmovmskb %ymm3, %edx
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 4), %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit):
> -	sub	%rdi, %rax
> -L(exit_null_on_first_vector):
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_second_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$VEC_SIZE, %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_third_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 2), %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_fourth_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 3), %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_null_on_fifth_vector):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$(VEC_SIZE * 4), %rax
> -
> -	.p2align 4
> -L(StartStrcpyPart):
> -	lea	(%r9, %rax), %rdi
> -	mov	%rsi, %rcx
> -	mov	%r9, %rax      /* save result */
> -
> -# ifdef USE_AS_STRNCAT
> -	test	%r8, %r8
> -	jz	L(ExitZero)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-avx2.S"
> +#ifndef STRCAT
> +# define STRCAT	__strcat_avx2
>  #endif
> +
> +#define USE_AS_STRCAT
> +#define STRCPY	STRCAT
> +#include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S
> new file mode 100644
> index 0000000000..f50514e07c
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S
> @@ -0,0 +1,101 @@
> +/* strlen used for the beginning of str{n}cat using AVX2.
> +   Copyright (C) 2011-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +/* NOTE: This file is meant to be included by strcat-avx2 or
> +   strncat-avx2 and does not stand alone.  Before including it, %rdi
> +   must be saved in %rax.  */
> +
> +
> +/* Simple strlen implementation that ends at
> +   L(strcat_strlen_done).  */
> +	movq	%rdi, %r8
> +	andq	$(VEC_SIZE * -1), %r8
> +	VPCMPEQ	(%r8), %VZERO, %VMM(0)
> +	vpmovmskb %VMM(0), %ecx
> +	shrxl	%edi, %ecx, %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(bsf_and_done_v0)
> +
> +	VPCMPEQ	VEC_SIZE(%r8), %VZERO, %VMM(0)
> +	vpmovmskb %VMM(0), %ecx
> +	leaq	(VEC_SIZE)(%r8), %rdi
> +	testl	%ecx, %ecx
> +	jnz	L(bsf_and_done_v0)
> +
> +	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0)
> +	vpmovmskb %VMM(0), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(bsf_and_done_v1)
> +
> +	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0)
> +	vpmovmskb %VMM(0), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(bsf_and_done_v2)
> +
> +	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0)
> +	vpmovmskb %VMM(0), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(bsf_and_done_v3)
> +
> +	orq	$(VEC_SIZE * 4 - 1), %rdi
> +	.p2align 4,, 8
> +L(loop_2x_vec):
> +	VMOVA	(VEC_SIZE * 0 + 1)(%rdi), %VMM(0)
> +	VPMIN	(VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 1)(%rdi), %VMM(2)
> +	VPMIN	(VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3)
> +	VPMIN	%VMM(1), %VMM(3), %VMM(3)
> +	VPCMPEQ	%VMM(3), %VZERO, %VMM(3)
> +	vpmovmskb %VMM(3), %r8d
> +	subq	$(VEC_SIZE * -4), %rdi
> +	testl	%r8d, %r8d
> +	jz	L(loop_2x_vec)
> +
> +	addq	$(VEC_SIZE * -4 + 1), %rdi
> +
> +	VPCMPEQ	%VMM(0), %VZERO, %VMM(0)
> +	vpmovmskb %VMM(0), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(bsf_and_done_v0)
> +
> +	VPCMPEQ	%VMM(1), %VZERO, %VMM(1)
> +	vpmovmskb %VMM(1), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(bsf_and_done_v1)
> +
> +	VPCMPEQ	%VMM(2), %VZERO, %VMM(2)
> +	vpmovmskb %VMM(2), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(bsf_and_done_v2)
> +
> +	movl	%r8d, %ecx
> +L(bsf_and_done_v3):
> +	addq	$VEC_SIZE, %rdi
> +L(bsf_and_done_v2):
> +	bsfl	%ecx, %ecx
> +	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rdi
> +	jmp	L(strcat_strlen_done)
> +
> +	.p2align 4,, 4
> +L(bsf_and_done_v1):
> +	addq	$VEC_SIZE, %rdi
> +L(bsf_and_done_v0):
> +	bsfl	%ecx, %ecx
> +	addq	%rcx, %rdi
> +L(strcat_strlen_done):
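
(Editor's illustration, not part of the quoted patch: with this split,
strcat-avx2 becomes an inlined strlen of dst -- the helper above --
followed by the shared strcpy-avx2 body; in C terms the shape is
roughly:)

    #include <string.h>

    /* Rough shape of the new strcat: find the end of dst inline, then
       run the common strcpy code; the original dst is the return
       value.  */
    static char *
    strcat_shape (char *dst, const char *src)
    {
      strcpy (dst + strlen (dst), src);
      return dst;
    }
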
> diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> index c2c581ecf7..3ae2de8ea9 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> @@ -1,12 +1,3 @@
> -#ifndef STRCPY
> -# define STRCPY __strcpy_avx2_rtm
> -#endif
> -
> -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> -  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> -
> -#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
> -
> -#define SECTION(p) p##.avx.rtm
> -
> +#define STRCPY	__strcpy_avx2_rtm
> +#include "x86-avx-rtm-vecs.h"
>  #include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
> index c725834929..32f86baa4c 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
> @@ -20,984 +20,378 @@
>  
>  #if ISA_SHOULD_BUILD (3)
>  
> +# include <sysdep.h>
>  
> -# ifndef USE_AS_STRCAT
> -#  include <sysdep.h>
> -
> -#  ifndef STRCPY
> -#   define STRCPY  __strcpy_avx2
> -#  endif
> -
> -# endif
> -
> -/* Number of bytes in a vector register */
>  # ifndef VEC_SIZE
> -#  define VEC_SIZE	32
> -# endif
> -
> -# ifndef VZEROUPPER
> -#  define VZEROUPPER	vzeroupper
> -# endif
> -
> -# ifndef SECTION
> -#  define SECTION(p)	p##.avx
> -# endif
> -
> -/* zero register */
> -#define xmmZ	xmm0
> -#define ymmZ	ymm0
> -
> -/* mask register */
> -#define ymmM	ymm1
> -
> -# ifndef USE_AS_STRCAT
> -
> -	.section SECTION(.text),"ax",@progbits
> -ENTRY (STRCPY)
> -#  ifdef USE_AS_STRNCPY
> -	mov	%RDX_LP, %R8_LP
> -	test	%R8_LP, %R8_LP
> -	jz	L(ExitZero)
> -#  endif
> -	mov	%rsi, %rcx
> -#  ifndef USE_AS_STPCPY
> -	mov	%rdi, %rax      /* save result */
> -#  endif
> -
> +#  include "x86-avx-vecs.h"
>  # endif
>  
> -	vpxor	%xmmZ, %xmmZ, %xmmZ
> -
> -	and	$((VEC_SIZE * 4) - 1), %ecx
> -	cmp	$(VEC_SIZE * 2), %ecx
> -	jbe	L(SourceStringAlignmentLessTwoVecSize)
> -
> -	and	$-VEC_SIZE, %rsi
> -	and	$(VEC_SIZE - 1), %ecx
> -
> -	vpcmpeqb (%rsi), %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -	shr	%cl, %rdx
> -
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -	mov	$VEC_SIZE, %r10
> -	sub	%rcx, %r10
> -	cmp	%r10, %r8
> -#  else
> -	mov	$(VEC_SIZE + 1), %r10
> -	sub	%rcx, %r10
> -	cmp	%r10, %r8
> -#  endif
> -	jbe	L(CopyVecSizeTailCase2OrCase3)
> +# ifndef STRCPY
> +#  define STRCPY	__strcpy_avx2
>  # endif
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeTail)
>  
> -	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
> -	vpmovmskb %ymm2, %edx
> +	/* Use movsb in page cross case to save code size.  */
> +# define USE_MOVSB_IN_PAGE_CROSS	1
>  
> -# ifdef USE_AS_STRNCPY
> -	add	$VEC_SIZE, %r10
> -	cmp	%r10, %r8
> -	jbe	L(CopyTwoVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyTwoVecSize)
> -
> -	vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
> -	vmovdqu %ymm2, (%rdi)
> -
> -/* If source address alignment != destination address alignment */
> -	.p2align 4
> -L(UnalignVecSizeBoth):
> -	sub	%rcx, %rdi
> -# ifdef USE_AS_STRNCPY
> -	add	%rcx, %r8
> -	sbb	%rcx, %rcx
> -	or	%rcx, %r8
> -# endif
> -	mov	$VEC_SIZE, %rcx
> -	vmovdqa (%rsi, %rcx), %ymm2
> -	vmovdqu %ymm2, (%rdi, %rcx)
> -	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> -	vpcmpeqb %ymm2, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$(VEC_SIZE * 3), %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec2)
> +# ifdef USE_AS_WCSCPY
> +#  define VPCMPEQ	vpcmpeqd
> +#  define VPMIN	vpminud
> +#  define CHAR_SIZE	4
>  # else
> -	jnz	L(CopyVecSize)
> +#  define VPCMPEQ	vpcmpeqb
> +#  define VPMIN	vpminub
> +#  define CHAR_SIZE	1
>  # endif
>  
> -	vmovdqu %ymm2, (%rdi, %rcx)
> -	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
> -	vpcmpeqb %ymm3, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec3)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
> +# define PAGE_SIZE	4096
>  
> -	vmovdqu %ymm3, (%rdi, %rcx)
> -	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
> -	vpcmpeqb %ymm4, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec4)
> +# ifdef USE_AS_STPCPY
> +#  define END_REG	rax
>  # else
> -	jnz	L(CopyVecSize)
> +#  define END_REG	rdi, %rdx
>  # endif
>  
> -	vmovdqu %ymm4, (%rdi, %rcx)
> -	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> -	vpcmpeqb %ymm2, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec2)
> +# ifdef USE_AS_STRCAT
> +#  define PAGE_ALIGN_REG	ecx
>  # else
> -	jnz	L(CopyVecSize)
> +#  define PAGE_ALIGN_REG	eax
>  # endif
>  
> -	vmovdqu %ymm2, (%rdi, %rcx)
> -	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> -	vpcmpeqb %ymm2, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec2)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
> +# define VZERO	VMM(7)
> +# define VZERO_128	VMM_128(7)
>  
> -	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
> -	vmovdqu %ymm2, (%rdi, %rcx)
> -	vpcmpeqb %ymm3, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -	add	$VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec3)
> -# else
> -	jnz	L(CopyVecSize)
> -# endif
> +	.section SECTION(.text), "ax", @progbits
> +ENTRY(STRCPY)
> +	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
>  
> -	vmovdqu %ymm3, (%rdi, %rcx)
> -	mov	%rsi, %rdx
> -	lea	VEC_SIZE(%rsi, %rcx), %rsi
> -	and	$-(VEC_SIZE * 4), %rsi
> -	sub	%rsi, %rdx
> -	sub	%rdx, %rdi
> -# ifdef USE_AS_STRNCPY
> -	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
> -# endif
> -L(UnalignedFourVecSizeLoop):
> -	vmovdqa (%rsi), %ymm4
> -	vmovdqa VEC_SIZE(%rsi), %ymm5
> -	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
> -	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
> -	vpminub %ymm5, %ymm4, %ymm2
> -	vpminub %ymm7, %ymm6, %ymm3
> -	vpminub %ymm2, %ymm3, %ymm3
> -	vpcmpeqb %ymmM, %ymm3, %ymm3
> -	vpmovmskb %ymm3, %edx
> -# ifdef USE_AS_STRNCPY
> -	sub	$(VEC_SIZE * 4), %r8
> -	jbe	L(UnalignedLeaveCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(UnalignedFourVecSizeLeave)
> -
> -L(UnalignedFourVecSizeLoop_start):
> -	add	$(VEC_SIZE * 4), %rdi
> -	add	$(VEC_SIZE * 4), %rsi
> -	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
> -	vmovdqa (%rsi), %ymm4
> -	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
> -	vmovdqa VEC_SIZE(%rsi), %ymm5
> -	vpminub %ymm5, %ymm4, %ymm2
> -	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
> -	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
> -	vmovdqu %ymm7, -VEC_SIZE(%rdi)
> -	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
> -	vpminub %ymm7, %ymm6, %ymm3
> -	vpminub %ymm2, %ymm3, %ymm3
> -	vpcmpeqb %ymmM, %ymm3, %ymm3
> -	vpmovmskb %ymm3, %edx
> -# ifdef USE_AS_STRNCPY
> -	sub	$(VEC_SIZE * 4), %r8
> -	jbe	L(UnalignedLeaveCase2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jz	L(UnalignedFourVecSizeLoop_start)
> -
> -L(UnalignedFourVecSizeLeave):
> -	vpcmpeqb %ymm4, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeUnaligned_0)
> -
> -	vpcmpeqb %ymm5, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %ecx
> -	test	%ecx, %ecx
> -	jnz	L(CopyVecSizeUnaligned_16)
> -
> -	vpcmpeqb %ymm6, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeUnaligned_32)
> -
> -	vpcmpeqb %ymm7, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %ecx
> -	bsf	%ecx, %edx
> -	vmovdqu %ymm4, (%rdi)
> -	vmovdqu %ymm5, VEC_SIZE(%rdi)
> -	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
> -# endif
> -	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
> -	add	$(VEC_SIZE - 1), %r8
> -	sub	%rdx, %r8
> -	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	add	$(VEC_SIZE * 3), %rsi
> -	add	$(VEC_SIZE * 3), %rdi
> -	jmp	L(CopyVecSizeExit)
> +# ifdef USE_AS_STRCAT
> +	movq	%rdi, %rax
> +#  include "strcat-strlen-avx2.h.S"
>  # endif
>  
> -/* If source address alignment == destination address alignment */
> -
> -L(SourceStringAlignmentLessTwoVecSize):
> -	vmovdqu (%rsi), %ymm3
> -	vmovdqu VEC_SIZE(%rsi), %ymm2
> -	vpcmpeqb %ymm3, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -	cmp	$VEC_SIZE, %r8
> -#  else
> -	cmp	$(VEC_SIZE + 1), %r8
> -#  endif
> -	jbe	L(CopyVecSizeTail1Case2OrCase3)
> +	movl	%esi, %PAGE_ALIGN_REG
> +	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> +	ja	L(page_cross)
> +L(page_cross_continue):
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +	movq	%rdi, %rax
>  # endif
> -	test	%edx, %edx
> -	jnz	L(CopyVecSizeTail1)
> -
> -	vmovdqu %ymm3, (%rdi)
> -	vpcmpeqb %ymm2, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -
> -# ifdef USE_AS_STRNCPY
> -#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> -	cmp	$(VEC_SIZE * 2), %r8
> -#  else
> -	cmp	$((VEC_SIZE * 2) + 1), %r8
> -#  endif
> -	jbe	L(CopyTwoVecSize1Case2OrCase3)
> -# endif
> -	test	%edx, %edx
> -	jnz	L(CopyTwoVecSize1)
> -
> -	and	$-VEC_SIZE, %rsi
> -	and	$(VEC_SIZE - 1), %ecx
> -	jmp	L(UnalignVecSizeBoth)
> +	VMOVU	(%rsi), %VMM(0)
> +	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
>  
> -/*------End of main part with loops---------------------*/
> +	testl	%ecx, %ecx
> +	jz	L(more_1x_vec)
>  
> -/* Case1 */
> +	/* No longer need ymm registers so just vzeroupper so it doesn't
> +	   need to be duplicated at each return statement.  */
> +	COND_VZEROUPPER
>  
> -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> -	.p2align 4
> -L(CopyVecSize):
> -	add	%rcx, %rdi
> -# endif
> -L(CopyVecSizeTail):
> -	add	%rcx, %rsi
> -L(CopyVecSizeTail1):
> -	bsf	%edx, %edx
> -L(CopyVecSizeExit):
> -	cmp	$32, %edx
> -	jae	L(Exit32_63)
> -	cmp	$16, %edx
> -	jae	L(Exit16_31)
> -	cmp	$8, %edx
> -	jae	L(Exit8_15)
> -	cmp	$4, %edx
> -	jae	L(Exit4_7)
> -	cmp	$3, %edx
> -	je	L(Exit3)
> -	cmp	$1, %edx
> -	ja	L(Exit2)
> -	je	L(Exit1)
> -	movb	$0, (%rdi)
> +	xorl	%edx, %edx
> +	bsfl	%ecx, %edx
>  # ifdef USE_AS_STPCPY
> -	lea	(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$1, %r8
> -	lea	1(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> -# endif
> -L(return_vzeroupper):
> -	ZERO_UPPER_VEC_REGISTERS_RETURN
> -
> -	.p2align 4
> -L(CopyTwoVecSize1):
> -	add	$VEC_SIZE, %rsi
> -	add	$VEC_SIZE, %rdi
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$VEC_SIZE, %r8
> -# endif
> -	jmp	L(CopyVecSizeTail1)
> -
> -	.p2align 4
> -L(CopyTwoVecSize):
> -	bsf	%edx, %edx
> -	add	%rcx, %rsi
> -	add	$VEC_SIZE, %edx
> -	sub	%ecx, %edx
> -	jmp	L(CopyVecSizeExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnaligned_0):
> -	bsf	%edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -	vmovdqu %ymm4, (%rdi)
> -	add	$((VEC_SIZE * 4) - 1), %r8
> -	sub	%rdx, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> -# else
> -	jmp	L(CopyVecSizeExit)
> -# endif
> -
> -	.p2align 4
> -L(CopyVecSizeUnaligned_16):
> -	bsf	%ecx, %edx
> -	vmovdqu %ymm4, (%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	VEC_SIZE(%rdi, %rdx), %rax
> -# endif
> -	vmovdqu %ymm5, VEC_SIZE(%rdi)
> -	add	$((VEC_SIZE * 3) - 1), %r8
> -	sub	%rdx, %r8
> -	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> +	leaq	(%rdi, %rdx), %rax
> +# endif
> +
> +	/* Use mask bits in rcx to detect which copy we need. If the low
> +	   mask is zero then there must be a bit set in the upper half.
> +	   I.e. if ecx != 0 and cx == 0, the match must be in the upper 16
> +	   bits so we use L(copy_16_31).  */
> +	testw	%cx, %cx
> +	jz	L(copy_16_31)
> +
> +	testb	%cl, %cl
> +	jz	L(copy_8_15)
> +# ifdef USE_AS_WCSCPY
> +	vmovd	%xmm0, (%rdi)
> +	movl	$0, (%END_REG)
> +	ret
>  # else
> -	add	$VEC_SIZE, %rsi
> -	add	$VEC_SIZE, %rdi
> -	jmp	L(CopyVecSizeExit)
> -# endif
> -
> -	.p2align 4
> -L(CopyVecSizeUnaligned_32):
> -	bsf	%edx, %edx
> -	vmovdqu %ymm4, (%rdi)
> -	vmovdqu %ymm5, VEC_SIZE(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> -	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
> -# endif
> -	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> -	add	$((VEC_SIZE * 2) - 1), %r8
> -	sub	%rdx, %r8
> -	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> -	jmp	L(StrncpyFillTailWithZero)
> +	testb	$0x7, %cl
> +	jz	L(copy_4_7)
> +
> +	testl	%edx, %edx
> +	jz	L(set_null_term)
> +	vmovd	%xmm0, %ecx
> +	movw	%cx, (%rdi)
> +
> +	.p2align 4,, 2
> +L(set_null_term):
> +	movb	$0, (%END_REG)
> +	ret
> +
> +	.p2align 4,, 12
> +L(copy_4_7):
> +	movl	-3(%rsi, %rdx), %ecx
> +	vmovd	%xmm0, (%rdi)
> +	movl	%ecx, -3(%END_REG)
> +	ret
> +# endif
> +
> +	.p2align 4,, 10
> +L(copy_16_31):
> +	VMOVU	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
> +	VMOVU	%xmm0, (%rdi)
> +	VMOVU	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
> +	ret
> +
> +	.p2align 4,, 10
> +L(copy_8_15):
> +# ifdef USE_AS_WCSCPY
> +	movl	-(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
>  # else
> -	add	$(VEC_SIZE * 2), %rsi
> -	add	$(VEC_SIZE * 2), %rdi
> -	jmp	L(CopyVecSizeExit)
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> -#  ifndef USE_AS_STRCAT
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec6):
> -	vmovdqu %ymm6, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec5):
> -	vmovdqu %ymm5, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec4):
> -	vmovdqu %ymm4, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> -
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec3):
> -	vmovdqu %ymm3, (%rdi, %rcx)
> -	jmp	L(CopyVecSizeVecExit)
> -#  endif
> -
> -/* Case2 */
> -
> -	.p2align 4
> -L(CopyVecSizeCase2):
> -	add	$VEC_SIZE, %r8
> -	add	%rcx, %rdi
> -	add	%rcx, %rsi
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyTwoVecSizeCase2):
> -	add	%rcx, %rsi
> -	bsf	%edx, %edx
> -	add	$VEC_SIZE, %edx
> -	sub	%ecx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -L(CopyVecSizeTailCase2):
> -	add	%rcx, %rsi
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -L(CopyVecSizeTail1Case2):
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -	jmp	L(StrncpyExit)
> -
> -/* Case2 or Case3,  Case3 */
> -
> -	.p2align 4
> -L(CopyVecSizeCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyVecSizeCase2)
> -L(CopyVecSizeCase3):
> -	add	$VEC_SIZE, %r8
> -	add	%rcx, %rdi
> -	add	%rcx, %rsi
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyTwoVecSizeCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyTwoVecSizeCase2)
> -	add	%rcx, %rsi
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyVecSizeTailCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyVecSizeTailCase2)
> -	add	%rcx, %rsi
> -	jmp	L(StrncpyExit)
> -
> -	.p2align 4
> -L(CopyTwoVecSize1Case2OrCase3):
> -	add	$VEC_SIZE, %rdi
> -	add	$VEC_SIZE, %rsi
> -	sub	$VEC_SIZE, %r8
> -L(CopyVecSizeTail1Case2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(CopyVecSizeTail1Case2)
> -	jmp	L(StrncpyExit)
> -# endif
> -
> -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
> -
> -	.p2align 4
> -L(Exit1):
> -	movzwl	(%rsi), %edx
> -	mov	%dx, (%rdi)
> -# ifdef USE_AS_STPCPY
> -	lea	1(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$2, %r8
> -	lea	2(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> -# endif
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(Exit2):
> -	movzwl	(%rsi), %ecx
> -	mov	%cx, (%rdi)
> -	movb	$0, 2(%rdi)
> -# ifdef USE_AS_STPCPY
> -	lea	2(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$3, %r8
> -	lea	3(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> -# endif
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(Exit3):
> -	mov	(%rsi), %edx
> -	mov	%edx, (%rdi)
> +	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
> +# endif
> +	vmovq	%xmm0, (%rdi)
> +	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
> +	ret
> +
> +
> +	.p2align 4,, 8
> +L(more_1x_vec):
> +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> +	VMOVU	%VMM(0), (%rdi)
> +# endif
> +	subq	%rsi, %rdi
> +	orq	$(VEC_SIZE - 1), %rsi
> +	addq	%rsi, %rdi
> +	VMOVA	1(%rsi), %VMM(1)
> +
> +	/* Try and order stores after as many loads as is reasonable to
> +	   avoid potential false dependencies.  */
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +	VMOVU	%VMM(0), (%rax)
> +# endif
> +	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(ret_vec_x1)
> +
> +	VMOVA	(VEC_SIZE + 1)(%rsi), %VMM(2)
> +	VMOVU	%VMM(1), 1(%rdi)
> +
> +	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(ret_vec_x2)
> +
> +	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
> +	VMOVU	%VMM(2), (VEC_SIZE + 1)(%rdi)
> +
> +	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(ret_vec_x3)
> +
> +	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
> +	VMOVU	%VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
> +	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %edx
> +	testl	%edx, %edx
> +	jnz	L(ret_vec_x4)
> +
> +	VMOVU	%VMM(4), (VEC_SIZE * 3 + 1)(%rdi)
> +
> +	/* Subtract rsi from rdi before aligning. Adding back rsi will
> +	   get proper rdi (dst) for new src.  */
> +	subq	%rsi, %rdi
> +	incq	%rsi
> +	orq	$(VEC_SIZE * 4 - 1), %rsi
> +
> +	/* Do first half of loop ahead of time so loop can just start by
> +	   storing.  */
> +	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPMIN	%VMM(4), %VMM(6), %VMM(6)
> +	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %edx
> +	addq	%rsi, %rdi
> +
> +	testl	%edx, %edx
> +	jnz	L(loop_4x_done)
> +
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +
> +	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
> +	subq	$(VEC_SIZE * -4), %rsi
> +	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3 + 1)(%rdi)
> +
> +
> +	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPMIN	%VMM(4), %VMM(6), %VMM(6)
> +	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
> +
> +	vpmovmskb %VMM(6), %edx
> +	subq	$(VEC_SIZE * -4), %rdi
> +	testl	%edx, %edx
> +	jz	L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(ret_vec_x1)
> +	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
> +
> +	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(ret_vec_x2)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
> +
> +	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(ret_vec_x3)
> +	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
> +L(ret_vec_x4):
> +	bsfl	%edx, %edx
> +	VMOVU	((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> +	VMOVU	%VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
>  # ifdef USE_AS_STPCPY
> -	lea	3(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	$4, %r8
> -	lea	4(%rdi), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	(VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
>  # endif
> +L(return_end):
>  	VZEROUPPER_RETURN
>  
> -	.p2align 4
> -L(Exit4_7):
> -	mov	(%rsi), %ecx
> -	mov	%ecx, (%rdi)
> -	mov	-3(%rsi, %rdx), %ecx
> -	mov	%ecx, -3(%rdi, %rdx)
> +	.p2align 4,, 8
> +L(ret_vec_x1):
> +	bsfl	%ecx, %ecx
> +	VMOVU	(1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> +	VMOVU	%VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
>  # ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	%rdx, %r8
> -	sub	$1, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	1(%rcx, %rdi), %rax
>  # endif
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(Exit8_15):
> -	mov	(%rsi), %rcx
> -	mov	-7(%rsi, %rdx), %r9
> -	mov	%rcx, (%rdi)
> -	mov	%r9, -7(%rdi, %rdx)
> -# ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	%rdx, %r8
> -	sub	$1, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> -# endif
> -	VZEROUPPER_RETURN
> +L(return_vzeroupper):
> +	ZERO_UPPER_VEC_REGISTERS_RETURN
>  
> -	.p2align 4
> -L(Exit16_31):
> -	vmovdqu (%rsi), %xmm2
> -	vmovdqu -15(%rsi, %rdx), %xmm3
> -	vmovdqu %xmm2, (%rdi)
> -	vmovdqu %xmm3, -15(%rdi, %rdx)
> +	.p2align 4,, 8
> +L(ret_vec_x2):
> +	bsfl	%ecx, %ecx
> +	VMOVU	((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> +	VMOVU	%VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
>  # ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub %rdx, %r8
> -	sub $1, %r8
> -	lea 1(%rdi, %rdx), %rdi
> -	jnz L(StrncpyFillTailWithZero)
> +	leaq	(VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
>  # endif
>  	VZEROUPPER_RETURN
>  
> -	.p2align 4
> -L(Exit32_63):
> -	vmovdqu (%rsi), %ymm2
> -	vmovdqu -31(%rsi, %rdx), %ymm3
> -	vmovdqu %ymm2, (%rdi)
> -	vmovdqu %ymm3, -31(%rdi, %rdx)
> +	.p2align 4,, 8
> +L(ret_vec_x3):
> +	bsfl	%ecx, %ecx
> +	VMOVU	((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> +	VMOVU	%VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
>  # ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -	sub	%rdx, %r8
> -	sub	$1, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -	jnz	L(StrncpyFillTailWithZero)
> +	leaq	(VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
>  # endif
>  	VZEROUPPER_RETURN
>  
> -# ifdef USE_AS_STRNCPY
>  
> -	.p2align 4
> -L(StrncpyExit1):
> -	movzbl	(%rsi), %edx
> -	mov	%dl, (%rdi)
> +	.p2align 4,, 4
> +L(page_cross):
> +	movq	%rsi, %rcx
> +	andq	$(VEC_SIZE * -1), %rcx
> +
> +	VPCMPEQ	(%rcx), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	shrxl	%esi, %ecx, %ecx
> +# if USE_MOVSB_IN_PAGE_CROSS
> +	/* Optimizing more aggressively for space as this is very cold
> +	   code. This saves 2x cache lines.  */
> +
> +	/* This adds once to the later result which will get correct
> +	   copy bounds. NB: this can never zero-out a non-zero RCX as
> +	   to be in the page cross case rsi cannot be aligned and we
> +	   already right-shift rcx by the misalignment.  */
> +	shll	$CHAR_SIZE, %ecx
> +	jz	L(page_cross_continue)
> +	bsfl	%ecx, %ecx
> +#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> +	movq	%rdi, %rax
> +#  endif
> +	rep	movsb
>  #  ifdef USE_AS_STPCPY
> -	lea	1(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, 1(%rdi)
> +	leaq	-CHAR_SIZE(%rdi), %rax
>  #  endif
> -	VZEROUPPER_RETURN
>  
> -	.p2align 4
> -L(StrncpyExit2):
> -	movzwl	(%rsi), %edx
> -	mov	%dx, (%rdi)
> -#  ifdef USE_AS_STPCPY
> -	lea	2(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, 2(%rdi)
> -#  endif
>  	VZEROUPPER_RETURN
>  
> -	.p2align 4
> -L(StrncpyExit3_4):
> -	movzwl	(%rsi), %ecx
> -	movzwl	-2(%rsi, %r8), %edx
> -	mov	%cx, (%rdi)
> -	mov	%dx, -2(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(StrncpyExit5_8):
> -	mov	(%rsi), %ecx
> -	mov	-4(%rsi, %r8), %edx
> -	mov	%ecx, (%rdi)
> -	mov	%edx, -4(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(StrncpyExit9_16):
> -	mov	(%rsi), %rcx
> -	mov	-8(%rsi, %r8), %rdx
> -	mov	%rcx, (%rdi)
> -	mov	%rdx, -8(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(StrncpyExit17_32):
> -	vmovdqu (%rsi), %xmm2
> -	vmovdqu -16(%rsi, %r8), %xmm3
> -	vmovdqu %xmm2, (%rdi)
> -	vmovdqu %xmm3, -16(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(StrncpyExit33_64):
> -	/*  0/32, 31/16 */
> -	vmovdqu (%rsi), %ymm2
> -	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
> -	vmovdqu %ymm2, (%rdi)
> -	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
> -#  ifdef USE_AS_STPCPY
> -	lea	(%rdi, %r8), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi, %r8)
> -#  endif
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(StrncpyExit65):
> -	/* 0/32, 32/32, 64/1 */
> -	vmovdqu (%rsi), %ymm2
> -	vmovdqu 32(%rsi), %ymm3
> -	mov	64(%rsi), %cl
> -	vmovdqu %ymm2, (%rdi)
> -	vmovdqu %ymm3, 32(%rdi)
> -	mov	%cl, 64(%rdi)
> -#  ifdef USE_AS_STPCPY
> -	lea	65(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, 65(%rdi)
> -#  endif
> -	VZEROUPPER_RETURN
> +# else
> +	testl	%ecx, %ecx
> +	jz	L(page_cross_continue)
>  
> +	/* Traditional copy case, essentially same as used in non-page-
> +	   cross case but since we can't reuse VMM(0) we need twice as
> +	   many loads from rsi.  */
>  #  ifndef USE_AS_STRCAT
> -
> -	.p2align 4
> -L(Fill1):
> -	mov	%dl, (%rdi)
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(Fill2):
> -	mov	%dx, (%rdi)
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(Fill3_4):
> -	mov	%dx, (%rdi)
> -	mov     %dx, -2(%rdi, %r8)
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(Fill5_8):
> -	mov	%edx, (%rdi)
> -	mov     %edx, -4(%rdi, %r8)
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(Fill9_16):
> -	mov	%rdx, (%rdi)
> -	mov	%rdx, -8(%rdi, %r8)
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(Fill17_32):
> -	vmovdqu %xmmZ, (%rdi)
> -	vmovdqu %xmmZ, -16(%rdi, %r8)
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(CopyVecSizeUnalignedVec2):
> -	vmovdqu %ymm2, (%rdi, %rcx)
> -
> -	.p2align 4
> -L(CopyVecSizeVecExit):
> -	bsf	%edx, %edx
> -	add	$(VEC_SIZE - 1), %r8
> -	add	%rcx, %rdi
> -#   ifdef USE_AS_STPCPY
> -	lea	(%rdi, %rdx), %rax
> -#   endif
> -	sub	%rdx, %r8
> -	lea	1(%rdi, %rdx), %rdi
> -
> -	.p2align 4
> -L(StrncpyFillTailWithZero):
> -	xor	%edx, %edx
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(StrncpyFillExit)
> -
> -	vmovdqu %ymmZ, (%rdi)
> -	add	$VEC_SIZE, %rdi
> -
> -	mov	%rdi, %rsi
> -	and	$(VEC_SIZE - 1), %esi
> -	sub	%rsi, %rdi
> -	add	%rsi, %r8
> -	sub	$(VEC_SIZE * 4), %r8
> -	jb	L(StrncpyFillLessFourVecSize)
> -
> -L(StrncpyFillLoopVmovdqa):
> -	vmovdqa %ymmZ, (%rdi)
> -	vmovdqa %ymmZ, VEC_SIZE(%rdi)
> -	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
> -	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
> -	add	$(VEC_SIZE * 4), %rdi
> -	sub	$(VEC_SIZE * 4), %r8
> -	jae	L(StrncpyFillLoopVmovdqa)
> -
> -L(StrncpyFillLessFourVecSize):
> -	add	$(VEC_SIZE * 2), %r8
> -	jl	L(StrncpyFillLessTwoVecSize)
> -	vmovdqa %ymmZ, (%rdi)
> -	vmovdqa %ymmZ, VEC_SIZE(%rdi)
> -	add	$(VEC_SIZE * 2), %rdi
> -	sub	$VEC_SIZE, %r8
> -	jl	L(StrncpyFillExit)
> -	vmovdqa %ymmZ, (%rdi)
> -	add	$VEC_SIZE, %rdi
> -	jmp	L(Fill)
> -
> -	.p2align 4
> -L(StrncpyFillLessTwoVecSize):
> -	add	$VEC_SIZE, %r8
> -	jl	L(StrncpyFillExit)
> -	vmovdqa %ymmZ, (%rdi)
> -	add	$VEC_SIZE, %rdi
> -	jmp	L(Fill)
> -
> -	.p2align 4
> -L(StrncpyFillExit):
> -	add	$VEC_SIZE, %r8
> -L(Fill):
> -	cmp	$17, %r8d
> -	jae	L(Fill17_32)
> -	cmp	$9, %r8d
> -	jae	L(Fill9_16)
> -	cmp	$5, %r8d
> -	jae	L(Fill5_8)
> -	cmp	$3, %r8d
> -	jae	L(Fill3_4)
> -	cmp	$1, %r8d
> -	ja	L(Fill2)
> -	je	L(Fill1)
> -	VZEROUPPER_RETURN
> -
> -/* end of ifndef USE_AS_STRCAT */
> +	xorl	%edx, %edx
>  #  endif
> -
> -	.p2align 4
> -L(UnalignedLeaveCase2OrCase3):
> -	test	%rdx, %rdx
> -	jnz	L(UnalignedFourVecSizeLeaveCase2)
> -L(UnalignedFourVecSizeLeaveCase3):
> -	lea	(VEC_SIZE * 4)(%r8), %rcx
> -	and	$-VEC_SIZE, %rcx
> -	add	$(VEC_SIZE * 3), %r8
> -	jl	L(CopyVecSizeCase3)
> -	vmovdqu %ymm4, (%rdi)
> -	sub	$VEC_SIZE, %r8
> -	jb	L(CopyVecSizeCase3)
> -	vmovdqu %ymm5, VEC_SIZE(%rdi)
> -	sub	$VEC_SIZE, %r8
> -	jb	L(CopyVecSizeCase3)
> -	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> -	sub	$VEC_SIZE, %r8
> -	jb	L(CopyVecSizeCase3)
> -	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
> +	bsfl	%ecx, %edx
>  #  ifdef USE_AS_STPCPY
> -	lea	(VEC_SIZE * 4)(%rdi), %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (VEC_SIZE * 4)(%rdi)
> +	leaq	(%rdi, %rdx), %rax
> +#  elif !defined USE_AS_STRCAT
> +	movq	%rdi, %rax
>  #  endif
> -	VZEROUPPER_RETURN
>  
> -	.p2align 4
> -L(UnalignedFourVecSizeLeaveCase2):
> -	xor	%ecx, %ecx
> -	vpcmpeqb %ymm4, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -	add	$(VEC_SIZE * 3), %r8
> -	jle	L(CopyVecSizeCase2OrCase3)
> -	test	%edx, %edx
> -#  ifndef USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec4)
> -#  else
> -	jnz	L(CopyVecSize)
> -#  endif
> -	vpcmpeqb %ymm5, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -	vmovdqu %ymm4, (%rdi)
> -	add	$VEC_SIZE, %rcx
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -	test	%edx, %edx
> -#  ifndef USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec5)
> -#  else
> -	jnz	L(CopyVecSize)
> -#  endif
> +	/* vzeroupper early to avoid duplicating at each return.  */
> +	COND_VZEROUPPER
>  
> -	vpcmpeqb %ymm6, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -	vmovdqu %ymm5, VEC_SIZE(%rdi)
> -	add	$VEC_SIZE, %rcx
> -	sub	$VEC_SIZE, %r8
> -	jbe	L(CopyVecSizeCase2OrCase3)
> -	test	%edx, %edx
> -#  ifndef USE_AS_STRCAT
> -	jnz	L(CopyVecSizeUnalignedVec6)
> -#  else
> -	jnz	L(CopyVecSize)
> -#  endif
> +	testw	%cx, %cx
> +	jz	L(page_cross_copy_16_31)
>  
> -	vpcmpeqb %ymm7, %ymmZ, %ymmM
> -	vpmovmskb %ymmM, %edx
> -	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> -	lea	VEC_SIZE(%rdi, %rcx), %rdi
> -	lea	VEC_SIZE(%rsi, %rcx), %rsi
> -	bsf	%edx, %edx
> -	cmp	%r8d, %edx
> -	jb	L(CopyVecSizeExit)
> -L(StrncpyExit):
> -	cmp	$65, %r8d
> -	je	L(StrncpyExit65)
> -	cmp	$33, %r8d
> -	jae	L(StrncpyExit33_64)
> -	cmp	$17, %r8d
> -	jae	L(StrncpyExit17_32)
> -	cmp	$9, %r8d
> -	jae	L(StrncpyExit9_16)
> -	cmp	$5, %r8d
> -	jae	L(StrncpyExit5_8)
> -	cmp	$3, %r8d
> -	jae	L(StrncpyExit3_4)
> -	cmp	$1, %r8d
> -	ja	L(StrncpyExit2)
> -	je	L(StrncpyExit1)
> -#  ifdef USE_AS_STPCPY
> -	mov	%rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRCAT
> -	movb	$0, (%rdi)
> -#  endif
> -	VZEROUPPER_RETURN
> -
> -	.p2align 4
> -L(ExitZero):
> -#  ifndef USE_AS_STRCAT
> -	mov	%rdi, %rax
> -#  endif
> -	VZEROUPPER_RETURN
> +	testb	%cl, %cl
> +	jz	L(page_cross_copy_8_15)
>  
> -# endif
> +	testl	$0x7, %cl
> +	jz	L(page_cross_copy_4_7)
>  
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# else
> -END (STRCAT)
> -# endif
> +	testl	%edx, %edx
> +	jz	L(page_cross_set_null_term)
> +	movzwl	(%rsi), %ecx
> +	movw	%cx, (%rdi)
> +L(page_cross_set_null_term):
> +	movb	$0, (%END_REG)
> +	ret
> +
> +	.p2align 4,, 4
> +L(page_cross_copy_4_7):
> +	movl	(%rsi), %ecx
> +	movl	-3(%rsi, %rdx), %esi
> +	movl	%ecx, (%rdi)
> +	movl	%esi, -3(%END_REG)
> +	ret
> +
> +	.p2align 4,, 4
> +L(page_cross_copy_8_15):
> +	movq	(%rsi), %rcx
> +	movq	-7(%rsi, %rdx), %rsi
> +	movq	%rcx, (%rdi)
> +	movq	%rsi, -7(%END_REG)
> +	ret
> +
> +
> +	.p2align 4,, 3
> +L(page_cross_copy_16_31):
> +	VMOVU	(%rsi), %xmm0
> +	VMOVU	-15(%rsi, %rdx), %xmm1
> +	VMOVU	%xmm0, (%rdi)
> +	VMOVU	%xmm1, -15(%END_REG)
> +	ret
> +# endif
> +
> +END(STRCPY)
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> index 0dcea18dbb..7272deef2c 100644
> --- a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> @@ -1,3 +1,3 @@
> -#define USE_AS_STRNCAT
> -#define STRCAT __strncat_avx2_rtm
> -#include "strcat-avx2-rtm.S"
> +#define STRNCAT	__strncat_avx2_rtm
> +#include "x86-avx-rtm-vecs.h"
> +#include "strncat-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
> index 52ecbca943..ffa58bd0de 100644
> --- a/sysdeps/x86_64/multiarch/strncat-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
> @@ -1,7 +1,419 @@
> -#ifndef STRNCAT
> -# define STRNCAT	__strncat_avx2
> -#endif
> +/* strncat with AVX2
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (3)
> +
> +# include <sysdep.h>
> +
> +# ifndef VEC_SIZE
> +#  include "x86-avx-vecs.h"
> +# endif
> +
> +# ifndef STRNCAT
> +#  define STRNCAT	__strncat_avx2
> +# endif
> +
> +# ifdef USE_AS_WCSCPY
> +#  define MOVCHAR	movl
> +#  define VPCMPEQ	vpcmpeqd
> +#  define VPMIN	vpminud
> +#  define CHAR_SIZE	4
> +# else
> +#  define MOVCHAR	movb
> +#  define VPCMPEQ	vpcmpeqb
> +#  define VPMIN	vpminub
> +#  define CHAR_SIZE	1
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# define PAGE_SIZE	4096
> +
> +# define VZERO	VMM(7)
> +# define VZERO_128	VMM_128(7)
> +
> +	.section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCAT)
> +	/* Filter zero length strings and very long strings.  Zero
> +	   length strings just return, very long strings are handled by
> +	   using the non-length variant {wcs|str}cat.  */
> +	movq	%rdi, %rax
> +# ifdef USE_AS_WCSCPY
> +	leaq	-1(%rdx), %rcx
> +	shr	$56, %rcx
> +	jnz	L(zero_len)
> +	salq	$2, %rdx
> +# else
> +	test	%rdx, %rdx
> +	jl	L(zero_len)
> +# endif
> +	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
> +
> +# include "strcat-strlen-avx2.h.S"
> +
> +	movl	%esi, %ecx
> +	andl	$(PAGE_SIZE - 1), %ecx
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
> +	ja	L(page_cross)
> +L(page_cross_continue):
> +	VMOVU	(%rsi), %VMM(0)
> +	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +
> +	tzcnt	%ecx, %r8d
> +	cmpq	%r8, %rdx
> +	jbe	L(less_1x_vec)
> +
> +	testl	%ecx, %ecx
> +	jz	L(more_1x_vec)
> +
> +	/* Hoist this to save code size.  */
> +
> +	movl	%r8d, %edx
> +
> +L(less_1x_vec):
> +	COND_VZEROUPPER
> +
> +	cmpl	$16, %edx
> +	jae	L(copy_16_31)
> +	cmpl	$8, %edx
> +	jae	L(copy_8_15)
> +
> +
> +# ifdef USE_AS_WCSCPY
> +	vmovd	%VMM_128(0), (%rdi)
> +	MOVCHAR	$0, (%rdi, %rdx)
> +	ret
> +# else
> +	cmpl	$4, %edx
> +	jae	L(copy_4_7)
> +
> +	movzbl	(%rsi), %ecx
> +	cmpl	$1, %edx
> +	jbe	L(set_null_term)
> +
> +	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
> +	 */
> +	movzwl	1(%rsi), %esi
> +	movw	%si, 1(%rdi)
> +
> +	.p2align 4,, 1
> +L(set_null_term):
> +	movb	%cl, (%rdi)
> +	MOVCHAR	$0, (%rdi, %rdx)
> +	ret
> +
> +	.p2align 4,, 11
> +L(copy_4_7):
> +	movl	-(4)(%rsi, %rdx), %ecx
> +	vmovd	%xmm0, (%rdi)
> +	movl	%ecx, -(4)(%rdi, %rdx)
> +	MOVCHAR	$0, (%rdi, %rdx)
> +	ret
> +# endif
> +
> +
> +	.p2align 4,, 10
> +L(copy_16_31):
> +	VMOVU	-(16)(%rsi, %rdx), %xmm1
> +	VMOVU	%xmm0, (%rdi)
> +	VMOVU	%xmm1, -(16)(%rdi, %rdx)
> +	MOVCHAR	$0, (%rdi, %rdx)
> +	ret
> +
> +	.p2align 4,, 10
> +L(copy_8_15):
> +	movq	-(8)(%rsi, %rdx), %rcx
> +	vmovq	%xmm0, (%rdi)
> +	movq	%rcx, -(8)(%rdi, %rdx)
> +	MOVCHAR	$0, (%rdi, %rdx)
> +	ret
> +
> +	.p2align 4,, 8
> +	.p2align 6,, 14
> +L(more_1x_vec):
> +	VMOVU	%VMM(0), (%rdi)
> +
> +	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
> +	addq	%rsi, %rdx
> +	subq	%rsi, %rdi
> +	orq	$(VEC_SIZE - 1), %rsi
> +	incq	%rsi
> +	addq	%rsi, %rdi
> +L(loop_last_4x_vec):
> +	subq	%rsi, %rdx
> +	VMOVA	0(%rsi), %VMM(1)
> +	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	cmpq	$(VEC_SIZE * 2), %rdx
> +	ja	L(more_2x_vec)
> +L(last_2x_vec):
> +	tzcnt	%ecx, %ecx
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x1_len)
> +
> +	cmpl	$VEC_SIZE, %ecx
> +	jnz	L(ret_vec_x1)
> +
> +	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
> +	VMOVU	%VMM(1), (%rdi)
> +	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	addl	$-VEC_SIZE, %edx
> +	bzhil	%edx, %ecx, %r8d
> +	jz	L(ret_vec_x2_len)
> +L(ret_vec_x2):
> +	bsfl	%ecx, %edx
> +L(ret_vec_x2_len):
> +	VMOVU	(%rsi, %rdx), %VMM(0)
> +	MOVCHAR	$0, (VEC_SIZE)(%rdi, %rdx)
> +	VMOVU	%VMM(0), (%rdi, %rdx)
> +L(return_vzeroupper):
> +	ZERO_UPPER_VEC_REGISTERS_RETURN
> +
> +
> +	.p2align 4,, 12
> +L(ret_vec_x1_len):
> +	movl	%edx, %ecx
> +L(ret_vec_x1):
> +	VMOVU	-(VEC_SIZE)(%rsi, %rcx), %VMM(1)
> +	MOVCHAR	$0, (%rdi, %rcx)
> +	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rcx)
> +	VZEROUPPER_RETURN
> +
> +	.p2align 4,, 8
> +L(last_4x_vec):
> +	subq	$-(VEC_SIZE * 4), %rsi
> +	VMOVA	0(%rsi), %VMM(1)
> +	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	addl	$-(VEC_SIZE * 4), %edx
> +	cmpl	$(VEC_SIZE * 2), %edx
> +	jbe	L(last_2x_vec)
> +	.p2align 4,, 8
> +L(more_2x_vec):
> +	/* L(ret_vec_x1) expects ecx to have position of first match so
> +	   test with bsf.  */
> +	bsfl	%ecx, %ecx
> +	jnz	L(ret_vec_x1)
> +
> +	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
> +	VMOVU	%VMM(1), (%rdi)
> +
> +	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(ret_vec_x2)
> +
>  
> -#define USE_AS_STRNCAT
> -#define STRCAT	STRNCAT
> -#include "strcat-avx2.S"
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
> +	VMOVU	%VMM(2), (VEC_SIZE * 1)(%rdi)
> +
> +	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +
> +	/* Check if length is greater than 4x VEC.  */
> +	cmpq	$(VEC_SIZE * 4), %rdx
> +	ja	L(more_4x_vec)
> +
> +	addl	$(VEC_SIZE * -2), %edx
> +
> +	tzcnt	%ecx, %ecx
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x3_len)
> +
> +	cmpl	$VEC_SIZE, %ecx
> +	jnz	L(ret_vec_x3)
> +
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
> +	VMOVU	%VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
> +	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	addl	$-VEC_SIZE, %edx
> +	bzhil	%edx, %ecx, %r8d
> +	jz	L(ret_vec_x4_len)
> +L(ret_vec_x4):
> +	bsfl	%ecx, %edx
> +L(ret_vec_x4_len):
> +	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
> +	MOVCHAR	$0, (VEC_SIZE * 3)(%rdi, %rdx)
> +	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
> +	VZEROUPPER_RETURN
> +
> +	.p2align 4,, 4
> +L(ret_vec_x3_len):
> +	movl	%edx, %ecx
> +L(ret_vec_x3):
> +	VMOVU	(VEC_SIZE)(%rsi, %rcx), %VMM(0)
> +	MOVCHAR	$0, (VEC_SIZE * 2)(%rdi, %rcx)
> +	VMOVU	%VMM(0), (VEC_SIZE)(%rdi, %rcx)
> +	VZEROUPPER_RETURN
> +
> +
> +	.p2align 4,, 8
> +L(more_4x_vec):
> +	bsfl	%ecx, %ecx
> +	jnz	L(ret_vec_x3)
> +
> +	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
> +	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
> +	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(ret_vec_x4)
> +
> +	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
> +
> +
> +	/* Recheck length before aligning.  */
> +	cmpq	$(VEC_SIZE * 8), %rdx
> +	jbe	L(last_4x_vec)
> +
> +	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
> +	addq	%rsi, %rdx
> +	subq	%rsi, %rdi
> +	subq	$-(VEC_SIZE * 4), %rsi
> +	andq	$(VEC_SIZE * -4), %rsi
> +
> +	/* Do first half of loop ahead of time so loop can just start by
> +	   storing.  */
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPMIN	%VMM(4), %VMM(6), %VMM(6)
> +	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %r8d
> +	addq	%rsi, %rdi
> +	testl	%r8d, %r8d
> +	jnz	L(loop_4x_done)
> +
> +	/* Use r9 for end of region before handling last 4x VEC
> +	   specially.  */
> +	leaq	-(VEC_SIZE * 4)(%rdx), %r9
> +
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +
> +	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +	subq	$(VEC_SIZE * -4), %rsi
> +	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> +
> +	subq	$(VEC_SIZE * -4), %rdi
> +	cmpq	%rsi, %r9
> +	jbe	L(loop_last_4x_vec)
> +
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPMIN	%VMM(4), %VMM(6), %VMM(6)
> +	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
> +
> +	vpmovmskb %VMM(6), %r8d
> +
> +	testl	%r8d, %r8d
> +	jz	L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	/* L(ret_vec_x1) expects ecx to have position of first match so
> +	   test with bsf.  */
> +	bsfl	%ecx, %ecx
> +	jnz	L(ret_vec_x1)
> +	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +
> +	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +
> +	testl	%ecx, %ecx
> +	jnz	L(ret_vec_x2)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +
> +	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	bsfl	%ecx, %ecx
> +	jnz	L(ret_vec_x3)
> +
> +	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +	bsfl	%r8d, %r8d
> +	VMOVU	(VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
> +	VMOVU	%VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
> +	VZEROUPPER_RETURN
> +
> +
> +
> +	.p2align 4,, 4
> +L(page_cross):
> +	movq	%rsi, %r8
> +	andq	$(VEC_SIZE * -1), %r8
> +
> +	VPCMPEQ	(%r8), %VZERO, %VMM(6)
> +
> +	vpmovmskb %VMM(6), %ecx
> +	shrxl	%esi, %ecx, %ecx
> +
> +	subl	%esi, %r8d
> +	andl	$(VEC_SIZE - 1), %r8d
> +	cmpq	%r8, %rdx
> +	jb	L(page_cross_small)
> +
> +	/* Optimizing more aggressively for space as this is very cold
> +	   code. This saves 2x cache lines.  */
> +
> +	/* This adds once to the later result which will get correct
> +	   copy bounds. NB: this can never zero-out a non-zero RCX as
> +	   to be in the page cross case rsi cannot be aligned and we
> +	   already right-shift rcx by the misalignment.  */
> +	shll	$CHAR_SIZE, %ecx
> +	jz	L(page_cross_continue)
> +	bsfl	%ecx, %ecx
> +	rep	movsb
> +	VZEROUPPER_RETURN
> +
> +L(page_cross_small):
> +	tzcntl	%ecx, %ecx
> +	jz	L(page_cross_setz)
> +	cmpl	%edx, %ecx
> +	cmova	%edx, %ecx
> +	rep	movsb
> +L(page_cross_setz):
> +	MOVCHAR	$0, (%rdi)
> +	VZEROUPPER_RETURN
> +L(zero_len):
> +# ifdef USE_AS_WCSCPY
> +	test	%rdx, %rdx
> +# endif
> +	jnz	OVERFLOW_STRCAT
> +	ret
> +
> +
> +END(STRNCAT)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> index 79e7083299..d42ad88b3d 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> @@ -1,3 +1,3 @@
> -#define USE_AS_STRNCPY
> -#define STRCPY __strncpy_avx2_rtm
> -#include "strcpy-avx2-rtm.S"
> +#define STRNCPY	__strncpy_avx2_rtm
> +#include "x86-avx-rtm-vecs.h"
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
> index ce634e94fa..e9afd8fbed 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
> @@ -1,7 +1,735 @@
> -#ifndef STRNCPY
> -# define STRNCPY	__strncpy_avx2
> -#endif
> +/* strncpy with AVX2
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (3)
> +
> +# include <sysdep.h>
> +
> +
> +# ifndef VEC_SIZE
> +#  include "x86-avx-vecs.h"
> +# endif
> +
> +# ifndef STRNCPY
> +#  define STRNCPY	__strncpy_avx2
> +# endif
> +
> +
> +# ifdef USE_AS_WCSCPY
> +#  define VPCMPEQ	vpcmpeqd
> +#  define VPMIN	vpminud
> +#  define CHAR_SIZE	4
> +# else
> +#  define VPCMPEQ	vpcmpeqb
> +#  define VPMIN	vpminub
> +#  define CHAR_SIZE	1
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# define PAGE_SIZE	4096
> +
> +# define VZERO	VMM(7)
> +# define VZERO_128	VMM_128(7)
> +
> +
> +	.section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCPY)
> +	/* Filter zero length strings and very long strings.  Zero
> +	   length strings just return; very long strings are handled by
> +	   running rep stos{b|l} to zero-fill the destination (which will
> +	   almost certainly segfault).  If that succeeds then
> +	   OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy) is called.  */
> +# ifdef USE_AS_WCSCPY
> +	decq	%rdx
> +	movq	%rdx, %rax
> +	/* 56 is end of max supported address space.  */
> +	shr	$56, %rax
> +	jnz	L(zero_len)
> +	salq	$2, %rdx
> +# else
> +	decq	%rdx
> +	/* `dec` can macrofuse with `jl`. If the branch needs to become
> +	   `jb`, replace `dec` with `sub` (which sets CF).  */
> +	jl	L(zero_len)
> +# endif
> +
> +	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
> +	movl	%esi, %eax
> +	andl	$(PAGE_SIZE - 1), %eax
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> +	ja	L(page_cross)
> +
> +L(page_cross_continue):
> +	VMOVU	(%rsi), %VMM(0)
> +	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +
> +	/* If not STPCPY just set the return value ahead of time.  */
> +# ifndef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# elif defined USE_AS_WCSCPY
> +	/* Clear dependency as nearly all return code for wcpncpy uses
> +	   `setc %al`.  */
> +	xorl	%eax, %eax
> +# endif
> +
> +	cmpq	$(VEC_SIZE - CHAR_SIZE), %rdx
> +	/* `jb` because length rdx is now length - CHAR_SIZE.  */
> +	jbe	L(less_1x_vec)
> +
> +	/* This may store past the string end but that's fine because we
> +	   still need to zero fill.  */
> +	VMOVU	%VMM(0), (%rdi)
> +
> +	testl	%ecx, %ecx
> +	jnz	L(zfill)
> +
> +	/* Align.  */
> +	addq	%rsi, %rdx
> +	subq	%rsi, %rdi
> +	orq	$(VEC_SIZE - 1), %rsi
> +	incq	%rsi
> +L(last_4x_vec):
> +	addq	%rsi, %rdi
> +L(loop_last_4x_vec):
> +	subq	%rsi, %rdx
> +
> +
> +	VMOVA	0(%rsi), %VMM(1)
> +	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +
> +	cmpq	$(VEC_SIZE * 2), %rdx
> +	jae	L(more_2x_vec)
> +
> +	cmpl	$(VEC_SIZE), %edx
> +	jb	L(ret_vec_x1_len)
> +
> +	testl	%ecx, %ecx
> +	jnz	L(ret_vec_x1)
> +
> +	VPCMPEQ	VEC_SIZE(%rsi), %VZERO, %VMM(6)
> +	VMOVU	%VMM(1), (%rdi)
> +	vpmovmskb %VMM(6), %ecx
> +	shlq	$VEC_SIZE, %rcx
> +L(ret_vec_x1_len):
> +	tzcntq	%rcx, %rcx
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x1_len_no_zfill)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +L(ret_vec_x1_len_no_zfill_mov):
> +	movl	%ecx, %edx
> +# ifdef USE_AS_STPCPY
> +	/* clear flags.  */
> +	xorl	%ecx, %ecx
> +# endif
> +L(ret_vec_x1_len_no_zfill):
> +	VMOVU	((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> +	VMOVU	%VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	setc	%al
> +	addq	%rdx, %rdi
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# endif
> +L(return_vzeroupper):
> +	ZERO_UPPER_VEC_REGISTERS_RETURN
> +
> +	.p2align 4,, 6
> +L(ret_vec_x1):
> +	bsfl	%ecx, %ecx
> +	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +	subl	%ecx, %edx
> +	/* Check if we need to reload/store.  */
> +	cmpl	$VEC_SIZE, %edx
> +	jb	L(ret_vec_x1_len_no_zfill_mov)
> +	/* Otherwise safe to just store directly.  */
> +	VMOVU	%VMM(1), (%rdi)
> +	VMOVU	%VZERO, (%rdi, %rcx)
> +# ifdef USE_AS_STPCPY
> +	leaq	(%rdi, %rcx), %rax
> +# endif
> +	VZEROUPPER_RETURN
> +
> +	.p2align 4,, 12
> +L(more_2x_vec):
> +	VMOVU	%VMM(1), (%rdi)
> +	testl	%ecx, %ecx
> +	/* Must fill at least 2x VEC.  */
> +	jnz	L(zfill_vec1)
> +
> +	VMOVA	VEC_SIZE(%rsi), %VMM(2)
> +	VMOVU	%VMM(2), VEC_SIZE(%rdi)
> +	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	testl	%ecx, %ecx
> +	/* Must fill at least 1x VEC.  */
> +	jnz	L(zfill_vec2)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
> +	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +
> +	/* Check if len is more than 4x VEC. -CHAR_SIZE because rdx is
> +	   len - CHAR_SIZE.  */
> +	cmpq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
> +	ja	L(more_4x_vec)
> +
> +	subl	$(VEC_SIZE * 3), %edx
> +	jb	L(ret_vec_x3_len)
> +
> +	testl	%ecx, %ecx
> +	jnz	L(ret_vec_x3)
> +
> +	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
> +	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
> +	vpmovmskb %VMM(6), %ecx
> +	tzcntl	%ecx, %ecx
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x4_len_no_zfill)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +	movl	%ecx, %edx
> +L(ret_vec_x4_len_no_zfill):
> +	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> +	VMOVU	%VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	setc	%al
> +	addq	%rdx, %rdi
> +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +	leal	(VEC_SIZE * 3 + 0)(%edx), %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# endif
> +	VZEROUPPER_RETURN
> +
> +
> +L(ret_vec_x3_len):
> +	addl	$(VEC_SIZE * 1), %edx
> +	tzcntl	%ecx, %ecx
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_vec_x3_len_no_zfill)
> +	/* Fall through (expectation) is copy len < buffer len.  */
> +	VMOVU	%VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +L(ret_vec_x3_len_no_zfill_mov):
> +	movl	%ecx, %edx
> +# ifdef USE_AS_STPCPY
> +	/* clear flags.  */
> +	xorl	%ecx, %ecx
> +# endif
> +	.p2align 4,, 4
> +L(ret_vec_x3_len_no_zfill):
> +	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> +	VMOVU	%VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	setc	%al
> +	addq	%rdx, %rdi
> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +	leal	(VEC_SIZE * 2 + 0)(%rdx), %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# endif
> +	VZEROUPPER_RETURN
> +
> +
> +	.p2align 4,, 8
> +L(ret_vec_x3):
> +	bsfl	%ecx, %ecx
> +	VMOVU	%VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
> +	subl	%ecx, %edx
> +	jl	L(ret_vec_x3_len_no_zfill_mov)
> +	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
> +	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
> +# ifdef USE_AS_STPCPY
> +	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rax
> +# endif
> +	VZEROUPPER_RETURN
> +
> +	.p2align 4,, 8
> +L(more_4x_vec):
> +
> +	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
> +	testl	%ecx, %ecx
> +	jnz	L(zfill_vec3)
> +
> +	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
> +	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
> +	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(zfill_vec4)
> +
> +	movq	%rdx, %rcx
> +	addq	%rsi, %rdx
> +	subq	%rsi, %rdi
> +	subq	$-(VEC_SIZE * 4), %rsi
> +	/* Recheck length before aligning.  */
> +	cmpq	$(VEC_SIZE * 8 - CHAR_SIZE), %rcx
> +	jbe	L(last_4x_vec)
> +
> +	andq	$(VEC_SIZE * -4), %rsi
> +
> +	/* Do first half of loop ahead of time so loop can just start by
> +	   storing.  */
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPMIN	%VMM(4), %VMM(6), %VMM(6)
> +	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %r8d
> +	addq	%rsi, %rdi
> +	testl	%r8d, %r8d
> +	jnz	L(loop_4x_done)
> +
> +	/* Use r9 as end register.  */
> +	leaq	-(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
>  
> -#define USE_AS_STRNCPY
> -#define STRCPY	STRNCPY
> -#include "strcpy-avx2.S"
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +
> +	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +	subq	$(VEC_SIZE * -4), %rsi
> +	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> +
> +	subq	$(VEC_SIZE * -4), %rdi
> +	cmpq	%rsi, %r9
> +	jbe	L(loop_last_4x_vec)
> +
> +	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> +	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> +	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> +	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> +	VPMIN	%VMM(0), %VMM(1), %VMM(4)
> +	VPMIN	%VMM(2), %VMM(3), %VMM(6)
> +	VPMIN	%VMM(4), %VMM(6), %VMM(6)
> +	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
> +
> +	vpmovmskb %VMM(6), %r8d
> +
> +	testl	%r8d, %r8d
> +	jz	L(loop_4x_vec)
> +
> +L(loop_4x_done):
> +	subq	%rsi, %rdx
> +	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(zfill_vec1)
> +
> +	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(zfill_vec2)
> +
> +	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> +	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
> +	vpmovmskb %VMM(6), %ecx
> +	testl	%ecx, %ecx
> +	jnz	L(zfill_vec3)
> +
> +	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> +	movl	%r8d, %ecx
> +
> +	/* Fall through to zero-fill the remaining bytes.  */
> +
> +	.p2align 4,, 4
> +L(zfill_vec4):
> +	addq	$(VEC_SIZE * 2), %rdi
> +	subq	$(VEC_SIZE * 2), %rdx
> +L(zfill_vec2):
> +	shlq	$VEC_SIZE, %rcx
> +L(zfill):
> +	bsfq	%rcx, %rcx
> +	subq	%rcx, %rdx
> +	addq	%rcx, %rdi
> +# ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# endif
> +L(zfill_from_page_cross):
> +	cmpq	$VEC_SIZE, %rdx
> +	jb	L(zfill_less_vec_vzeroupper)
> +
> +L(zfill_more_1x_vec):
> +	VMOVU	%VZERO, CHAR_SIZE(%rdi)
> +	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
> +	cmpq	$(VEC_SIZE * 2), %rdx
> +	jae	L(zfill_more_2x_vec)
> +L(zfill_done0):
> +	VZEROUPPER_RETURN
> +
> +	.p2align 4,, 8
> +L(zfill_vec3):
> +	addq	$(VEC_SIZE * 2), %rdi
> +	subq	$(VEC_SIZE * 2), %rdx
> +	.p2align 4,, 2
> +L(zfill_vec1):
> +	bsfl	%ecx, %ecx
> +	addq	%rcx, %rdi
> +	subq	%rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# endif
> +	/* zfill from vec1/vec3 must set at least 2x VECS.  */
> +
> +	VMOVU	%VZERO, CHAR_SIZE(%rdi)
> +	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
> +	cmpq	$(VEC_SIZE * 2), %rdx
> +	jb	L(zfill_done0)
> +L(zfill_more_2x_vec):
> +	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
> +	VMOVU	%VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
> +	subq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
> +	jbe	L(zfill_done)
> +
> +	addq	%rdi, %rdx
> +	VMOVU	%VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
> +	VMOVU	%VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
> +
> +
> +	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> +	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> +
> +	subq	$-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
> +	cmpq	%rdi, %rdx
> +	jbe	L(zfill_done)
> +
> +	andq	$-(VEC_SIZE), %rdi
> +	.p2align 4,, 12
> +L(zfill_loop_4x_vec):
> +	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
> +	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
> +	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
> +	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpq	%rdi, %rdx
> +	ja	L(zfill_loop_4x_vec)
> +L(zfill_done):
> +	VZEROUPPER_RETURN
> +
> +
> +	.p2align 4,, 8
> +L(copy_1x):
> +	VMOVU	%VMM(0), (%rdi)
> +	testl	%ecx, %ecx
> +	jz	L(ret_32_32)
> +L(zfill_less_vec):
> +	bsfl	%ecx, %ecx
> +L(zfill_less_vec_no_bsf):
> +	subq	%rcx, %rdx
> +	addq	%rcx, %rdi
> +# ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# endif
> +L(zfill_less_vec_vzeroupper):
> +	COND_VZEROUPPER
> +	/* We are taking advantage of the fact that to be here we must
> +	   be writing the null-term at (%rdi, %rcx), so we have a byte
> +	   of leeway for overwriting.  */
> +	cmpl	$16, %edx
> +	jb	L(zfill_less_16)
> +	VMOVU	%VZERO_128, (%rdi)
> +	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
> +	ret
> +# ifdef USE_AS_STPCPY
> +L(ret_32_32):
> +	leaq	CHAR_SIZE(%rdi, %rdx), %rax
> +	VZEROUPPER_RETURN
> +# endif
> +
> +	.p2align 4,, 4
> +L(copy_16_31):
> +	/* Overfill to avoid branches.  */
> +	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
> +	vmovdqu	%xmm0, (%rdi)
> +	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
> +	cmpl	%ecx, %edx
> +	ja	L(zfill_less_vec_no_bsf)
> +# ifndef USE_AS_STPCPY
> +L(ret_32_32):
> +# else
> +#  ifdef USE_AS_WCSCPY
> +	setc	%al
> +	addq	%rdx, %rdi
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# endif
> +	VZEROUPPER_RETURN
> +
> +	.p2align 4,, 4
> +L(copy_8_15):
> +	/* Overfill to avoid branches.  */
> +	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
> +	vmovq	%xmm0, (%rdi)
> +	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_8_15)
> +	subq	%rcx, %rdx
> +	addq	%rcx, %rdi
> +# ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +# endif
> +	.p2align 4,, 8
> +L(zfill_less_16):
> +	xorl	%ecx, %ecx
> +	cmpl	$8, %edx
> +	jb	L(zfill_less_8)
> +	movq	%rcx, (%rdi)
> +	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
> +# ifndef USE_AS_STPCPY
> +L(ret_8_15):
> +# endif
> +	ret
> +
> +
> +	.p2align 4,, 8
> +L(less_1x_vec):
> +	/* Reuse flag from `cmp $VEC_SIZE, %rdx`. The idea is many
> +	   buffer sizes are conventionally aligned.  */
> +	je	L(copy_1x)
> +
> +	tzcntl	%ecx, %ecx
> +	cmpl	$16, %edx
> +	jae	L(copy_16_31)
> +
> +	COND_VZEROUPPER
> +	cmpl	$8, %edx
> +	jae	L(copy_8_15)
> +# ifdef USE_AS_WCSCPY
> +	testl	%ecx, %ecx
> +	jz	L(zfill_less_8_set_ret)
> +
> +	movl	(%rsi, %rdx), %esi
> +	vmovd	%xmm0, (%rdi)
> +	movl	%esi, (%rdi, %rdx)
> +
> +#  ifdef USE_AS_STPCPY
> +	cmpl	%ecx, %edx
> +L(ret_8_15):
> +	setc	%al
> +	addq	%rdx, %rdi
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +#  endif
> +	ret
> +L(zfill_less_8_set_ret):
> +	xorl	%ecx, %ecx
> +#  ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#  endif
> +L(zfill_less_8):
> +	movl	%ecx, (%rdi)
> +	movl	%ecx, (%rdi, %rdx)
> +	ret
> +
> +# else
> +	cmpl	$3, %edx
> +	jb	L(copy_0_3)
> +	/* Overfill to avoid branches.  */
> +	movl	-3(%rsi, %rdx), %esi
> +	vmovd	%xmm0, (%rdi)
> +	movl	%esi, -3(%rdi, %rdx)
> +	cmpl	%ecx, %edx
> +	jbe	L(ret_4_7)
> +	subq	%rcx, %rdx
> +	addq	%rcx, %rdi
> +#  ifdef USE_AS_STPCPY
> +	movq	%rdi, %rax
> +#  endif
> +	xorl	%ecx, %ecx
> +	.p2align 4,, 8
> +L(zfill_less_8):
> +	cmpl	$3, %edx
> +	jb	L(zfill_less_3)
> +	movl	%ecx, (%rdi)
> +	movl	%ecx, -3(%rdi, %rdx)
> +#  ifdef USE_AS_STPCPY
> +	ret
> +#  endif
> +
> +L(ret_4_7):
> +#  ifdef USE_AS_STPCPY
> +L(ret_8_15):
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#  endif
> +	ret
> +
> +	.p2align 4,, 4
> +L(zfill_less_3):
> +	testl	%edx, %edx
> +	jz	L(zfill_1)
> +	movw	%cx, (%rdi)
> +L(zfill_1):
> +	movb	%cl, (%rdi, %rdx)
> +	ret
> +
> +	.p2align 4,, 8
> +L(copy_0_3):
> +	vmovd	%xmm0, %r8d
> +	testl	%edx, %edx
> +	jz	L(copy_1)
> +	movw	%r8w, (%rdi)
> +	cmpl	%ecx, %edx
> +	ja	L(zfill_from_1)
> +	movzbl	(%rsi, %rdx), %r8d
> +#  ifdef USE_AS_STPCPY
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +	movb	%r8b, (%rdi, %rdx)
> +	ret
> +#  endif
> +
> +L(copy_1):
> +#  ifdef USE_AS_STPCPY
> +	movl	%edx, %eax
> +	cmpl	%ecx, %edx
> +	adcq	%rdi, %rax
> +#  endif
> +#  ifdef USE_AS_WCSCPY
> +	vmovd	%xmm0, (%rdi)
> +#  else
> +	movb	%r8b, (%rdi, %rdx)
> +#  endif
> +	ret
> +# endif
> +
> +	.p2align 4,, 2
> +L(zero_len):
> +	movq	%rdi, %rax
> +	ret
> +# ifndef USE_AS_WCSCPY
> +	.p2align 4,, 8
> +L(zfill_from_1):
> +#  ifdef USE_AS_STPCPY
> +	leaq	(%rdi, %rcx), %rax
> +#  endif
> +	movw	$0, -1(%rdi, %rdx)
> +	ret
> +# endif
> +
> +	.p2align 4,, 4
> +	.p2align 6,, 8
> +L(page_cross):
> +	movq	%rsi, %rax
> +	andq	$(VEC_SIZE * -1), %rax
> +
> +	VPCMPEQ	(%rax), %VZERO, %VMM(6)
> +
> +	vpmovmskb %VMM(6), %ecx
> +	shrxl	%esi, %ecx, %ecx
> +
> +	subl	%esi, %eax
> +	andl	$(VEC_SIZE - 1), %eax
> +	cmpq	%rax, %rdx
> +	jb	L(page_cross_small)
> +	/* Optimizing more aggressively for space as this is very cold
> +	   code. This saves 2x cache lines.  */
> +
> +	/* If rcx is non-zero then continue.  */
> +	shl	$CHAR_SIZE, %ecx
> +	jz	L(page_cross_continue)
> +	bsf	%ecx, %ecx
> +
> +	subq	%rcx, %rdx
> +# ifdef USE_AS_STPCPY
> +	leaq	-CHAR_SIZE(%rdi, %rcx), %rax
> +# else
> +	movq	%rdi, %rax
> +# endif
> +
> +	rep	movsb
> +# ifdef USE_AS_WCSCPY
> +	movl	$0, (%rdi)
> +# else
> +	movb	$0, (%rdi)
> +# endif
> +	jmp	L(zfill_from_page_cross)
> +
> +L(page_cross_small):
> +	tzcntl	%ecx, %ecx
> +	xorl	%eax, %eax
> +	cmpl	%ecx, %edx
> +	jbe	L(page_cross_copy_only)
> +
> +	/* Do a zfill of the tail before copying.  */
> +	movq	%rdi, %r9
> +	movl	%ecx, %r8d
> +
> +	subl	%ecx, %edx
> +	leaq	CHAR_SIZE(%rdi, %rcx), %rdi
> +	movl	%edx, %ecx
> +	rep	stosb
> +	movq	%r9, %rdi
> +	movl	%r8d, %edx
> +L(page_cross_copy_only):
> +	leal	CHAR_SIZE(%rdx), %ecx
> +# ifdef USE_AS_STPCPY
> +#  ifdef USE_AS_WCSCPY
> +	setc	%al
> +	addq	%rdi, %rdx
> +	leaq	(%rdx, %rax, CHAR_SIZE), %rax
> +#  else
> +	movl	%edx, %eax
> +	adcq	%rdi, %rax
> +#  endif
> +# else
> +	movq	%rdi, %rax
> +# endif
> +	rep	movsb
> +	ret
> +
> +
> +L(best_effort_strncpy):
> +	movq	%rdx, %rcx
> +	xorl	%eax, %eax
> +	movq	%rdi, %r8
> +	/* The length is >= 2^63. We very much expect to segfault at
> +	   rep stos. If that doesn't happen then just strcpy to finish.
> +	 */
> +# ifdef USE_AS_WCSCPY
> +	rep	stosl
> +# else
> +	rep	stosb
> +# endif
> +	movq	%r8, %rdi
> +	jmp	OVERFLOW_STRCPY
> +END(STRNCPY)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> index dca1089060..275af7560a 100644
> --- a/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> +++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> @@ -27,7 +27,8 @@
>  #define VEC_SIZE			32
>  #include "x86-vec-macros.h"
>  
> -#define USE_WITH_AVX		1
> +#define USE_WITH_AVX2		1
> +
>  #define SECTION(p)			p##.avx
>  
>  /* 4-byte mov instructions with AVX2.  */
> -- 
> 2.34.1
> 
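For readers following the new copy paths in the quoted assembly: the byte variants all locate the null terminator the same way, namely compare a vector against a zeroed register (VPCMPEQ), move the per-byte result to a GPR with vpmovmskb, then tzcnt/bsf the mask.  Below is a rough stand-alone C sketch of that idiom using AVX2 intrinsics; the helper name is mine for illustration, it is not part of the patch, and it ignores the page-cross handling the assembly performs before loading.

  #include <immintrin.h>
  #include <stddef.h>

  /* Index of the first null byte in the 32 bytes at S, or 32 if none.
     The caller must ensure the 32-byte load cannot fault; the assembly's
     L(page_cross) path exists precisely for that case.  */
  static inline size_t
  first_null_in_vec (const char *s)
  {
    __m256i v = _mm256_loadu_si256 ((const __m256i *) s);
    __m256i eq = _mm256_cmpeq_epi8 (v, _mm256_setzero_si256 ());
    unsigned int mask = (unsigned int) _mm256_movemask_epi8 (eq);
    return mask == 0 ? 32 : (size_t) __builtin_ctz (mask);
  }

The 4x loops additionally fold four such checks into one by combining the vectors with vpminub before the single compare, so the hot path only pays one test/branch per VEC_SIZE * 4 bytes.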

LGTM.

Thanks.

H.J.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v5 3/4] x86: Add evex optimized functions for the wchar_t strcpy family
  2022-11-09  1:38     ` [PATCH v5 3/4] x86: Add evex optimized functions for the wchar_t strcpy family Noah Goldstein
@ 2022-11-09  3:01       ` H.J. Lu
  0 siblings, 0 replies; 42+ messages in thread
From: H.J. Lu @ 2022-11-09  3:01 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Tue, Nov 08, 2022 at 05:38:40PM -0800, Noah Goldstein wrote:
> Implemented:
>     wcscat-evex  (+ 905 bytes)
>     wcscpy-evex  (+ 674 bytes)
>     wcpcpy-evex  (+ 709 bytes)
>     wcsncpy-evex (+1358 bytes)
>     wcpncpy-evex (+1467 bytes)
>     wcsncat-evex (+1213 bytes)
> 
> Performance Changes:
>     Times are from N = 10 runs of the benchmark suite and are reported
>     as geometric mean of all ratios of New Implementation / Best Old
>     Implementation. Best Old Implementation was taken as the existing
>     implementation at the highest ISA level.
> 
>     wcscat-evex     -> 0.991
>     wcscpy-evex     -> 0.587
>     wcpcpy-evex     -> 0.695
>     wcsncpy-evex    -> 0.719
>     wcpncpy-evex    -> 0.694
>     wcsncat-evex    -> 0.979
> 
> Code Size Changes:
>     This change increases the size of libc.so by ~6.3kb. For
>     reference the patch optimizing the normal strcpy family functions
>     decreases libc.so by ~5.7kb.
> 
> Full check passes on x86-64 and build succeeds for all ISA levels w/
> and w/o multiarch.
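For reference, the ratios listed above are summarized as a geometric mean, i.e. values below 1.0 indicate the new implementation took less time than the old best.  A minimal C sketch of that reduction, computed in log space for numerical robustness; the helper name and calling convention are illustrative and not taken from the benchmark scripts:

  #include <math.h>
  #include <stddef.h>

  /* Geometric mean of N per-benchmark ratios (new time / best old time).  */
  static double
  geomean (const double *ratios, size_t n)
  {
    double log_sum = 0.0;
    for (size_t i = 0; i < n; i++)
      log_sum += log (ratios[i]);
    return exp (log_sum / (double) n);
  }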
> ---
>  sysdeps/x86_64/Makefile                    |  5 ++
>  sysdeps/x86_64/multiarch/Makefile          | 14 ++++-
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 63 ++++++++++++++++++++--
>  sysdeps/x86_64/multiarch/ifunc-wcs.h       | 48 +++++++++++++++++
>  sysdeps/x86_64/multiarch/wcpcpy-evex.S     |  8 +++
>  sysdeps/x86_64/multiarch/wcpcpy-generic.c  | 27 ++++++++++
>  sysdeps/x86_64/multiarch/wcpcpy.c          | 37 +++++++++++++
>  sysdeps/x86_64/multiarch/wcpncpy-evex.S    |  8 +++
>  sysdeps/x86_64/multiarch/wcpncpy-generic.c | 27 ++++++++++
>  sysdeps/x86_64/multiarch/wcpncpy.c         | 37 +++++++++++++
>  sysdeps/x86_64/multiarch/wcscat-evex.S     |  9 ++++
>  sysdeps/x86_64/multiarch/wcscat-generic.c  | 27 ++++++++++
>  sysdeps/x86_64/multiarch/wcscat.c          | 37 +++++++++++++
>  sysdeps/x86_64/multiarch/wcscpy-evex.S     |  7 +++
>  sysdeps/x86_64/multiarch/wcscpy-generic.c  |  3 +-
>  sysdeps/x86_64/multiarch/wcscpy.c          | 11 ++++
>  sysdeps/x86_64/multiarch/wcsncat-evex.S    |  9 ++++
>  sysdeps/x86_64/multiarch/wcsncat-generic.c | 27 ++++++++++
>  sysdeps/x86_64/multiarch/wcsncat.c         | 34 ++++++++++++
>  sysdeps/x86_64/multiarch/wcsncpy-evex.S    |  7 +++
>  sysdeps/x86_64/multiarch/wcsncpy-generic.c | 27 ++++++++++
>  sysdeps/x86_64/multiarch/wcsncpy.c         | 37 +++++++++++++
>  sysdeps/x86_64/wcpcpy-generic.c            | 31 +++++++++++
>  sysdeps/x86_64/wcpcpy.S                    | 40 ++++++++++++++
>  sysdeps/x86_64/wcpncpy-generic.c           | 31 +++++++++++
>  sysdeps/x86_64/wcpncpy.S                   | 40 ++++++++++++++
>  sysdeps/x86_64/wcscat-generic.c            | 31 +++++++++++
>  sysdeps/x86_64/wcscat.S                    | 40 ++++++++++++++
>  sysdeps/x86_64/wcscpy.S                    |  3 +-
>  sysdeps/x86_64/wcsncat-generic.c           | 31 +++++++++++
>  sysdeps/x86_64/wcsncat.S                   | 38 +++++++++++++
>  sysdeps/x86_64/wcsncpy-generic.c           | 31 +++++++++++
>  sysdeps/x86_64/wcsncpy.S                   | 40 ++++++++++++++
>  33 files changed, 858 insertions(+), 7 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcs.h
>  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcscat-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcscat-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcscat.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcscpy-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncat-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncat-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncat.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-evex.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-generic.c
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy.c
>  create mode 100644 sysdeps/x86_64/wcpcpy-generic.c
>  create mode 100644 sysdeps/x86_64/wcpcpy.S
>  create mode 100644 sysdeps/x86_64/wcpncpy-generic.c
>  create mode 100644 sysdeps/x86_64/wcpncpy.S
>  create mode 100644 sysdeps/x86_64/wcscat-generic.c
>  create mode 100644 sysdeps/x86_64/wcscat.S
>  create mode 100644 sysdeps/x86_64/wcsncat-generic.c
>  create mode 100644 sysdeps/x86_64/wcsncat.S
>  create mode 100644 sysdeps/x86_64/wcsncpy-generic.c
>  create mode 100644 sysdeps/x86_64/wcsncpy.S
> 
> diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
> index 3627c5659f..688eb2d7c4 100644
> --- a/sysdeps/x86_64/Makefile
> +++ b/sysdeps/x86_64/Makefile
> @@ -188,8 +188,13 @@ endif
>  ifeq ($(subdir),wcsmbs)
>  
>  sysdep_routines += \
> +  wcpcpy-generic \
> +  wcpncpy-generic \
> +  wcscat-generic \
>    wcscpy-generic \
> +  wcsncat-generic \
>    wcsncmp-generic \
> +  wcsncpy-generic \
>    wcsnlen-generic \
>  # sysdep_routines
>  
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 066bfa48d9..d6e01940c3 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -131,6 +131,12 @@ endif
>  
>  ifeq ($(subdir),wcsmbs)
>  sysdep_routines += \
> +  wcpcpy-evex \
> +  wcpcpy-generic \
> +  wcpncpy-evex \
> +  wcpncpy-generic \
> +  wcscat-evex \
> +  wcscat-generic \
>    wcschr-avx2 \
>    wcschr-avx2-rtm \
>    wcschr-evex \
> @@ -140,6 +146,8 @@ sysdep_routines += \
>    wcscmp-avx2-rtm \
>    wcscmp-evex \
>    wcscmp-sse2 \
> +  wcscpy-evex \
> +  wcscpy-generic \
>    wcscpy-ssse3 \
>    wcslen-avx2 \
>    wcslen-avx2-rtm \
> @@ -147,9 +155,13 @@ sysdep_routines += \
>    wcslen-evex512 \
>    wcslen-sse2 \
>    wcslen-sse4_1 \
> +  wcsncat-evex \
> +  wcsncat-generic \
>    wcsncmp-avx2 \
>    wcsncmp-avx2-rtm \
>    wcsncmp-evex \
> +  wcsncpy-evex \
> +  wcsncpy-generic \
>    wcsnlen-avx2 \
>    wcsnlen-avx2-rtm \
>    wcsnlen-evex \
> @@ -163,8 +175,8 @@ sysdep_routines += \
>    wmemchr-avx2 \
>    wmemchr-avx2-rtm \
>    wmemchr-evex \
> -  wmemchr-evex512 \
>    wmemchr-evex-rtm \
> +  wmemchr-evex512 \
>    wmemchr-sse2 \
>    wmemcmp-avx2-movbe \
>    wmemcmp-avx2-movbe-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 7cebee7ec7..c908d6c158 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -901,16 +901,73 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    /* Support sysdeps/x86_64/multiarch/wcscpy.c.  */
>    IFUNC_IMPL (i, name, wcscpy,
> -	      /* ISA V4 wrapper for SSSE3 implementation because
> -	         the SSSE3 implementation is also used at ISA
> -	         level 3/4.  */
>  	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcscpy,
> +				     (CPU_FEATURE_USABLE (AVX512VL)
> +				      && CPU_FEATURE_USABLE (AVX512BW)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcscpy_evex)
> +	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
>  				     CPU_FEATURE_USABLE (SSSE3),
>  				     __wcscpy_ssse3)
>  	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
>  				     1,
>  				     __wcscpy_generic))
>  
> +  /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
> +  IFUNC_IMPL (i, name, wcsncpy,
> +	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
> +				     (CPU_FEATURE_USABLE (AVX512VL)
> +				      && CPU_FEATURE_USABLE (AVX512BW)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcsncpy_evex)
> +	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
> +				     1,
> +				     __wcsncpy_generic))
> +
> +  /* Support sysdeps/x86_64/multiarch/wcpcpy.c.  */
> +  IFUNC_IMPL (i, name, wcpcpy,
> +	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcpcpy,
> +				     (CPU_FEATURE_USABLE (AVX512VL)
> +				      && CPU_FEATURE_USABLE (AVX512BW)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcpcpy_evex)
> +	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
> +				     1,
> +				     __wcpcpy_generic))
> +
> +  /* Support sysdeps/x86_64/multiarch/wcpncpy.c.  */
> +  IFUNC_IMPL (i, name, wcpncpy,
> +	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcpncpy,
> +				     (CPU_FEATURE_USABLE (AVX512VL)
> +				      && CPU_FEATURE_USABLE (AVX512BW)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcpncpy_evex)
> +	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
> +				     1,
> +				     __wcpncpy_generic))
> +
> +  /* Support sysdeps/x86_64/multiarch/wcscat.c.  */
> +  IFUNC_IMPL (i, name, wcscat,
> +	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcscat,
> +				     (CPU_FEATURE_USABLE (AVX512VL)
> +				      && CPU_FEATURE_USABLE (AVX512BW)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcscat_evex)
> +	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
> +				     1,
> +				     __wcscat_generic))
> +
> +  /* Support sysdeps/x86_64/multiarch/wcsncat.c.  */
> +  IFUNC_IMPL (i, name, wcsncat,
> +	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncat,
> +				     (CPU_FEATURE_USABLE (AVX512VL)
> +				      && CPU_FEATURE_USABLE (AVX512BW)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcsncat_evex)
> +	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
> +				     1,
> +				     __wcsncat_generic))
> +
>    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
>    IFUNC_IMPL (i, name, wcslen,
>  	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcslen,
> diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
> new file mode 100644
> index 0000000000..1d2a63458b
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
> @@ -0,0 +1,48 @@
> +/* Common definition for ifunc selections optimized wide-character
> +   string copy functions.
> +
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <init-arch.h>
> +
> +#ifndef GENERIC
> +# define GENERIC generic
> +#endif
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features *cpu_features = __get_cpu_features ();
> +
> +  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> +      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> +      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> +				      AVX_Fast_Unaligned_Load, ))
> +    {
> +      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> +	return OPTIMIZE (evex);
> +    }
> +
> +  return OPTIMIZE (GENERIC);
> +}
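
The selector above is shared by all of the new wide-character
copy/concatenate ifuncs added below.  As a quick reminder of the ISO C
semantics every selected implementation has to preserve, a small
self-contained test (ordinary <wchar.h> usage, not part of the patch):

#define _GNU_SOURCE 1
#include <assert.h>
#include <wchar.h>

int
main (void)
{
  wchar_t dst[16] = L"";

  /* wcpcpy returns a pointer to the terminating null it wrote,
     unlike wcscpy which returns dst.  */
  wchar_t *end = wcpcpy (dst, L"abc");
  assert (end == dst + 3 && *end == L'\0');

  /* wcsncat appends at most n wide characters and always writes a
     terminating null, so dst needs room for wcslen (dst) + n + 1.  */
  wcsncat (dst, L"defgh", 2);
  assert (wcscmp (dst, L"abcde") == 0);

  return 0;
}
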
> diff --git a/sysdeps/x86_64/multiarch/wcpcpy-evex.S b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
> new file mode 100644
> index 0000000000..ac6429cc07
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpcpy-evex.S
> @@ -0,0 +1,8 @@
> +#ifndef WCPCPY
> +# define WCPCPY	__wcpcpy_evex
> +#endif
> +
> +#define USE_AS_STPCPY
> +#define USE_AS_WCSCPY
> +#define STRCPY	WCPCPY
> +#include "strcpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
> new file mode 100644
> index 0000000000..6039196a3e
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
> @@ -0,0 +1,27 @@
> +/* wcpcpy.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* We always need to build this implementation as strspn-sse4 needs to
> +   be able to fallback to it.  */
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (3)
> +
> +# define WCPCPY __wcpcpy_generic
> +# include <wcsmbs/wcpcpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcpcpy.c b/sysdeps/x86_64/multiarch/wcpcpy.c
> new file mode 100644
> index 0000000000..8f96ddbc99
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpcpy.c
> @@ -0,0 +1,37 @@
> +/* Multiple versions of wcpcpy.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __wcpcpy __redirect_wcpcpy
> +# include <wchar.h>
> +# undef __wcpcpy
> +
> +# define SYMBOL_NAME wcpcpy
> +# include <init-arch.h>
> +
> +# include "ifunc-wcs.h"
> +
> +libc_ifunc_redirected (__redirect_wcpcpy, __wcpcpy, IFUNC_SELECTOR ());
> +weak_alias (__wcpcpy, wcpcpy)
> +# ifdef SHARED
> +__hidden_ver1 (__wcpcpy, __GI___wcpcpy, __redirect_wcpcpy)
> +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpcpy);
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcpncpy-evex.S b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
> new file mode 100644
> index 0000000000..62ddb694fe
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpncpy-evex.S
> @@ -0,0 +1,8 @@
> +#ifndef WCPNCPY
> +# define WCPNCPY	__wcpncpy_evex
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STPCPY
> +#define STRNCPY	WCPNCPY
> +#include "strncpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
> new file mode 100644
> index 0000000000..de8d34320e
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
> @@ -0,0 +1,27 @@
> +/* wcpncpy.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* We always need to build this implementation as strspn-sse4 needs to
> +   be able to fallback to it.  */
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (3)
> +
> +# define WCPNCPY __wcpncpy_generic
> +# include <wcsmbs/wcpncpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcpncpy.c b/sysdeps/x86_64/multiarch/wcpncpy.c
> new file mode 100644
> index 0000000000..ed8f307e07
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpncpy.c
> @@ -0,0 +1,37 @@
> +/* Multiple versions of wcpncpy.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __wcpncpy __redirect_wcpncpy
> +# include <wchar.h>
> +# undef __wcpncpy
> +
> +# define SYMBOL_NAME wcpncpy
> +# include <init-arch.h>
> +
> +# include "ifunc-wcs.h"
> +
> +libc_ifunc_redirected (__redirect_wcpncpy, __wcpncpy, IFUNC_SELECTOR ());
> +weak_alias (__wcpncpy, wcpncpy)
> +# ifdef SHARED
> +__hidden_ver1 (__wcpncpy, __GI___wcpncpy, __redirect_wcpncpy)
> +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcpncpy);
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcscat-evex.S b/sysdeps/x86_64/multiarch/wcscat-evex.S
> new file mode 100644
> index 0000000000..1d017e4899
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscat-evex.S
> @@ -0,0 +1,9 @@
> +#ifndef WCSCAT
> +# define WCSCAT	__wcscat_evex
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STRCAT
> +
> +#define STRCPY	WCSCAT
> +#include "strcpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
> new file mode 100644
> index 0000000000..d86b4d5c00
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
> @@ -0,0 +1,27 @@
> +/* wcscat.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* We always need to build this implementation as strspn-sse4 needs to
> +   be able to fallback to it.  */
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (3)
> +
> +# define WCSCAT __wcscat_generic
> +# include <wcsmbs/wcscat.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcscat.c b/sysdeps/x86_64/multiarch/wcscat.c
> new file mode 100644
> index 0000000000..3277c44561
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscat.c
> @@ -0,0 +1,37 @@
> +/* Multiple versions of wcscat.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __wcscat __redirect_wcscat
> +# include <wchar.h>
> +# undef __wcscat
> +
> +# define SYMBOL_NAME wcscat
> +# include <init-arch.h>
> +
> +# include "ifunc-wcs.h"
> +
> +libc_ifunc_redirected (__redirect_wcscat, __wcscat, IFUNC_SELECTOR ());
> +weak_alias (__wcscat, wcscat)
> +# ifdef SHARED
> +__hidden_ver1 (__wcscat, __GI___wcscat, __redirect_wcscat)
> +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcscat);
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcscpy-evex.S b/sysdeps/x86_64/multiarch/wcscpy-evex.S
> new file mode 100644
> index 0000000000..1069a8e224
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscpy-evex.S
> @@ -0,0 +1,7 @@
> +#ifndef WCSCPY
> +# define WCSCPY	__wcscpy_evex
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define STRCPY	WCSCPY
> +#include "strcpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
> index 93d314aaad..4a1fffae4b 100644
> --- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
> @@ -18,8 +18,7 @@
>  
>  
>  #include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (1)
> +#if ISA_SHOULD_BUILD (3)
>  
>  # define WCSCPY  __wcscpy_generic
>  # include <wcsmbs/wcscpy.c>
> diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
> index 92c917b6b4..9ad77da8ac 100644
> --- a/sysdeps/x86_64/multiarch/wcscpy.c
> +++ b/sysdeps/x86_64/multiarch/wcscpy.c
> @@ -26,6 +26,8 @@
>  # define SYMBOL_NAME wcscpy
>  # include <init-arch.h>
>  
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> +
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
>  
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
> @@ -35,6 +37,15 @@ IFUNC_SELECTOR (void)
>  {
>    const struct cpu_features* cpu_features = __get_cpu_features ();
>  
> +  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> +      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> +      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load, ))
> +    {
> +      if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> +	return OPTIMIZE (evex);
> +    }
> +
>    if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
>      return OPTIMIZE (ssse3);
>  
> diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S
> new file mode 100644
> index 0000000000..392215950a
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S
> @@ -0,0 +1,9 @@
> +#ifndef WCSCAT
> +# define WCSCAT	__wcsncat_evex
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STRCAT
> +
> +#define STRNCAT	WCSCAT
> +#include "strncat-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
> new file mode 100644
> index 0000000000..4b55cb40bc
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
> @@ -0,0 +1,27 @@
> +/* wcsncat.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* We always need to build this implementation as strspn-sse4 needs to
> +   be able to fallback to it.  */
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (3)
> +
> +# define WCSNCAT __wcsncat_generic
> +# include <wcsmbs/wcsncat.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcsncat.c b/sysdeps/x86_64/multiarch/wcsncat.c
> new file mode 100644
> index 0000000000..49c46aef08
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncat.c
> @@ -0,0 +1,34 @@
> +/* Multiple versions of wcsncat.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define wcsncat __redirect_wcsncat
> +# include <wchar.h>
> +# undef wcsncat
> +
> +# define SYMBOL_NAME wcsncat
> +# include "ifunc-wcs.h"
> +
> +libc_ifunc_redirected (__redirect_wcsncat, wcsncat, IFUNC_SELECTOR ());
> +# ifdef SHARED
> +__hidden_ver1 (wcsncat, __GI_wcsncat, __redirect_wcsncat)
> +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncat);
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcsncpy-evex.S b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
> new file mode 100644
> index 0000000000..2debb8fd6b
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncpy-evex.S
> @@ -0,0 +1,7 @@
> +#ifndef WCSNCPY
> +# define WCSNCPY	__wcsncpy_evex
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define STRNCPY	WCSNCPY
> +#include "strncpy-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
> new file mode 100644
> index 0000000000..d0e8a86605
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
> @@ -0,0 +1,27 @@
> +/* wcsncpy.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* We always need to build this implementation as strspn-sse4 needs to
> +   be able to fallback to it.  */
> +#include <isa-level.h>
> +#if ISA_SHOULD_BUILD (3)
> +
> +# define WCSNCPY __wcsncpy_generic
> +# include <wcsmbs/wcsncpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcsncpy.c b/sysdeps/x86_64/multiarch/wcsncpy.c
> new file mode 100644
> index 0000000000..5b89dd4d27
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncpy.c
> @@ -0,0 +1,37 @@
> +/* Multiple versions of wcsncpy.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __wcsncpy __redirect_wcsncpy
> +# include <wchar.h>
> +# undef __wcsncpy
> +
> +# define SYMBOL_NAME wcsncpy
> +# include <init-arch.h>
> +
> +# include "ifunc-wcs.h"
> +
> +libc_ifunc_redirected (__redirect_wcsncpy, __wcsncpy, IFUNC_SELECTOR ());
> +weak_alias (__wcsncpy, wcsncpy)
> +# ifdef SHARED
> +__hidden_ver1 (__wcsncpy, __GI___wcsncpy, __redirect_wcsncpy)
> +  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcsncpy);
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
> new file mode 100644
> index 0000000000..3ddc98872f
> --- /dev/null
> +++ b/sysdeps/x86_64/wcpcpy-generic.c
> @@ -0,0 +1,31 @@
> +/* ISA level static dispatch for wcpcpy .c files.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcpcpy non-multiarch build is split into two files,
> +   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcpcpy-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL <= 3
> +
> +# include <wcsmbs/wcpcpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
> new file mode 100644
> index 0000000000..4e4fca71eb
> --- /dev/null
> +++ b/sysdeps/x86_64/wcpcpy.S
> @@ -0,0 +1,40 @@
> +/* ISA level static dispatch for wcpcpy .S files.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcpcpy non-multiarch build is split into two files,
> +   wcpcpy-generic.c and wcpcpy.S. The wcpcpy-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcpcpy-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL >= 4
> +
> +# define WCPCPY	__wcpcpy
> +
> +# define DEFAULT_IMPL_V4	"multiarch/wcpcpy-evex.S"
> +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> +   should never be used from here.  */
> +# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> +
> +# include "isa-default-impl.h"
> +
> +weak_alias (__wcpcpy, wcpcpy)
> +libc_hidden_def (__wcpcpy)
> +#endif
> diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
> new file mode 100644
> index 0000000000..0c76e5614c
> --- /dev/null
> +++ b/sysdeps/x86_64/wcpncpy-generic.c
> @@ -0,0 +1,31 @@
> +/* ISA level static dispatch for wcpncpy .c files.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcpncpy non-multiarch build is split into two files,
> +   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcpncpy-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL <= 3
> +
> +# include <wcsmbs/wcpncpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
> new file mode 100644
> index 0000000000..b4e531473e
> --- /dev/null
> +++ b/sysdeps/x86_64/wcpncpy.S
> @@ -0,0 +1,40 @@
> +/* ISA level static dispatch for wcpncpy .S files.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcpncpy non-multiarch build is split into two files,
> +   wcpncpy-generic.c and wcpncpy.S. The wcpncpy-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcpncpy-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL >= 4
> +
> +# define WCPNCPY	__wcpncpy
> +
> +# define DEFAULT_IMPL_V4	"multiarch/wcpncpy-evex.S"
> +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> +   should never be used from here.  */
> +# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> +
> +# include "isa-default-impl.h"
> +
> +weak_alias (__wcpncpy, wcpncpy)
> +libc_hidden_def (__wcpncpy)
> +#endif
> diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
> new file mode 100644
> index 0000000000..512d0e4d43
> --- /dev/null
> +++ b/sysdeps/x86_64/wcscat-generic.c
> @@ -0,0 +1,31 @@
> +/* ISA level static dispatch for wcscat .c files.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcscat non-multiarch build is split into two files,
> +   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcscat-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL <= 3
> +
> +# include <wcsmbs/wcscat.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
> new file mode 100644
> index 0000000000..ee8360b6e8
> --- /dev/null
> +++ b/sysdeps/x86_64/wcscat.S
> @@ -0,0 +1,40 @@
> +/* ISA level static dispatch for wcscat .S files.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcscat non-multiarch build is split into two files,
> +   wcscat-generic.c and wcscat.S. The wcscat-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcscat-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL >= 4
> +
> +# define WCSCAT	__wcscat
> +
> +# define DEFAULT_IMPL_V4	"multiarch/wcscat-evex.S"
> +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> +   should never be used from here.  */
> +# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> +
> +# include "isa-default-impl.h"
> +
> +weak_alias (__wcscat, wcscat)
> +libc_hidden_def (__wcscat)
> +#endif
> diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
> index 11d0bb4bab..e403579961 100644
> --- a/sysdeps/x86_64/wcscpy.S
> +++ b/sysdeps/x86_64/wcscpy.S
> @@ -1,4 +1,4 @@
> -/* wcscpy dispatch for RTLD and non-multiarch .c files
> +/* ISA level static dispatch for wcscpy .S files.
>     Copyright (C) 2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>  
> @@ -28,6 +28,7 @@
>  
>  # define WCSCPY	__wcscpy
>  
> +# define DEFAULT_IMPL_V4	"multiarch/wcscpy-evex.S"
>  # define DEFAULT_IMPL_V2	"multiarch/wcscpy-ssse3.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
> diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
> new file mode 100644
> index 0000000000..86e20d9028
> --- /dev/null
> +++ b/sysdeps/x86_64/wcsncat-generic.c
> @@ -0,0 +1,31 @@
> +/* ISA level static dispatch for wcsncat .c files.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcsncat non-multiarch build is split into two files,
> +   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcsncat-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL <= 3
> +
> +# include <wcsmbs/wcsncat.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
> new file mode 100644
> index 0000000000..090055a1b8
> --- /dev/null
> +++ b/sysdeps/x86_64/wcsncat.S
> @@ -0,0 +1,38 @@
> +/* ISA level static dispatch for wcsncat .S files.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcsncat non-multiarch build is split into two files,
> +   wcsncat-generic.c and wcsncat.S. The wcsncat-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcsncat-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL >= 4
> +
> +# define WCSNCAT	wcsncat
> +
> +# define DEFAULT_IMPL_V4	"multiarch/wcsncat-evex.S"
> +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> +   should never be used from here.  */
> +# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> +
> +# include "isa-default-impl.h"
> +
> +#endif
> diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
> new file mode 100644
> index 0000000000..0f0ee65b65
> --- /dev/null
> +++ b/sysdeps/x86_64/wcsncpy-generic.c
> @@ -0,0 +1,31 @@
> +/* ISA level static dispatch for wcsncpy .c files.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcsncpy non-multiarch build is split into two files,
> +   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcsncpy-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL <= 3
> +
> +# include <wcsmbs/wcsncpy.c>
> +
> +#endif
> diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
> new file mode 100644
> index 0000000000..32eaf1163b
> --- /dev/null
> +++ b/sysdeps/x86_64/wcsncpy.S
> @@ -0,0 +1,40 @@
> +/* ISA level static dispatch for wcsncpy .S files.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* wcsncpy non-multiarch build is split into two files,
> +   wcsncpy-generic.c and wcsncpy.S. The wcsncpy-generic.c build is
> +   for ISA level <= 1 and just uses multiarch/wcsncpy-generic.c.
> +   This must be split into two files because we cannot include C
> +   code from assembly or vice versa.  */
> +
> +#include <isa-level.h>
> +
> +#if MINIMUM_X86_ISA_LEVEL >= 4
> +
> +# define WCSNCPY	__wcsncpy
> +
> +# define DEFAULT_IMPL_V4	"multiarch/wcsncpy-evex.S"
> +/* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
> +   should never be used from here.  */
> +# define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> +
> +# include "isa-default-impl.h"
> +
> +weak_alias (__wcsncpy, wcsncpy)
> +libc_hidden_def (__wcsncpy)
> +#endif
> -- 
> 2.34.1
> 

LGTM.

Thanks.

H.J.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v5 4/4] x86: Add avx2 optimized functions for the wchar_t strcpy family
  2022-11-09  1:38     ` [PATCH v5 4/4] x86: Add avx2 " Noah Goldstein
@ 2022-11-09  3:01       ` H.J. Lu
  0 siblings, 0 replies; 42+ messages in thread
From: H.J. Lu @ 2022-11-09  3:01 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Tue, Nov 08, 2022 at 05:38:41PM -0800, Noah Goldstein wrote:
> Implemented:
>     wcscat-avx2  (+ 744 bytes)
>     wcscpy-avx2  (+ 539 bytes)
>     wcpcpy-avx2  (+ 577 bytes)
>     wcsncpy-avx2 (+1108 bytes)
>     wcpncpy-avx2 (+1214 bytes)
>     wcsncat-avx2 (+1085 bytes)
> 
> Performance Changes:
>     Times are from N = 10 runs of the benchmark suite and are reported
>     as geometric mean of all ratios of New Implementation / Best Old
>     Implementation. The Best Old Implementation was taken to be the
>     existing implementation at the highest supported ISA level.
> 
>     wcscat-avx2     -> 0.975
>     wcscpy-avx2     -> 0.591
>     wcpcpy-avx2     -> 0.698
>     wcsncpy-avx2    -> 0.730
>     wcpncpy-avx2    -> 0.711
>     wcsncat-avx2    -> 0.954
> 
> Code Size Changes:
>     This change increases the size of libc.so by ~5.5kb. For
>     reference, the patch optimizing the normal strcpy family functions
>     decreases libc.so by ~5.2kb.
> 
> Full check passes on x86-64 and build succeeds for all ISA levels w/
> and w/o multiarch.
> ---
>  sysdeps/x86_64/multiarch/Makefile          |  6 +++++
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 ++++++++++++++++++++--
>  sysdeps/x86_64/multiarch/ifunc-wcs.h       |  7 ++++++
>  sysdeps/x86_64/multiarch/wcpcpy-avx2.S     |  8 +++++++
>  sysdeps/x86_64/multiarch/wcpcpy-generic.c  |  2 +-
>  sysdeps/x86_64/multiarch/wcpncpy-avx2.S    |  8 +++++++
>  sysdeps/x86_64/multiarch/wcpncpy-generic.c |  2 +-
>  sysdeps/x86_64/multiarch/wcscat-avx2.S     | 10 ++++++++
>  sysdeps/x86_64/multiarch/wcscat-generic.c  |  2 +-
>  sysdeps/x86_64/multiarch/wcscpy-avx2.S     |  7 ++++++
>  sysdeps/x86_64/multiarch/wcscpy-generic.c  |  2 +-
>  sysdeps/x86_64/multiarch/wcscpy.c          |  5 ++++
>  sysdeps/x86_64/multiarch/wcsncat-avx2.S    |  9 +++++++
>  sysdeps/x86_64/multiarch/wcsncat-generic.c |  2 +-
>  sysdeps/x86_64/multiarch/wcsncpy-avx2.S    |  7 ++++++
>  sysdeps/x86_64/multiarch/wcsncpy-generic.c |  2 +-
>  sysdeps/x86_64/wcpcpy-generic.c            |  2 +-
>  sysdeps/x86_64/wcpcpy.S                    |  3 ++-
>  sysdeps/x86_64/wcpncpy-generic.c           |  2 +-
>  sysdeps/x86_64/wcpncpy.S                   |  3 ++-
>  sysdeps/x86_64/wcscat-generic.c            |  2 +-
>  sysdeps/x86_64/wcscat.S                    |  3 ++-
>  sysdeps/x86_64/wcscpy.S                    |  1 +
>  sysdeps/x86_64/wcsncat-generic.c           |  2 +-
>  sysdeps/x86_64/wcsncat.S                   |  3 ++-
>  sysdeps/x86_64/wcsncpy-generic.c           |  2 +-
>  sysdeps/x86_64/wcsncpy.S                   |  3 ++-
>  27 files changed, 115 insertions(+), 18 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2.S
> 
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index d6e01940c3..e1e894c963 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -131,10 +131,13 @@ endif
>  
>  ifeq ($(subdir),wcsmbs)
>  sysdep_routines += \
> +  wcpcpy-avx2 \
>    wcpcpy-evex \
>    wcpcpy-generic \
> +  wcpncpy-avx2 \
>    wcpncpy-evex \
>    wcpncpy-generic \
> +  wcscat-avx2 \
>    wcscat-evex \
>    wcscat-generic \
>    wcschr-avx2 \
> @@ -146,6 +149,7 @@ sysdep_routines += \
>    wcscmp-avx2-rtm \
>    wcscmp-evex \
>    wcscmp-sse2 \
> +  wcscpy-avx2 \
>    wcscpy-evex \
>    wcscpy-generic \
>    wcscpy-ssse3 \
> @@ -155,11 +159,13 @@ sysdep_routines += \
>    wcslen-evex512 \
>    wcslen-sse2 \
>    wcslen-sse4_1 \
> +  wcsncat-avx2 \
>    wcsncat-evex \
>    wcsncat-generic \
>    wcsncmp-avx2 \
>    wcsncmp-avx2-rtm \
>    wcsncmp-evex \
> +  wcsncpy-avx2 \
>    wcsncpy-evex \
>    wcsncpy-generic \
>    wcsnlen-avx2 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index c908d6c158..0c15dfebfd 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -907,6 +907,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  				      && CPU_FEATURE_USABLE (BMI2)),
>  				     __wcscpy_evex)
>  	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
> +				     (CPU_FEATURE_USABLE (AVX2)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcscpy_avx2)
> +	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcscpy,
>  				     CPU_FEATURE_USABLE (SSSE3),
>  				     __wcscpy_ssse3)
>  	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
> @@ -920,7 +924,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  				      && CPU_FEATURE_USABLE (AVX512BW)
>  				      && CPU_FEATURE_USABLE (BMI2)),
>  				     __wcsncpy_evex)
> -	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
> +	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
> +				     (CPU_FEATURE_USABLE (AVX2)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcsncpy_avx2)
> +	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy,
>  				     1,
>  				     __wcsncpy_generic))
>  
> @@ -932,6 +940,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  				      && CPU_FEATURE_USABLE (BMI2)),
>  				     __wcpcpy_evex)
>  	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
> +				     (CPU_FEATURE_USABLE (AVX2)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcpcpy_avx2)
> +	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcpcpy,
>  				     1,
>  				     __wcpcpy_generic))
>  
> @@ -942,7 +954,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  				      && CPU_FEATURE_USABLE (AVX512BW)
>  				      && CPU_FEATURE_USABLE (BMI2)),
>  				     __wcpncpy_evex)
> -	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
> +	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
> +				     (CPU_FEATURE_USABLE (AVX2)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcpncpy_avx2)
> +	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy,
>  				     1,
>  				     __wcpncpy_generic))
>  
> @@ -954,6 +970,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  				      && CPU_FEATURE_USABLE (BMI2)),
>  				     __wcscat_evex)
>  	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
> +				     (CPU_FEATURE_USABLE (AVX2)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcscat_avx2)
> +	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcscat,
>  				     1,
>  				     __wcscat_generic))
>  
> @@ -965,6 +985,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  				      && CPU_FEATURE_USABLE (BMI2)),
>  				     __wcsncat_evex)
>  	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
> +				     (CPU_FEATURE_USABLE (AVX2)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcsncat_avx2)
> +	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncat,
>  				     1,
>  				     __wcsncat_generic))
>  
> diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
> index 1d2a63458b..51194e620e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-wcs.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
> @@ -27,6 +27,8 @@
>  
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
>  
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
>  
>  static inline void *
> @@ -42,6 +44,11 @@ IFUNC_SELECTOR (void)
>        if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
>  	return OPTIMIZE (evex);
> +
> +      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> +				       Prefer_No_VZEROUPPER, !))
> +	return OPTIMIZE (avx2);
> +
>      }
>  
>    return OPTIMIZE (GENERIC);
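
With this hunk the shared selector ends up preferring, in order: the
EVEX version when AVX512VL/AVX512BW are usable, then the new AVX2
version unless the CPU is marked Prefer_No_VZEROUPPER (presumably
because the ymm-based AVX2 code pays for VZEROUPPER where the EVEX
code does not), and finally the generic C code.  A simplified,
stand-alone restatement (hypothetical helper, not code from the patch):

/* Selection order implemented by ifunc-wcs.h after this patch.  The
   flag parameters stand in for the CPU_FEATURE/ARCH checks in the
   real selector.  */
static const char *
pick_wcs_impl (int has_avx2, int has_bmi2, int fast_unaligned_load,
               int has_avx512vl, int has_avx512bw,
               int prefer_no_vzeroupper)
{
  if (has_avx2 && has_bmi2 && fast_unaligned_load)
    {
      if (has_avx512vl && has_avx512bw)
        return "evex";
      if (!prefer_no_vzeroupper)
        return "avx2";
    }
  return "generic";
}
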
> diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
> new file mode 100644
> index 0000000000..0fffd912d3
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
> @@ -0,0 +1,8 @@
> +#ifndef WCPCPY
> +# define WCPCPY	__wcpcpy_avx2
> +#endif
> +
> +#define USE_AS_STPCPY
> +#define USE_AS_WCSCPY
> +#define STRCPY	WCPCPY
> +#include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
> index 6039196a3e..0ba29b081f 100644
> --- a/sysdeps/x86_64/multiarch/wcpcpy-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
> @@ -19,7 +19,7 @@
>  /* We always need to build this implementation as strspn-sse4 needs to
>     be able to fallback to it.  */
>  #include <isa-level.h>
> -#if ISA_SHOULD_BUILD (3)
> +#if ISA_SHOULD_BUILD (2)
>  
>  # define WCPCPY __wcpcpy_generic
>  # include <wcsmbs/wcpcpy.c>
> diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
> new file mode 100644
> index 0000000000..b7e594f7b7
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
> @@ -0,0 +1,8 @@
> +#ifndef WCPNCPY
> +# define WCPNCPY	__wcpncpy_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STPCPY
> +#define STRNCPY	WCPNCPY
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
> index de8d34320e..4aab4ecdd2 100644
> --- a/sysdeps/x86_64/multiarch/wcpncpy-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
> @@ -19,7 +19,7 @@
>  /* We always need to build this implementation as strspn-sse4 needs to
>     be able to fallback to it.  */
>  #include <isa-level.h>
> -#if ISA_SHOULD_BUILD (3)
> +#if ISA_SHOULD_BUILD (2)
>  
>  # define WCPNCPY __wcpncpy_generic
>  # include <wcsmbs/wcpncpy.c>
> diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2.S b/sysdeps/x86_64/multiarch/wcscat-avx2.S
> new file mode 100644
> index 0000000000..a20f23c09d
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscat-avx2.S
> @@ -0,0 +1,10 @@
> +#ifndef WCSCAT
> +# define WCSCAT	__wcscat_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STRCAT
> +
> +#define STRCPY	WCSCAT
> +
> +#include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
> index d86b4d5c00..6476f85bbb 100644
> --- a/sysdeps/x86_64/multiarch/wcscat-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
> @@ -19,7 +19,7 @@
>  /* We always need to build this implementation as strspn-sse4 needs to
>     be able to fallback to it.  */
>  #include <isa-level.h>
> -#if ISA_SHOULD_BUILD (3)
> +#if ISA_SHOULD_BUILD (2)
>  
>  # define WCSCAT __wcscat_generic
>  # include <wcsmbs/wcscat.c>
> diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2.S b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
> new file mode 100644
> index 0000000000..6bc509da07
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
> @@ -0,0 +1,7 @@
> +#ifndef WCSCPY
> +# define WCSCPY	__wcscpy_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define STRCPY	WCSCPY
> +#include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
> index 4a1fffae4b..600d606c45 100644
> --- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
> @@ -18,7 +18,7 @@
>  
>  
>  #include <isa-level.h>
> -#if ISA_SHOULD_BUILD (3)
> +#if ISA_SHOULD_BUILD (2)
>  
>  # define WCSCPY  __wcscpy_generic
>  # include <wcsmbs/wcscpy.c>
> diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
> index 9ad77da8ac..e204059873 100644
> --- a/sysdeps/x86_64/multiarch/wcscpy.c
> +++ b/sysdeps/x86_64/multiarch/wcscpy.c
> @@ -28,6 +28,8 @@
>  
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
>  
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
>  
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
> @@ -44,6 +46,9 @@ IFUNC_SELECTOR (void)
>        if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
>  	return OPTIMIZE (evex);
> +
> +      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !))
> +	return OPTIMIZE (avx2);
>      }
>  
>    if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2.S b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
> new file mode 100644
> index 0000000000..a72105b7e9
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
> @@ -0,0 +1,9 @@
> +#ifndef WCSNCAT
> +# define WCSNCAT	__wcsncat_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STRCAT
> +
> +#define STRNCAT	WCSNCAT
> +#include "strncat-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
> index 4b55cb40bc..9ced02b35e 100644
> --- a/sysdeps/x86_64/multiarch/wcsncat-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
> @@ -19,7 +19,7 @@
>  /* We always need to build this implementation as strspn-sse4 needs to
>     be able to fallback to it.  */
>  #include <isa-level.h>
> -#if ISA_SHOULD_BUILD (3)
> +#if ISA_SHOULD_BUILD (2)
>  
>  # define WCSNCAT __wcsncat_generic
>  # include <wcsmbs/wcsncat.c>
> diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
> new file mode 100644
> index 0000000000..3a1a8a372c
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
> @@ -0,0 +1,7 @@
> +#ifndef WCSNCPY
> +# define WCSNCPY	__wcsncpy_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define STRNCPY	WCSNCPY
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
> index d0e8a86605..693521713b 100644
> --- a/sysdeps/x86_64/multiarch/wcsncpy-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
> @@ -19,7 +19,7 @@
>  /* We always need to build this implementation as strspn-sse4 needs to
>     be able to fallback to it.  */
>  #include <isa-level.h>
> -#if ISA_SHOULD_BUILD (3)
> +#if ISA_SHOULD_BUILD (2)
>  
>  # define WCSNCPY __wcsncpy_generic
>  # include <wcsmbs/wcsncpy.c>
> diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
> index 3ddc98872f..4ab6182cd9 100644
> --- a/sysdeps/x86_64/wcpcpy-generic.c
> +++ b/sysdeps/x86_64/wcpcpy-generic.c
> @@ -24,7 +24,7 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL <= 3
> +#if MINIMUM_X86_ISA_LEVEL <= 2
>  
>  # include <wcsmbs/wcpcpy.c>
>  
> diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
> index 4e4fca71eb..e64af6977f 100644
> --- a/sysdeps/x86_64/wcpcpy.S
> +++ b/sysdeps/x86_64/wcpcpy.S
> @@ -24,11 +24,12 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL >= 4
> +#if MINIMUM_X86_ISA_LEVEL >= 3
>  
>  # define WCPCPY	__wcpcpy
>  
>  # define DEFAULT_IMPL_V4	"multiarch/wcpcpy-evex.S"
> +# define DEFAULT_IMPL_V3	"multiarch/wcpcpy-avx2.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
>  # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
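
The MINIMUM_X86_ISA_LEVEL >= 4 -> >= 3 relaxation plus the new
DEFAULT_IMPL_V3 line mean that a glibc configured for x86-64-v3 now uses
the AVX2 body as its static default instead of going through the ifunc;
only v1/v2 builds keep run-time dispatch for these symbols.  A toy model
of the effect -- not glibc code, and isa-default-impl.h is more involved
than this:

#include <stdio.h>

static const char *
static_default_for (int minimum_isa_level)
{
  if (minimum_isa_level >= 4)
    return "multiarch/wcpcpy-evex.S";   /* DEFAULT_IMPL_V4 */
  if (minimum_isa_level == 3)
    return "multiarch/wcpcpy-avx2.S";   /* DEFAULT_IMPL_V3 (new) */
  return "ifunc dispatch at run time";  /* this file is unused below v3 */
}

int
main (void)
{
  for (int level = 1; level <= 4; ++level)
    printf ("x86-64-v%d build -> %s\n", level, static_default_for (level));
  return 0;
}

The wcpncpy.S, wcscat.S, wcsncat.S, and wcsncpy.S hunks below make the
same change; wcscpy.S only gains the DEFAULT_IMPL_V3 line, since it
already has an SSSE3 DEFAULT_IMPL_V2.
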
> diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
> index 0c76e5614c..18c0377d35 100644
> --- a/sysdeps/x86_64/wcpncpy-generic.c
> +++ b/sysdeps/x86_64/wcpncpy-generic.c
> @@ -24,7 +24,7 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL <= 3
> +#if MINIMUM_X86_ISA_LEVEL <= 2
>  
>  # include <wcsmbs/wcpncpy.c>
>  
> diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
> index b4e531473e..0e0f432fbb 100644
> --- a/sysdeps/x86_64/wcpncpy.S
> +++ b/sysdeps/x86_64/wcpncpy.S
> @@ -24,11 +24,12 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL >= 4
> +#if MINIMUM_X86_ISA_LEVEL >= 3
>  
>  # define WCPNCPY	__wcpncpy
>  
>  # define DEFAULT_IMPL_V4	"multiarch/wcpncpy-evex.S"
> +# define DEFAULT_IMPL_V3	"multiarch/wcpncpy-avx2.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
>  # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
> index 512d0e4d43..639ceac523 100644
> --- a/sysdeps/x86_64/wcscat-generic.c
> +++ b/sysdeps/x86_64/wcscat-generic.c
> @@ -24,7 +24,7 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL <= 3
> +#if MINIMUM_X86_ISA_LEVEL <= 2
>  
>  # include <wcsmbs/wcscat.c>
>  
> diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
> index ee8360b6e8..06130f58f9 100644
> --- a/sysdeps/x86_64/wcscat.S
> +++ b/sysdeps/x86_64/wcscat.S
> @@ -24,11 +24,12 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL >= 4
> +#if MINIMUM_X86_ISA_LEVEL >= 3
>  
>  # define WCSCAT	__wcscat
>  
>  # define DEFAULT_IMPL_V4	"multiarch/wcscat-evex.S"
> +# define DEFAULT_IMPL_V3	"multiarch/wcscat-avx2.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
>  # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
> index e403579961..4a859585a6 100644
> --- a/sysdeps/x86_64/wcscpy.S
> +++ b/sysdeps/x86_64/wcscpy.S
> @@ -29,6 +29,7 @@
>  # define WCSCPY	__wcscpy
>  
>  # define DEFAULT_IMPL_V4	"multiarch/wcscpy-evex.S"
> +# define DEFAULT_IMPL_V3	"multiarch/wcscpy-avx2.S"
>  # define DEFAULT_IMPL_V2	"multiarch/wcscpy-ssse3.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
> diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
> index 86e20d9028..57bdd9b7cf 100644
> --- a/sysdeps/x86_64/wcsncat-generic.c
> +++ b/sysdeps/x86_64/wcsncat-generic.c
> @@ -24,7 +24,7 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL <= 3
> +#if MINIMUM_X86_ISA_LEVEL <= 2
>  
>  # include <wcsmbs/wcsncat.c>
>  
> diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
> index 090055a1b8..e1d8609651 100644
> --- a/sysdeps/x86_64/wcsncat.S
> +++ b/sysdeps/x86_64/wcsncat.S
> @@ -24,11 +24,12 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL >= 4
> +#if MINIMUM_X86_ISA_LEVEL >= 3
>  
>  # define WCSNCAT	wcsncat
>  
>  # define DEFAULT_IMPL_V4	"multiarch/wcsncat-evex.S"
> +# define DEFAULT_IMPL_V3	"multiarch/wcsncat-avx2.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
>  # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
> index 0f0ee65b65..4dcbd8ac7f 100644
> --- a/sysdeps/x86_64/wcsncpy-generic.c
> +++ b/sysdeps/x86_64/wcsncpy-generic.c
> @@ -24,7 +24,7 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL <= 3
> +#if MINIMUM_X86_ISA_LEVEL <= 2
>  
>  # include <wcsmbs/wcsncpy.c>
>  
> diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
> index 32eaf1163b..f305b5eb9b 100644
> --- a/sysdeps/x86_64/wcsncpy.S
> +++ b/sysdeps/x86_64/wcsncpy.S
> @@ -24,11 +24,12 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL >= 4
> +#if MINIMUM_X86_ISA_LEVEL >= 3
>  
>  # define WCSNCPY	__wcsncpy
>  
>  # define DEFAULT_IMPL_V4	"multiarch/wcsncpy-evex.S"
> +# define DEFAULT_IMPL_V3	"multiarch/wcsncpy-avx2.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
>  # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> -- 
> 2.34.1
> 

LGTM.

Thanks.

H.J.


end of thread, other threads:[~2022-11-09  3:01 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-03  8:53 [PATCH v1 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json Noah Goldstein
2022-11-03  8:53 ` [PATCH v1 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
2022-11-03  8:55   ` Noah Goldstein
2022-11-04 23:04   ` [PATCH v4 1/4] " Noah Goldstein
2022-11-04 23:04     ` [PATCH v4 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
2022-11-04 23:04     ` [PATCH v4 3/4] x86: Add evex optimized functions for the wchar_t strcpy family Noah Goldstein
2022-11-04 23:04     ` [PATCH v4 4/4] x86: Add avx2 " Noah Goldstein
2022-11-04 23:34     ` [PATCH v4 1/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions H.J. Lu
2022-11-09  1:38   ` [PATCH v5 " Noah Goldstein
2022-11-09  1:38     ` [PATCH v5 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
2022-11-09  3:00       ` H.J. Lu
2022-11-09  1:38     ` [PATCH v5 3/4] x86: Add evex optimized functions for the wchar_t strcpy family Noah Goldstein
2022-11-09  3:01       ` H.J. Lu
2022-11-09  1:38     ` [PATCH v5 4/4] x86: Add avx2 " Noah Goldstein
2022-11-09  3:01       ` H.J. Lu
2022-11-09  3:00     ` [PATCH v5 1/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions H.J. Lu
2022-11-03  8:53 ` [PATCH v1 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
2022-11-03  8:55   ` Noah Goldstein
2022-11-03  8:53 ` [PATCH v1 4/4] x86: Add optimized functions for the wide-character strcpy family Noah Goldstein
2022-11-03  9:06 ` [PATCH v1 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json Noah Goldstein
2022-11-04  8:20 ` [PATCH v2 " Noah Goldstein
2022-11-04  8:20   ` [PATCH v2 2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
2022-11-04 16:33     ` H.J. Lu
2022-11-04 20:20       ` Noah Goldstein
2022-11-04  8:20   ` [PATCH v2 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
2022-11-04 16:45     ` H.J. Lu
2022-11-04 20:21       ` Noah Goldstein
2022-11-04  8:20   ` [PATCH v2 4/4] x86: Add optimized functions for the wide-character strcpy family Noah Goldstein
2022-11-04 16:47     ` H.J. Lu
2022-11-04 20:22       ` Noah Goldstein
2022-11-04 16:26   ` [PATCH v2 1/4] benchtests: Make str{n}{cat|cpy} benchmarks output json H.J. Lu
2022-11-04 20:13 ` [PATCH v3 1/5] " Noah Goldstein
2022-11-04 20:13   ` [PATCH v3 2/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions Noah Goldstein
2022-11-04 21:46     ` H.J. Lu
2022-11-04 22:27       ` Noah Goldstein
2022-11-04 22:47         ` H.J. Lu
2022-11-04 23:06           ` Noah Goldstein
2022-11-04 20:13   ` [PATCH v3 3/5] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions Noah Goldstein
2022-11-04 20:13   ` [PATCH v3 4/5] x86: Add evex optimized functions for the wchar_t strcpy family Noah Goldstein
2022-11-04 20:13   ` [PATCH v3 5/5] x86: Add avx2 " Noah Goldstein
2022-11-04 21:01   ` [PATCH v3 1/5] benchtests: Make str{n}{cat|cpy} benchmarks output json H.J. Lu
2022-11-04 21:24     ` Noah Goldstein
