public inbox for libc-alpha@sourceware.org
* [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat
@ 2021-06-09 20:52 Noah Goldstein
  2021-06-09 20:52 ` [PATCH v1 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 Noah Goldstein
                   ` (8 more replies)
  0 siblings, 9 replies; 27+ messages in thread
From: Noah Goldstein @ 2021-06-09 20:52 UTC (permalink / raw)
  To: libc-alpha

This commit adds tests for a bug in the wide char variant of the
functions where the implementation may assume that maxlen for wcsnlen
or n for wmemchr/strncat will not overflow when multiplied by
sizeof(wchar_t).
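
To make the failure mode concrete, here is a minimal C sketch
(illustrative only -- not the code of any implementation below, and
the function name is made up) of a wcsnlen-style routine that
converts its element count to a byte count up front:

  #include <stddef.h>
  #include <wchar.h>

  /* Hypothetical buggy routine.  The multiplication wraps modulo
     SIZE_MAX + 1: with 4-byte wchar_t and maxlen == (size_t) 1 << 62,
     byte_limit becomes 0, the loop never runs, and maxlen is returned
     instead of the true length of a short, properly terminated
     string.  */
  size_t
  buggy_wcsnlen (const wchar_t *s, size_t maxlen)
  {
    size_t byte_limit = maxlen * sizeof (wchar_t);  /* may wrap.  */
    for (size_t i = 0; i * sizeof (wchar_t) < byte_limit; ++i)
      if (s[i] == L'\0')
        return i;
    return maxlen;
  }

This is why the tests below pass values of n/maxlen such as
SIZE_MAX - i and values derived from -buf_addr: they make either the
scaled length or the s + n pointer arithmetic wrap.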

These tests show the following implementations failing on x86_64:

wcsnlen-sse4_1
wcsnlen-avx2

wmemchr-sse2
wmemchr-avx2

strncat would fail as well if it were run on a system that preferred
either of the failing wcsnlen implementations, as it relies on
wcsnlen.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 string/test-memchr.c  | 39 ++++++++++++++++++++++++---
 string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
 string/test-strnlen.c | 33 +++++++++++++++++++++++
 3 files changed, 130 insertions(+), 3 deletions(-)

diff --git a/string/test-memchr.c b/string/test-memchr.c
index 665edc32af..ce964284aa 100644
--- a/string/test-memchr.c
+++ b/string/test-memchr.c
@@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
   CHAR *res = CALL (impl, s, c, n);
   if (res != exp_res)
     {
-      error (0, 0, "Wrong result in function %s %p %p", impl->name,
-	     res, exp_res);
+      error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
+             impl->name, s, c, n, res, exp_res);
       ret = 1;
       return;
     }
@@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
     }
   buf[align + len] = 0;
 
-  if (pos < len)
+  if (pos < MIN(n, len))
     {
       buf[align + pos] = seek_char;
       buf[align + len] = -seek_char;
@@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
     do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  uintptr_t buf_addr = (uintptr_t) buf1;
+
+  for (i = 0; i < 750; ++i)
+    {
+        do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
+        do_test (0, i, 751, i - buf_addr, BIG_CHAR);
+        do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
+        do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
+        do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+      len = 0;
+      for (j = 8 * sizeof(size_t) - 1; j ; --j)
+        {
+          len |= one << j;
+          do_test (0, i, 751, len - i, BIG_CHAR);
+          do_test (0, i, 751, len + i, BIG_CHAR);
+          do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
+
+          do_test (0, i, 751, ~len - i, BIG_CHAR);
+          do_test (0, i, 751, ~len + i, BIG_CHAR);
+          do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
+        }
+    }
+}
+
 static void
 do_random_tests (void)
 {
@@ -221,6 +253,7 @@ test_main (void)
     do_test (page_size / 2 - i, i, i, 1, 0x9B);
 
   do_random_tests ();
+  do_overflow_tests ();
   return ret;
 }
 
diff --git a/string/test-strncat.c b/string/test-strncat.c
index 2ef917b820..0ab7541d4e 100644
--- a/string/test-strncat.c
+++ b/string/test-strncat.c
@@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
     }
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  CHAR *s1, *s2;
+  uintptr_t s1_addr;
+  s1 = (CHAR *) buf1;
+  s2 = (CHAR *) buf2;
+  s1_addr = (uintptr_t)s1;
+ for (j = 0; j < 200; ++j)
+      s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
+ s2[200] = 0;
+  for (i = 0; i < 750; ++i) {
+    for (j = 0; j < i; ++j)
+      s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
+    s1[i] = '\0';
+
+       FOR_EACH_IMPL (impl, 0)
+    {
+      s2[0] = '\0';
+      do_one_test (impl, s2, s1, SIZE_MAX - i);
+      s2[0] = '\0';
+      do_one_test (impl, s2, s1, i - s1_addr);
+      s2[0] = '\0';
+      do_one_test (impl, s2, s1, -s1_addr - i);
+      s2[0] = '\0';
+      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
+      s2[0] = '\0';
+      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
+    }
+
+    len = 0;
+    for (j = 8 * sizeof(size_t) - 1; j ; --j)
+      {
+        len |= one << j;
+        FOR_EACH_IMPL (impl, 0)
+          {
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, len - i);
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, len + i);
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, len - s1_addr - i);
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, len - s1_addr + i);
+
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, ~len - i);
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, ~len + i);
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, ~len - s1_addr - i);
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, ~len - s1_addr + i);
+          }
+      }
+  }
+}
+
 static void
 do_random_tests (void)
 {
@@ -316,6 +376,7 @@ test_main (void)
     }
 
   do_random_tests ();
+  do_overflow_tests ();
   return ret;
 }
 
diff --git a/string/test-strnlen.c b/string/test-strnlen.c
index 920f58e97b..f53e09263f 100644
--- a/string/test-strnlen.c
+++ b/string/test-strnlen.c
@@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
     do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  uintptr_t buf_addr = (uintptr_t) buf1;
+
+  for (i = 0; i < 750; ++i)
+    {
+      do_test (0, i, SIZE_MAX - i, BIG_CHAR);
+      do_test (0, i, i - buf_addr, BIG_CHAR);
+      do_test (0, i, -buf_addr - i, BIG_CHAR);
+      do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
+      do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+      len = 0;
+      for (j = 8 * sizeof(size_t) - 1; j ; --j)
+        {
+          len |= one << j;
+          do_test (0, i, len - i, BIG_CHAR);
+          do_test (0, i, len + i, BIG_CHAR);
+          do_test (0, i, len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, len - buf_addr + i, BIG_CHAR);
+
+          do_test (0, i, ~len - i, BIG_CHAR);
+          do_test (0, i, ~len + i, BIG_CHAR);
+          do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
+        }
+    }
+}
+
 static void
 do_random_tests (void)
 {
@@ -283,6 +315,7 @@ test_main (void)
   do_random_tests ();
   do_page_tests ();
   do_page_2_tests ();
+  do_overflow_tests ();
   return ret;
 }
 
-- 
2.25.1



* [PATCH v1 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2
  2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
@ 2021-06-09 20:52 ` Noah Goldstein
  2021-06-09 20:52 ` [PATCH v1 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 Noah Goldstein
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Noah Goldstein @ 2021-06-09 20:52 UTC (permalink / raw)
  To: libc-alpha

This commit fixes the bug mentioned in the previous commit.

The previous implementations of wmemchr in these files relied on
n * sizeof(wchar_t) not overflowing, which is not guaranteed by the
standard.

The new overflow tests added in the previous commit now
pass (as do all the other tests).
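
In outline the fix tracks the remaining length in wchar_t units
rather than bytes, so the length never needs to be scaled by
sizeof(wchar_t).  A rough C model of the idea (not the assembly;
the inner loop stands in for one SSE2/AVX2 vector compare, and the
function name is made up):

  #include <stddef.h>
  #include <wchar.h>

  const wchar_t *
  wmemchr_model (const wchar_t *s, wchar_t c, size_t n)
  {
    /* A 16-byte SSE2 vector holds 16 / sizeof (wchar_t) characters
       (4 on x86_64), so the counter is decremented by CHAR_PER_VEC
       characters per "vector" and is never converted to bytes.  */
    enum { CHAR_PER_VEC = 16 / sizeof (wchar_t) };
    while (n >= CHAR_PER_VEC)
      {
        for (size_t i = 0; i < CHAR_PER_VEC; ++i)
          if (s[i] == c)
            return s + i;
        s += CHAR_PER_VEC;
        n -= CHAR_PER_VEC;
      }
    for (size_t i = 0; i < n; ++i)
      if (s[i] == c)
        return s + i;
    return NULL;
  }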

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/memchr.S                | 77 +++++++++++++++++++-------
 sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
 2 files changed, 98 insertions(+), 37 deletions(-)

diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index beff2708de..3ddc4655cf 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
 #ifdef USE_AS_WMEMCHR
 # define MEMCHR		wmemchr
 # define PCMPEQ		pcmpeqd
+# define CHAR_PER_VEC	4
 #else
 # define MEMCHR		memchr
 # define PCMPEQ		pcmpeqb
+# define CHAR_PER_VEC	16
 #endif
 
 /* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
 	movd	%esi, %xmm1
 	mov	%edi, %ecx
 
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#endif
 #ifdef USE_AS_WMEMCHR
 	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
-	shl	$2, %RDX_LP
 #else
-# ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%edx, %edx
-# endif
 	punpcklbw %xmm1, %xmm1
 	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
 	test	%eax, %eax
 
 	jnz	L(matches_1)
-	sub	$16, %rdx
+	sub	$CHAR_PER_VEC, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
 	and	$15, %ecx
 	and	$-16, %rdi
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	add	%rcx, %rdx
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	jmp	L(loop_prolog)
 
@@ -77,16 +81,21 @@ L(crosscache):
 	movdqa	(%rdi), %xmm0
 
 	PCMPEQ	%xmm1, %xmm0
-/* Check if there is a match.  */
+	/* Check if there is a match.  */
 	pmovmskb %xmm0, %eax
-/* Remove the leading bytes.  */
+	/* Remove the leading bytes.  */
 	sar	%cl, %eax
 	test	%eax, %eax
 	je	L(unaligned_no_match)
-/* Check which byte is a match.  */
+	/* Check which byte is a match.  */
 	bsf	%eax, %eax
-
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	add	%rdi, %rax
 	add	%rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):
 
 	.p2align 4
 L(unaligned_no_match):
-        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
 	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
 	   possible addition overflow.  */
 	neg	%rcx
 	add	$16, %rcx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	sub	%rcx, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 
 	.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
 	test	$0x3f, %rdi
 	jz	L(align64_loop)
 
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 
 	movdqa	(%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
 	mov	%rdi, %rcx
 	and	$-64, %rdi
 	and	$63, %ecx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	add	%rcx, %rdx
 
 	.p2align 4
 L(align64_loop):
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	movdqa	(%rdi), %xmm0
 	movdqa	16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):
 
 	.p2align 4
 L(exit_loop):
-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
 	jle	L(exit_loop_32)
 
 	movdqa	(%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32_1)
-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
 	jle	L(return_null)
 
 	PCMPEQ	48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):
 
 	.p2align 4
 L(exit_loop_32):
-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
 	movdqa	(%rdi), %xmm0
 	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches_1)
-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
 	jbe	L(return_null)
 
 	PCMPEQ	16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
 	.p2align 4
 L(matches_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	add	%rdi, %rax
 	ret
@@ -301,7 +322,13 @@ L(matches_1):
 	.p2align 4
 L(matches16_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	16(%rdi, %rax), %rax
 	ret
@@ -309,7 +336,13 @@ L(matches16_1):
 	.p2align 4
 L(matches32_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	32(%rdi, %rax), %rax
 	ret
@@ -317,7 +350,13 @@ L(matches32_1):
 	.p2align 4
 L(matches48_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	48(%rdi, %rax), %rax
 	ret
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 0d8758e3e7..afdb956502 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -54,21 +54,19 @@
 
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
-	test	%RDX_LP, %RDX_LP
-	jz	L(null)
-# endif
-# ifdef USE_AS_WMEMCHR
-	shl	$2, %RDX_LP
-# else
 #  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%edx, %edx
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+#  else
+	test	%RDX_LP, %RDX_LP
 #  endif
+	jz	L(null)
 # endif
 	/* Broadcast CHAR to YMMMATCH.  */
 	vmovd	%esi, %xmm0
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
 	vpmovmskb %ymm1, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* If length < CHAR_PER_VEC handle special.  */
-	cmpq	$VEC_SIZE, %rdx
+	cmpq	$CHAR_PER_VEC, %rdx
 	jbe	L(first_vec_x0)
 # endif
 	testl	%eax, %eax
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
 L(first_vec_x0):
 	/* Check if first match was before length.  */
 	tzcntl	%eax, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
 	xorl	%ecx, %ecx
 	cmpl	%eax, %edx
 	leaq	(%rdi, %rax), %rax
@@ -110,12 +112,12 @@ L(null):
 # endif
 	.p2align 4
 L(cross_page_boundary):
-	/* Save pointer before aligning as its original value is necessary
-	   for computer return address if byte is found or adjusting length
-	   if it is not and this is memchr.  */
+	/* Save pointer before aligning as its original value is
+	   necessary for computer return address if byte is found or
+	   adjusting length if it is not and this is memchr.  */
 	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
-	   rdi for rawmemchr.  */
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+	   and rdi for rawmemchr.  */
 	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
 	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
@@ -124,6 +126,10 @@ L(cross_page_boundary):
 	   match).  */
 	leaq	1(%ALGN_PTR_REG), %rsi
 	subq	%RRAW_PTR_REG, %rsi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %esi
+#  endif
 # endif
 	/* Remove the leading bytes.  */
 	sarxl	%ERAW_PTR_REG, %eax, %eax
@@ -181,6 +187,10 @@ L(cross_page_continue):
 	orq	$(VEC_SIZE - 1), %rdi
 	/* esi is for adjusting length to see if near the end.  */
 	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %esi
+#  endif
 # else
 	orq	$(VEC_SIZE - 1), %rdi
 L(cross_page_continue):
@@ -213,7 +223,7 @@ L(cross_page_continue):
 
 # ifndef USE_AS_RAWMEMCHR
 	/* Check if at last VEC_SIZE * 4 length.  */
-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(last_4x_vec_or_less_cmpeq)
 	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
 	   length.  */
@@ -221,6 +231,10 @@ L(cross_page_continue):
 	movl	%edi, %ecx
 	orq	$(VEC_SIZE * 4 - 1), %rdi
 	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
 	addq	%rcx, %rdx
 # else
 	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
@@ -250,15 +264,19 @@ L(loop_4x_vec):
 
 	subq	$-(VEC_SIZE * 4), %rdi
 
-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$(CHAR_PER_VEC * 4), %rdx
 	ja	L(loop_4x_vec)
 
-	/* Fall through into less than 4 remaining vectors of length case.
-	 */
+	/* Fall through into less than 4 remaining vectors of length
+	   case.  */
 	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	.p2align 4
 L(last_4x_vec_or_less):
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
 	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
 	jnz	L(first_vec_x1_check)
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
 L(last_4x_vec_or_less_cmpeq):
 	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
 	subq	$-(VEC_SIZE * 4), %rdi
 	/* Check first VEC regardless.  */
 	testl	%eax, %eax
-- 
2.25.1



* [PATCH v1 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2
  2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
  2021-06-09 20:52 ` [PATCH v1 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 Noah Goldstein
@ 2021-06-09 20:52 ` Noah Goldstein
  2021-06-09 21:53 ` [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat H.J. Lu
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Noah Goldstein @ 2021-06-09 20:52 UTC (permalink / raw)
  To: libc-alpha

This commit fixes the bug mentioned in the previous commit.

The previous implementations of wcsnlen in these files relied on
maxlen * sizeof(wchar_t) not overflowing, which is not guaranteed by
the standard.

The new overflow tests added in the previous commit now
pass (as do all the other tests).

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
It's possible there is room for a speedup in strnlen-avx2
and strnlen-evex if we check for overflow first and jump to
strlen.  This would allow end pointers to be used instead of
tracking the length, which should save some ALU work / code size.
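
A rough C model of that strategy (wcsnlen_model is a made-up name;
wcslen is the real fallback), mirroring the two checks the strlen.S
hunk below performs with sar/jnz and add/jbe:

  #include <stddef.h>
  #include <stdint.h>
  #include <wchar.h>

  size_t
  wcsnlen_model (const wchar_t *s, size_t maxlen)
  {
    /* If scaling maxlen to bytes would overflow, no object can be
       that large, so any well-defined call must have a terminator in
       valid memory and the result of wcslen is necessarily less than
       maxlen.  */
    if (maxlen > SIZE_MAX / sizeof (wchar_t))
      return wcslen (s);
    /* Likewise if s + maxlen * sizeof (wchar_t) wraps around.  */
    uintptr_t lim = (uintptr_t) s + maxlen * sizeof (wchar_t);
    if (lim < (uintptr_t) s)
      return wcslen (s);
    /* Now an end pointer can be used instead of a counter.  */
    const wchar_t *p = s;
    while ((uintptr_t) p < lim && *p != L'\0')
      ++p;
    return (size_t) (p - s);
  }
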
 sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
 sysdeps/x86_64/strlen.S                |  14 ++-
 2 files changed, 106 insertions(+), 38 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index bd2e6ee44a..b282a75613 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -44,21 +44,21 @@
 
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
 	/* Check zero length.  */
+#  ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RSI_LP, %RSI_LP
+#  else
 	test	%RSI_LP, %RSI_LP
+#  endif
 	jz	L(zero)
 	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
 	mov	%RSI_LP, %R8_LP
-#  ifdef USE_AS_WCSLEN
-	shl	$2, %RSI_LP
-#  elif defined __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-#  endif
 # endif
 	movl	%edi, %eax
 	movq	%rdi, %rdx
@@ -72,10 +72,10 @@ ENTRY (STRLEN)
 
 	/* Check the first VEC_SIZE bytes.  */
 	VPCMPEQ	(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 # ifdef USE_AS_STRNLEN
 	/* If length < VEC_SIZE handle special.  */
-	cmpq	$VEC_SIZE, %rsi
+	cmpq	$CHAR_PER_VEC, %rsi
 	jbe	L(first_vec_x0)
 # endif
 	/* If empty continue to aligned_more. Otherwise return bit
@@ -84,6 +84,7 @@ ENTRY (STRLEN)
 	jz	L(aligned_more)
 	tzcntl	%eax, %eax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -97,9 +98,14 @@ L(zero):
 L(first_vec_x0):
 	/* Set bit for max len so that tzcnt will return min of max len
 	   and position of first match.  */
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
 	btsq	%rsi, %rax
 	tzcntl	%eax, %eax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 #  endif
 	VZEROUPPER_RETURN
@@ -113,14 +119,19 @@ L(first_vec_x1):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE * 4 + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	incl	%edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -133,14 +144,19 @@ L(first_vec_x2):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE * 3 + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -153,14 +169,19 @@ L(first_vec_x3):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE * 2 + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE * 2 + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -173,14 +194,19 @@ L(first_vec_x4):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE * 3 + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -195,10 +221,14 @@ L(cross_page_continue):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
 # ifdef USE_AS_STRNLEN
-	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
-	   it simplies the logic in last_4x_vec_or_less.  */
+	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
+	   because it simplies the logic in last_4x_vec_or_less.  */
 	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
 	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
 # endif
 	/* Load first VEC regardless.  */
 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
@@ -207,34 +237,38 @@ L(cross_page_continue):
 	subq	%rcx, %rsi
 	jb	L(last_4x_vec_or_less)
 # endif
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x4)
 
 	/* Align data to VEC_SIZE * 4 - 1.  */
 # ifdef USE_AS_STRNLEN
 	/* Before adjusting length check if at last VEC_SIZE * 4.  */
-	cmpq	$(VEC_SIZE * 4 - 1), %rsi
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
 	jbe	L(last_4x_vec_or_less_load)
 	incq	%rdi
 	movl	%edi, %ecx
 	orq	$(VEC_SIZE * 4 - 1), %rdi
 	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
 	/* Readjust length.  */
 	addq	%rcx, %rsi
 # else
@@ -246,13 +280,13 @@ L(cross_page_continue):
 L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 	/* Break if at end of length.  */
-	subq	$(VEC_SIZE * 4), %rsi
+	subq	$(CHAR_PER_VEC * 4), %rsi
 	jb	L(last_4x_vec_or_less_cmpeq)
 # endif
-	/* Save some code size by microfusing VPMINU with the load. Since
-	   the matches in ymm2/ymm4 can only be returned if there where no
-	   matches in ymm1/ymm3 respectively there is no issue with overlap.
-	 */
+	/* Save some code size by microfusing VPMINU with the load.
+	   Since the matches in ymm2/ymm4 can only be returned if there
+	   where no matches in ymm1/ymm3 respectively there is no issue
+	   with overlap.  */
 	vmovdqa	1(%rdi), %ymm1
 	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
 	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
@@ -260,7 +294,7 @@ L(loop_4x_vec):
 
 	VPMINU	%ymm2, %ymm4, %ymm5
 	VPCMPEQ	%ymm5, %ymm0, %ymm5
-	vpmovmskb	%ymm5, %ecx
+	vpmovmskb %ymm5, %ecx
 
 	subq	$-(VEC_SIZE * 4), %rdi
 	testl	%ecx, %ecx
@@ -268,27 +302,28 @@ L(loop_4x_vec):
 
 
 	VPCMPEQ	%ymm1, %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	subq	%rdx, %rdi
 	testl	%eax, %eax
 	jnz	L(last_vec_return_x0)
 
 	VPCMPEQ	%ymm2, %ymm0, %ymm2
-	vpmovmskb	%ymm2, %eax
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_return_x1)
 
 	/* Combine last 2 VEC.  */
 	VPCMPEQ	%ymm3, %ymm0, %ymm3
-	vpmovmskb	%ymm3, %eax
-	/* rcx has combined result from all 4 VEC. It will only be used if
-	   the first 3 other VEC all did not contain a match.  */
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
 	salq	$32, %rcx
 	orq	%rcx, %rax
 	tzcntq	%rax, %rax
 	subq	$(VEC_SIZE * 2 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -297,15 +332,19 @@ L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 	.p2align 4
 L(last_4x_vec_or_less_load):
-	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
+	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
+	 */
 	subq	$-(VEC_SIZE * 4), %rdi
 L(last_4x_vec_or_less_cmpeq):
 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 L(last_4x_vec_or_less):
-
-	vpmovmskb	%ymm1, %eax
-	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
-	   VEC_SIZE * 4.  */
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
+	vpmovmskb %ymm1, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off
+	   by VEC_SIZE * 4.  */
 	testl	$(VEC_SIZE * 2), %esi
 	jnz	L(last_4x_vec)
 
@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
 	jb	L(max)
 
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 	cmpl	%eax, %esi
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
 	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
 	subq	$(VEC_SIZE * 4 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
 	subq	$(VEC_SIZE * 3 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
 	incl	%eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -381,14 +424,14 @@ L(last_4x_vec):
 	jnz	L(last_vec_x1)
 
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x2)
 
 	/* Normalize length.  */
 	andl	$(VEC_SIZE * 4 - 1), %esi
 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x3)
 
@@ -396,7 +439,7 @@ L(last_4x_vec):
 	jb	L(max)
 
 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 	cmpl	%eax, %esi
@@ -405,6 +448,7 @@ L(last_4x_vec):
 	addl	$(VEC_SIZE * 3 + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -419,6 +463,7 @@ L(last_vec_x1):
 	incl	%eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -432,6 +477,7 @@ L(last_vec_x2):
 	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -447,6 +493,7 @@ L(last_vec_x3):
 	addl	$(VEC_SIZE * 2 + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -455,13 +502,13 @@ L(max_end):
 	VZEROUPPER_RETURN
 # endif
 
-	/* Cold case for crossing page with first load.	 */
+	/* Cold case for crossing page with first load.  */
 	.p2align 4
 L(cross_page_boundary):
 	/* Align data to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
 	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
 	   so no need to manually mod rdx.  */
 	sarxl	%edx, %eax, %eax
@@ -470,6 +517,10 @@ L(cross_page_boundary):
 	jnz	L(cross_page_less_vec)
 	leaq	1(%rdi), %rcx
 	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %ecx
+#  endif
 	/* Check length.  */
 	cmpq	%rsi, %rcx
 	jb	L(cross_page_continue)
@@ -479,6 +530,7 @@ L(cross_page_boundary):
 	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide length by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 #  endif
 # endif
@@ -489,6 +541,10 @@ L(return_vzeroupper):
 	.p2align 4
 L(cross_page_less_vec):
 	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
 	cmpq	%rax, %rsi
 	cmovb	%esi, %eax
 #  ifdef USE_AS_WCSLEN
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index d223ea1700..3fc6734910 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -65,12 +65,24 @@ ENTRY(strlen)
 	ret
 L(n_nonzero):
 # ifdef AS_WCSLEN
-	shl	$2, %RSI_LP
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+   overflow the only way this program doesn't have undefined behavior 
+   is if there is a null terminator in valid memory so strlen will 
+   suffice.  */
+	mov	%RSI_LP, %R10_LP
+	sar	$62, %R10_LP
+	test	%R10_LP, %R10_LP
+	jnz	__wcslen_sse2
+	sal	$2, %RSI_LP
 # endif
 
 /* Initialize long lived registers.  */
 
 	add	%RDI_LP, %RSI_LP
+# ifdef AS_WCSLEN
+/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
+	jbe	__wcslen_sse2
+# endif
 	mov	%RSI_LP, %R10_LP
 	and	$-64, %R10_LP
 	mov	%RSI_LP, %R11_LP
-- 
2.25.1



* Re: [PATCH v1 1/3] String: Add additional overflow tests for strnlen,  memchr, and strncat
  2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
  2021-06-09 20:52 ` [PATCH v1 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 Noah Goldstein
  2021-06-09 20:52 ` [PATCH v1 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 Noah Goldstein
@ 2021-06-09 21:53 ` H.J. Lu
  2021-06-09 22:26   ` Noah Goldstein
  2021-06-22 18:11 ` [PATCH v2 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974] Noah Goldstein
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2021-06-09 21:53 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Wed, Jun 9, 2021 at 1:53 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit adds tests for a bug in the wide char variant of the
> functions where the implementation may assume that maxlen for wcsnlen
> or n for wmemchr/strncat will not overflow when multiplied by
> sizeof(wchar_t).
>
> These tests show the following implementations failing on x86_64:
>
> wcsnlen-sse4_1
> wcsnlen-avx2
>
> wmemchr-sse2
> wmemchr-avx2
>
> strncat would fail as well if it were run on a system that preferred
> either of the failing wcsnlen implementations, as it relies on
> wcsnlen.

Please open a bug report for each standard C function.   We need to
track them for backporting to release branches.

Thanks.

> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  string/test-memchr.c  | 39 ++++++++++++++++++++++++---
>  string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
>  string/test-strnlen.c | 33 +++++++++++++++++++++++
>  3 files changed, 130 insertions(+), 3 deletions(-)
>
> diff --git a/string/test-memchr.c b/string/test-memchr.c
> index 665edc32af..ce964284aa 100644
> --- a/string/test-memchr.c
> +++ b/string/test-memchr.c
> @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
>    CHAR *res = CALL (impl, s, c, n);
>    if (res != exp_res)
>      {
> -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
> -            res, exp_res);
> +      error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
> +             impl->name, s, c, n, res, exp_res);
>        ret = 1;
>        return;
>      }
> @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
>      }
>    buf[align + len] = 0;
>
> -  if (pos < len)
> +  if (pos < MIN(n, len))
>      {
>        buf[align + pos] = seek_char;
>        buf[align + len] = -seek_char;
> @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
>      do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
>  }
>
> +static void
> +do_overflow_tests (void)
> +{
> +  size_t i, j, len;
> +  const size_t one = 1;
> +  uintptr_t buf_addr = (uintptr_t) buf1;
> +
> +  for (i = 0; i < 750; ++i)
> +    {
> +        do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
> +        do_test (0, i, 751, i - buf_addr, BIG_CHAR);
> +        do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
> +        do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
> +        do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
> +
> +      len = 0;
> +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
> +        {
> +          len |= one << j;
> +          do_test (0, i, 751, len - i, BIG_CHAR);
> +          do_test (0, i, 751, len + i, BIG_CHAR);
> +          do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
> +          do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
> +
> +          do_test (0, i, 751, ~len - i, BIG_CHAR);
> +          do_test (0, i, 751, ~len + i, BIG_CHAR);
> +          do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
> +          do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
> +        }
> +    }
> +}
> +
>  static void
>  do_random_tests (void)
>  {
> @@ -221,6 +253,7 @@ test_main (void)
>      do_test (page_size / 2 - i, i, i, 1, 0x9B);
>
>    do_random_tests ();
> +  do_overflow_tests ();
>    return ret;
>  }
>
> diff --git a/string/test-strncat.c b/string/test-strncat.c
> index 2ef917b820..0ab7541d4e 100644
> --- a/string/test-strncat.c
> +++ b/string/test-strncat.c
> @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
>      }
>  }
>
> +static void
> +do_overflow_tests (void)
> +{
> +  size_t i, j, len;
> +  const size_t one = 1;
> +  CHAR *s1, *s2;
> +  uintptr_t s1_addr;
> +  s1 = (CHAR *) buf1;
> +  s2 = (CHAR *) buf2;
> +  s1_addr = (uintptr_t)s1;
> + for (j = 0; j < 200; ++j)
> +      s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
> + s2[200] = 0;
> +  for (i = 0; i < 750; ++i) {
> +    for (j = 0; j < i; ++j)
> +      s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
> +    s1[i] = '\0';
> +
> +       FOR_EACH_IMPL (impl, 0)
> +    {
> +      s2[0] = '\0';
> +      do_one_test (impl, s2, s1, SIZE_MAX - i);
> +      s2[0] = '\0';
> +      do_one_test (impl, s2, s1, i - s1_addr);
> +      s2[0] = '\0';
> +      do_one_test (impl, s2, s1, -s1_addr - i);
> +      s2[0] = '\0';
> +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
> +      s2[0] = '\0';
> +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
> +    }
> +
> +    len = 0;
> +    for (j = 8 * sizeof(size_t) - 1; j ; --j)
> +      {
> +        len |= one << j;
> +        FOR_EACH_IMPL (impl, 0)
> +          {
> +            s2[0] = '\0';
> +            do_one_test (impl, s2, s1, len - i);
> +            s2[0] = '\0';
> +            do_one_test (impl, s2, s1, len + i);
> +            s2[0] = '\0';
> +            do_one_test (impl, s2, s1, len - s1_addr - i);
> +            s2[0] = '\0';
> +            do_one_test (impl, s2, s1, len - s1_addr + i);
> +
> +            s2[0] = '\0';
> +            do_one_test (impl, s2, s1, ~len - i);
> +            s2[0] = '\0';
> +            do_one_test (impl, s2, s1, ~len + i);
> +            s2[0] = '\0';
> +            do_one_test (impl, s2, s1, ~len - s1_addr - i);
> +            s2[0] = '\0';
> +            do_one_test (impl, s2, s1, ~len - s1_addr + i);
> +          }
> +      }
> +  }
> +}
> +
>  static void
>  do_random_tests (void)
>  {
> @@ -316,6 +376,7 @@ test_main (void)
>      }
>
>    do_random_tests ();
> +  do_overflow_tests ();
>    return ret;
>  }
>
> diff --git a/string/test-strnlen.c b/string/test-strnlen.c
> index 920f58e97b..f53e09263f 100644
> --- a/string/test-strnlen.c
> +++ b/string/test-strnlen.c
> @@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
>      do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
>  }
>
> +static void
> +do_overflow_tests (void)
> +{
> +  size_t i, j, len;
> +  const size_t one = 1;
> +  uintptr_t buf_addr = (uintptr_t) buf1;
> +
> +  for (i = 0; i < 750; ++i)
> +    {
> +      do_test (0, i, SIZE_MAX - i, BIG_CHAR);
> +      do_test (0, i, i - buf_addr, BIG_CHAR);
> +      do_test (0, i, -buf_addr - i, BIG_CHAR);
> +      do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
> +      do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
> +
> +      len = 0;
> +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
> +        {
> +          len |= one << j;
> +          do_test (0, i, len - i, BIG_CHAR);
> +          do_test (0, i, len + i, BIG_CHAR);
> +          do_test (0, i, len - buf_addr - i, BIG_CHAR);
> +          do_test (0, i, len - buf_addr + i, BIG_CHAR);
> +
> +          do_test (0, i, ~len - i, BIG_CHAR);
> +          do_test (0, i, ~len + i, BIG_CHAR);
> +          do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
> +          do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
> +        }
> +    }
> +}
> +
>  static void
>  do_random_tests (void)
>  {
> @@ -283,6 +315,7 @@ test_main (void)
>    do_random_tests ();
>    do_page_tests ();
>    do_page_2_tests ();
> +  do_overflow_tests ();
>    return ret;
>  }
>
> --
> 2.25.1
>


-- 
H.J.


* Re: [PATCH v1 1/3] String: Add additional overflow tests for strnlen,  memchr, and strncat
  2021-06-09 21:53 ` [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat H.J. Lu
@ 2021-06-09 22:26   ` Noah Goldstein
  2021-06-22 15:43     ` Noah Goldstein
  0 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-09 22:26 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Wed, Jun 9, 2021 at 5:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> On Wed, Jun 9, 2021 at 1:53 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> > This commit adds tests for a bug in the wide char variant of the
> > functions where the implementation may assume that maxlen for wcsnlen
> > or n for wmemchr/strncat will not overflow when multiplied by
> > sizeof(wchar_t).
> >
> > These tests show the following implementations failing on x86_64:
> >
> > wcsnlen-sse4_1
> > wcsnlen-avx2
> >
> > wmemchr-sse2
> > wmemchr-avx2
> >
> > strncat would fail as well if it were run on a system that preferred
> > either of the failing wcsnlen implementations, as it relies on
> > wcsnlen.
>
> Please open a bug report for each standard C function.   We need to
> track them for backporting to release branches.
>

Done: https://sourceware.org/bugzilla/show_bug.cgi?id=27974


>
> Thanks.
>
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> >  string/test-memchr.c  | 39 ++++++++++++++++++++++++---
> >  string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
> >  string/test-strnlen.c | 33 +++++++++++++++++++++++
> >  3 files changed, 130 insertions(+), 3 deletions(-)
> >
> > diff --git a/string/test-memchr.c b/string/test-memchr.c
> > index 665edc32af..ce964284aa 100644
> > --- a/string/test-memchr.c
> > +++ b/string/test-memchr.c
> > @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c,
> size_t n, CHAR *exp_res)
> >    CHAR *res = CALL (impl, s, c, n);
> >    if (res != exp_res)
> >      {
> > -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
> > -            res, exp_res);
> > +      error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p !=
> %p",
> > +             impl->name, s, c, n, res, exp_res);
> >        ret = 1;
> >        return;
> >      }
> > @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t
> n, int seek_char)
> >      }
> >    buf[align + len] = 0;
> >
> > -  if (pos < len)
> > +  if (pos < MIN(n, len))
> >      {
> >        buf[align + pos] = seek_char;
> >        buf[align + len] = -seek_char;
> > @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len,
> size_t n, int seek_char)
> >      do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
> >  }
> >
> > +static void
> > +do_overflow_tests (void)
> > +{
> > +  size_t i, j, len;
> > +  const size_t one = 1;
> > +  uintptr_t buf_addr = (uintptr_t) buf1;
> > +
> > +  for (i = 0; i < 750; ++i)
> > +    {
> > +        do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
> > +        do_test (0, i, 751, i - buf_addr, BIG_CHAR);
> > +        do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
> > +        do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
> > +        do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
> > +
> > +      len = 0;
> > +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
> > +        {
> > +          len |= one << j;
> > +          do_test (0, i, 751, len - i, BIG_CHAR);
> > +          do_test (0, i, 751, len + i, BIG_CHAR);
> > +          do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
> > +          do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
> > +
> > +          do_test (0, i, 751, ~len - i, BIG_CHAR);
> > +          do_test (0, i, 751, ~len + i, BIG_CHAR);
> > +          do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
> > +          do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
> > +        }
> > +    }
> > +}
> > +
> >  static void
> >  do_random_tests (void)
> >  {
> > @@ -221,6 +253,7 @@ test_main (void)
> >      do_test (page_size / 2 - i, i, i, 1, 0x9B);
> >
> >    do_random_tests ();
> > +  do_overflow_tests ();
> >    return ret;
> >  }
> >
> > diff --git a/string/test-strncat.c b/string/test-strncat.c
> > index 2ef917b820..0ab7541d4e 100644
> > --- a/string/test-strncat.c
> > +++ b/string/test-strncat.c
> > @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1,
> size_t len2,
> >      }
> >  }
> >
> > +static void
> > +do_overflow_tests (void)
> > +{
> > +  size_t i, j, len;
> > +  const size_t one = 1;
> > +  CHAR *s1, *s2;
> > +  uintptr_t s1_addr;
> > +  s1 = (CHAR *) buf1;
> > +  s2 = (CHAR *) buf2;
> > +  s1_addr = (uintptr_t)s1;
> > + for (j = 0; j < 200; ++j)
> > +      s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
> > + s2[200] = 0;
> > +  for (i = 0; i < 750; ++i) {
> > +    for (j = 0; j < i; ++j)
> > +      s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
> > +    s1[i] = '\0';
> > +
> > +       FOR_EACH_IMPL (impl, 0)
> > +    {
> > +      s2[0] = '\0';
> > +      do_one_test (impl, s2, s1, SIZE_MAX - i);
> > +      s2[0] = '\0';
> > +      do_one_test (impl, s2, s1, i - s1_addr);
> > +      s2[0] = '\0';
> > +      do_one_test (impl, s2, s1, -s1_addr - i);
> > +      s2[0] = '\0';
> > +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
> > +      s2[0] = '\0';
> > +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
> > +    }
> > +
> > +    len = 0;
> > +    for (j = 8 * sizeof(size_t) - 1; j ; --j)
> > +      {
> > +        len |= one << j;
> > +        FOR_EACH_IMPL (impl, 0)
> > +          {
> > +            s2[0] = '\0';
> > +            do_one_test (impl, s2, s1, len - i);
> > +            s2[0] = '\0';
> > +            do_one_test (impl, s2, s1, len + i);
> > +            s2[0] = '\0';
> > +            do_one_test (impl, s2, s1, len - s1_addr - i);
> > +            s2[0] = '\0';
> > +            do_one_test (impl, s2, s1, len - s1_addr + i);
> > +
> > +            s2[0] = '\0';
> > +            do_one_test (impl, s2, s1, ~len - i);
> > +            s2[0] = '\0';
> > +            do_one_test (impl, s2, s1, ~len + i);
> > +            s2[0] = '\0';
> > +            do_one_test (impl, s2, s1, ~len - s1_addr - i);
> > +            s2[0] = '\0';
> > +            do_one_test (impl, s2, s1, ~len - s1_addr + i);
> > +          }
> > +      }
> > +  }
> > +}
> > +
> >  static void
> >  do_random_tests (void)
> >  {
> > @@ -316,6 +376,7 @@ test_main (void)
> >      }
> >
> >    do_random_tests ();
> > +  do_overflow_tests ();
> >    return ret;
> >  }
> >
> > diff --git a/string/test-strnlen.c b/string/test-strnlen.c
> > index 920f58e97b..f53e09263f 100644
> > --- a/string/test-strnlen.c
> > +++ b/string/test-strnlen.c
> > @@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int
> max_char)
> >      do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len,
> maxlen));
> >  }
> >
> > +static void
> > +do_overflow_tests (void)
> > +{
> > +  size_t i, j, len;
> > +  const size_t one = 1;
> > +  uintptr_t buf_addr = (uintptr_t) buf1;
> > +
> > +  for (i = 0; i < 750; ++i)
> > +    {
> > +      do_test (0, i, SIZE_MAX - i, BIG_CHAR);
> > +      do_test (0, i, i - buf_addr, BIG_CHAR);
> > +      do_test (0, i, -buf_addr - i, BIG_CHAR);
> > +      do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
> > +      do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
> > +
> > +      len = 0;
> > +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
> > +        {
> > +          len |= one << j;
> > +          do_test (0, i, len - i, BIG_CHAR);
> > +          do_test (0, i, len + i, BIG_CHAR);
> > +          do_test (0, i, len - buf_addr - i, BIG_CHAR);
> > +          do_test (0, i, len - buf_addr + i, BIG_CHAR);
> > +
> > +          do_test (0, i, ~len - i, BIG_CHAR);
> > +          do_test (0, i, ~len + i, BIG_CHAR);
> > +          do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
> > +          do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
> > +        }
> > +    }
> > +}
> > +
> >  static void
> >  do_random_tests (void)
> >  {
> > @@ -283,6 +315,7 @@ test_main (void)
> >    do_random_tests ();
> >    do_page_tests ();
> >    do_page_2_tests ();
> > +  do_overflow_tests ();
> >    return ret;
> >  }
> >
> > --
> > 2.25.1
> >
>
>
> --
> H.J.
>


* Re: [PATCH v1 1/3] String: Add additional overflow tests for strnlen,  memchr, and strncat
  2021-06-09 22:26   ` Noah Goldstein
@ 2021-06-22 15:43     ` Noah Goldstein
  2021-06-22 16:18       ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-22 15:43 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Wed, Jun 9, 2021 at 6:26 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

>
>
> On Wed, Jun 9, 2021 at 5:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
>> On Wed, Jun 9, 2021 at 1:53 PM Noah Goldstein <goldstein.w.n@gmail.com>
>> wrote:
>> >
>> > This commit adds tests for a bug in the wide char variant of the
>> > functions where the implementation may assume that maxlen for wcsnlen
>> > or n for wmemchr/strncat will not overflow when multiplied by
>> > sizeof(wchar_t).
>> >
>> > These tests show the following implementations failing on x86_64:
>> >
>> > wcsnlen-sse4_1
>> > wcsnlen-avx2
>> >
>> > wmemchr-sse2
>> > wmemchr-avx2
>> >
>> > strncat would fail as well if it were run on a system that preferred
>> > either of the failing wcsnlen implementations, as it relies on
>> > wcsnlen.
>>
>> Please open a bug report for each standard C function.   We need to
>> track them for backporting to release branches.
>>
>
> Done: https://sourceware.org/bugzilla/show_bug.cgi?id=27974
>
>
>>
>> Thanks.
>>
>> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
>> > ---
>> >  string/test-memchr.c  | 39 ++++++++++++++++++++++++---
>> >  string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
>> >  string/test-strnlen.c | 33 +++++++++++++++++++++++
>> >  3 files changed, 130 insertions(+), 3 deletions(-)
>> >
>> > diff --git a/string/test-memchr.c b/string/test-memchr.c
>> > index 665edc32af..ce964284aa 100644
>> > --- a/string/test-memchr.c
>> > +++ b/string/test-memchr.c
>> > @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c,
>> size_t n, CHAR *exp_res)
>> >    CHAR *res = CALL (impl, s, c, n);
>> >    if (res != exp_res)
>> >      {
>> > -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
>> > -            res, exp_res);
>> > +      error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p !=
>> %p",
>> > +             impl->name, s, c, n, res, exp_res);
>> >        ret = 1;
>> >        return;
>> >      }
>> > @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t
>> n, int seek_char)
>> >      }
>> >    buf[align + len] = 0;
>> >
>> > -  if (pos < len)
>> > +  if (pos < MIN(n, len))
>> >      {
>> >        buf[align + pos] = seek_char;
>> >        buf[align + len] = -seek_char;
>> > @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len,
>> size_t n, int seek_char)
>> >      do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
>> >  }
>> >
>> > +static void
>> > +do_overflow_tests (void)
>> > +{
>> > +  size_t i, j, len;
>> > +  const size_t one = 1;
>> > +  uintptr_t buf_addr = (uintptr_t) buf1;
>> > +
>> > +  for (i = 0; i < 750; ++i)
>> > +    {
>> > +        do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
>> > +        do_test (0, i, 751, i - buf_addr, BIG_CHAR);
>> > +        do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
>> > +        do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
>> > +        do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
>> > +
>> > +      len = 0;
>> > +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
>> > +        {
>> > +          len |= one << j;
>> > +          do_test (0, i, 751, len - i, BIG_CHAR);
>> > +          do_test (0, i, 751, len + i, BIG_CHAR);
>> > +          do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
>> > +          do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
>> > +
>> > +          do_test (0, i, 751, ~len - i, BIG_CHAR);
>> > +          do_test (0, i, 751, ~len + i, BIG_CHAR);
>> > +          do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
>> > +          do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
>> > +        }
>> > +    }
>> > +}
>> > +
>> >  static void
>> >  do_random_tests (void)
>> >  {
>> > @@ -221,6 +253,7 @@ test_main (void)
>> >      do_test (page_size / 2 - i, i, i, 1, 0x9B);
>> >
>> >    do_random_tests ();
>> > +  do_overflow_tests ();
>> >    return ret;
>> >  }
>> >
>> > diff --git a/string/test-strncat.c b/string/test-strncat.c
>> > index 2ef917b820..0ab7541d4e 100644
>> > --- a/string/test-strncat.c
>> > +++ b/string/test-strncat.c
>> > @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t
>> len1, size_t len2,
>> >      }
>> >  }
>> >
>> > +static void
>> > +do_overflow_tests (void)
>> > +{
>> > +  size_t i, j, len;
>> > +  const size_t one = 1;
>> > +  CHAR *s1, *s2;
>> > +  uintptr_t s1_addr;
>> > +  s1 = (CHAR *) buf1;
>> > +  s2 = (CHAR *) buf2;
>> > +  s1_addr = (uintptr_t)s1;
>> > + for (j = 0; j < 200; ++j)
>> > +      s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
>> > + s2[200] = 0;
>> > +  for (i = 0; i < 750; ++i) {
>> > +    for (j = 0; j < i; ++j)
>> > +      s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
>> > +    s1[i] = '\0';
>> > +
>> > +       FOR_EACH_IMPL (impl, 0)
>> > +    {
>> > +      s2[0] = '\0';
>> > +      do_one_test (impl, s2, s1, SIZE_MAX - i);
>> > +      s2[0] = '\0';
>> > +      do_one_test (impl, s2, s1, i - s1_addr);
>> > +      s2[0] = '\0';
>> > +      do_one_test (impl, s2, s1, -s1_addr - i);
>> > +      s2[0] = '\0';
>> > +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
>> > +      s2[0] = '\0';
>> > +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
>> > +    }
>> > +
>> > +    len = 0;
>> > +    for (j = 8 * sizeof(size_t) - 1; j ; --j)
>> > +      {
>> > +        len |= one << j;
>> > +        FOR_EACH_IMPL (impl, 0)
>> > +          {
>> > +            s2[0] = '\0';
>> > +            do_one_test (impl, s2, s1, len - i);
>> > +            s2[0] = '\0';
>> > +            do_one_test (impl, s2, s1, len + i);
>> > +            s2[0] = '\0';
>> > +            do_one_test (impl, s2, s1, len - s1_addr - i);
>> > +            s2[0] = '\0';
>> > +            do_one_test (impl, s2, s1, len - s1_addr + i);
>> > +
>> > +            s2[0] = '\0';
>> > +            do_one_test (impl, s2, s1, ~len - i);
>> > +            s2[0] = '\0';
>> > +            do_one_test (impl, s2, s1, ~len + i);
>> > +            s2[0] = '\0';
>> > +            do_one_test (impl, s2, s1, ~len - s1_addr - i);
>> > +            s2[0] = '\0';
>> > +            do_one_test (impl, s2, s1, ~len - s1_addr + i);
>> > +          }
>> > +      }
>> > +  }
>> > +}
>> > +
>> >  static void
>> >  do_random_tests (void)
>> >  {
>> > @@ -316,6 +376,7 @@ test_main (void)
>> >      }
>> >
>> >    do_random_tests ();
>> > +  do_overflow_tests ();
>> >    return ret;
>> >  }
>> >
>> > diff --git a/string/test-strnlen.c b/string/test-strnlen.c
>> > index 920f58e97b..f53e09263f 100644
>> > --- a/string/test-strnlen.c
>> > +++ b/string/test-strnlen.c
>> > @@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen,
>> int max_char)
>> >      do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len,
>> maxlen));
>> >  }
>> >
>> > +static void
>> > +do_overflow_tests (void)
>> > +{
>> > +  size_t i, j, len;
>> > +  const size_t one = 1;
>> > +  uintptr_t buf_addr = (uintptr_t) buf1;
>> > +
>> > +  for (i = 0; i < 750; ++i)
>> > +    {
>> > +      do_test (0, i, SIZE_MAX - i, BIG_CHAR);
>> > +      do_test (0, i, i - buf_addr, BIG_CHAR);
>> > +      do_test (0, i, -buf_addr - i, BIG_CHAR);
>> > +      do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
>> > +      do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
>> > +
>> > +      len = 0;
>> > +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
>> > +        {
>> > +          len |= one << j;
>> > +          do_test (0, i, len - i, BIG_CHAR);
>> > +          do_test (0, i, len + i, BIG_CHAR);
>> > +          do_test (0, i, len - buf_addr - i, BIG_CHAR);
>> > +          do_test (0, i, len - buf_addr + i, BIG_CHAR);
>> > +
>> > +          do_test (0, i, ~len - i, BIG_CHAR);
>> > +          do_test (0, i, ~len + i, BIG_CHAR);
>> > +          do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
>> > +          do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
>> > +        }
>> > +    }
>> > +}
>> > +
>> >  static void
>> >  do_random_tests (void)
>> >  {
>> > @@ -283,6 +315,7 @@ test_main (void)
>> >    do_random_tests ();
>> >    do_page_tests ();
>> >    do_page_2_tests ();
>> > +  do_overflow_tests ();
>> >    return ret;
>> >  }
>> >
>> > --
>> > 2.25.1
>> >
>>
>>
>> --
>> H.J.
>>
>
Ping if we want this in 2.34


* Re: [PATCH v1 1/3] String: Add additional overflow tests for strnlen,  memchr, and strncat
  2021-06-22 15:43     ` Noah Goldstein
@ 2021-06-22 16:18       ` H.J. Lu
  2021-06-22 18:23         ` Noah Goldstein
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2021-06-22 16:18 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jun 22, 2021 at 8:43 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
> On Wed, Jun 9, 2021 at 6:26 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>>
>>
>> On Wed, Jun 9, 2021 at 5:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>>
>>> On Wed, Jun 9, 2021 at 1:53 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>> >
>>> > This commit adds tests for a bug in the wide char variant of the
>>> > functions where the implementation may assume that maxlen for wcsnlen
>>> > or n for wmemchr/strncat will not overflow when multiplied by
>>> > sizeof(wchar_t).
>>> >
>>> > These tests show the following implementations failing on x86_64:
>>> >
>>> > wcsnlen-sse4_1
>>> > wcsnlen-avx2
>>> >
>>> > wmemchr-sse2
>>> > wmemchr-avx2
>>> >
>>> > strncat would fail as well if it were on a system that preferred
>>> > either of the wcsnlen implementations that failed, as it relies on
>>> > wcsnlen.
>>>
>>> Please open a bug report for each standard C function.   We need to
>>> track them for backporting to release branches.
>>
>>
>> Done: https://sourceware.org/bugzilla/show_bug.cgi?id=27974
>>
>>>
>>>
>>> Thanks.
>>>
>>> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
>>> > ---
>>> >  string/test-memchr.c  | 39 ++++++++++++++++++++++++---
>>> >  string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
>>> >  string/test-strnlen.c | 33 +++++++++++++++++++++++
>>> >  3 files changed, 130 insertions(+), 3 deletions(-)
>>> >
>>> > diff --git a/string/test-memchr.c b/string/test-memchr.c
>>> > index 665edc32af..ce964284aa 100644
>>> > --- a/string/test-memchr.c
>>> > +++ b/string/test-memchr.c
>>> > @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
>>> >    CHAR *res = CALL (impl, s, c, n);
>>> >    if (res != exp_res)
>>> >      {
>>> > -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
>>> > -            res, exp_res);
>>> > +      error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
>>> > +             impl->name, s, c, n, res, exp_res);
>>> >        ret = 1;
>>> >        return;
>>> >      }
>>> > @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
>>> >      }
>>> >    buf[align + len] = 0;
>>> >
>>> > -  if (pos < len)
>>> > +  if (pos < MIN(n, len))
>>> >      {
>>> >        buf[align + pos] = seek_char;
>>> >        buf[align + len] = -seek_char;
>>> > @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
>>> >      do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
>>> >  }
>>> >
>>> > +static void
>>> > +do_overflow_tests (void)
>>> > +{
>>> > +  size_t i, j, len;
>>> > +  const size_t one = 1;
>>> > +  uintptr_t buf_addr = (uintptr_t) buf1;
>>> > +
>>> > +  for (i = 0; i < 750; ++i)
>>> > +    {
>>> > +        do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
>>> > +        do_test (0, i, 751, i - buf_addr, BIG_CHAR);
>>> > +        do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
>>> > +        do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
>>> > +        do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
>>> > +
>>> > +      len = 0;
>>> > +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
>>> > +        {
>>> > +          len |= one << j;
>>> > +          do_test (0, i, 751, len - i, BIG_CHAR);
>>> > +          do_test (0, i, 751, len + i, BIG_CHAR);
>>> > +          do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
>>> > +          do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
>>> > +
>>> > +          do_test (0, i, 751, ~len - i, BIG_CHAR);
>>> > +          do_test (0, i, 751, ~len + i, BIG_CHAR);
>>> > +          do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
>>> > +          do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
>>> > +        }
>>> > +    }
>>> > +}
>>> > +
>>> >  static void
>>> >  do_random_tests (void)
>>> >  {
>>> > @@ -221,6 +253,7 @@ test_main (void)
>>> >      do_test (page_size / 2 - i, i, i, 1, 0x9B);
>>> >
>>> >    do_random_tests ();
>>> > +  do_overflow_tests ();
>>> >    return ret;
>>> >  }
>>> >
>>> > diff --git a/string/test-strncat.c b/string/test-strncat.c
>>> > index 2ef917b820..0ab7541d4e 100644
>>> > --- a/string/test-strncat.c
>>> > +++ b/string/test-strncat.c
>>> > @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
>>> >      }
>>> >  }
>>> >
>>> > +static void
>>> > +do_overflow_tests (void)
>>> > +{
>>> > +  size_t i, j, len;
>>> > +  const size_t one = 1;
>>> > +  CHAR *s1, *s2;
>>> > +  uintptr_t s1_addr;
>>> > +  s1 = (CHAR *) buf1;
>>> > +  s2 = (CHAR *) buf2;
>>> > +  s1_addr = (uintptr_t)s1;
>>> > + for (j = 0; j < 200; ++j)
>>> > +      s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
>>> > + s2[200] = 0;
>>> > +  for (i = 0; i < 750; ++i) {
>>> > +    for (j = 0; j < i; ++j)
>>> > +      s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
>>> > +    s1[i] = '\0';
>>> > +
>>> > +       FOR_EACH_IMPL (impl, 0)
>>> > +    {
>>> > +      s2[0] = '\0';
>>> > +      do_one_test (impl, s2, s1, SIZE_MAX - i);
>>> > +      s2[0] = '\0';
>>> > +      do_one_test (impl, s2, s1, i - s1_addr);
>>> > +      s2[0] = '\0';
>>> > +      do_one_test (impl, s2, s1, -s1_addr - i);
>>> > +      s2[0] = '\0';
>>> > +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
>>> > +      s2[0] = '\0';
>>> > +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
>>> > +    }
>>> > +
>>> > +    len = 0;
>>> > +    for (j = 8 * sizeof(size_t) - 1; j ; --j)
>>> > +      {
>>> > +        len |= one << j;
>>> > +        FOR_EACH_IMPL (impl, 0)
>>> > +          {
>>> > +            s2[0] = '\0';
>>> > +            do_one_test (impl, s2, s1, len - i);
>>> > +            s2[0] = '\0';
>>> > +            do_one_test (impl, s2, s1, len + i);
>>> > +            s2[0] = '\0';
>>> > +            do_one_test (impl, s2, s1, len - s1_addr - i);
>>> > +            s2[0] = '\0';
>>> > +            do_one_test (impl, s2, s1, len - s1_addr + i);
>>> > +
>>> > +            s2[0] = '\0';
>>> > +            do_one_test (impl, s2, s1, ~len - i);
>>> > +            s2[0] = '\0';
>>> > +            do_one_test (impl, s2, s1, ~len + i);
>>> > +            s2[0] = '\0';
>>> > +            do_one_test (impl, s2, s1, ~len - s1_addr - i);
>>> > +            s2[0] = '\0';
>>> > +            do_one_test (impl, s2, s1, ~len - s1_addr + i);
>>> > +          }
>>> > +      }
>>> > +  }
>>> > +}
>>> > +
>>> >  static void
>>> >  do_random_tests (void)
>>> >  {
>>> > @@ -316,6 +376,7 @@ test_main (void)
>>> >      }
>>> >
>>> >    do_random_tests ();
>>> > +  do_overflow_tests ();
>>> >    return ret;
>>> >  }
>>> >
>>> > diff --git a/string/test-strnlen.c b/string/test-strnlen.c
>>> > index 920f58e97b..f53e09263f 100644
>>> > --- a/string/test-strnlen.c
>>> > +++ b/string/test-strnlen.c
>>> > @@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
>>> >      do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
>>> >  }
>>> >
>>> > +static void
>>> > +do_overflow_tests (void)
>>> > +{
>>> > +  size_t i, j, len;
>>> > +  const size_t one = 1;
>>> > +  uintptr_t buf_addr = (uintptr_t) buf1;
>>> > +
>>> > +  for (i = 0; i < 750; ++i)
>>> > +    {
>>> > +      do_test (0, i, SIZE_MAX - i, BIG_CHAR);
>>> > +      do_test (0, i, i - buf_addr, BIG_CHAR);
>>> > +      do_test (0, i, -buf_addr - i, BIG_CHAR);
>>> > +      do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
>>> > +      do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
>>> > +
>>> > +      len = 0;
>>> > +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
>>> > +        {
>>> > +          len |= one << j;
>>> > +          do_test (0, i, len - i, BIG_CHAR);
>>> > +          do_test (0, i, len + i, BIG_CHAR);
>>> > +          do_test (0, i, len - buf_addr - i, BIG_CHAR);
>>> > +          do_test (0, i, len - buf_addr + i, BIG_CHAR);
>>> > +
>>> > +          do_test (0, i, ~len - i, BIG_CHAR);
>>> > +          do_test (0, i, ~len + i, BIG_CHAR);
>>> > +          do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
>>> > +          do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
>>> > +        }
>>> > +    }
>>> > +}
>>> > +
>>> >  static void
>>> >  do_random_tests (void)
>>> >  {
>>> > @@ -283,6 +315,7 @@ test_main (void)
>>> >    do_random_tests ();
>>> >    do_page_tests ();
>>> >    do_page_2_tests ();
>>> > +  do_overflow_tests ();
>>> >    return ret;
>>> >  }
>>> >
>>> > --
>>> > 2.25.1
>>> >
>>>
>>>
>>> --
>>> H.J.
>
>
> Ping if we want this in 2.34

Can you repost the patches with BZ# in the commit log?

Thanks.

-- 
H.J.


* [PATCH v2 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974]
  2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
                   ` (2 preceding siblings ...)
  2021-06-09 21:53 ` [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat H.J. Lu
@ 2021-06-22 18:11 ` Noah Goldstein
  2021-06-22 21:24   ` H.J. Lu
  2021-06-22 18:11 ` [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-22 18:11 UTC (permalink / raw)
  To: libc-alpha

This commit adds tests for a bug in the wide char variant of the
functions where the implementation may assume that maxlen for wcsnlen
or n for wmemchr/strncat will not overflow when multiplied by
sizeof(wchar_t).

These tests show the following implementations failing on x86_64:

wcsnlen-sse4_1
wcsnlen-avx2

wmemchr-sse2
wmemchr-avx2

strncat would fail as well if it were on a system that preferred
either of the wcsnlen implementations that failed, as it relies on
wcsnlen.
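
As an illustration (not part of the patch), here is a minimal C
sketch of the failure mode these tests target, assuming an LP64
target with 4-byte wchar_t:

    #include <stdio.h>
    #include <wchar.h>

    int
    main (void)
    {
      wchar_t buf[16] = L"hello world";  /* 11 characters + L'\0' */
      /* Chosen so that maxlen * sizeof (wchar_t) wraps to 4:
         ((2^62 + 1) * 4) mod 2^64 == 4.  */
      size_t maxlen = ((size_t) 1 << 62) + 1;
      /* Correct result: 11.  An implementation that pre-multiplies
         maxlen by sizeof (wchar_t) with no overflow check scans only
         4 bytes (one wchar_t) and returns 1 instead.  */
      printf ("%zu\n", wcsnlen (buf, maxlen));
      return 0;
    }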

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
Some notes:

I only tested this patch (and the subsequent fixes) on a machine that
prefers EVEX.

The fix for wcsnlen-sse2 is possibly invalid. What it does is check
whether the computation maxlen * sizeof(wchar_t) + s overflows, and if
so just call wcslen. The rationale is that either the end of the
string will be found in readable memory, or the user invoked undefined
behavior by calling wcsnlen on a string that is not contained in valid
memory and without a maxlen that bounds it within valid memory.
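
A rough C analogue of that strategy (a sketch under the rationale
above; the helper name is illustrative, and the real fix is in
assembly):

    #include <stdint.h>
    #include <wchar.h>

    size_t
    wcsnlen_overflow_safe (const wchar_t *s, size_t maxlen)
    {
      uintptr_t addr = (uintptr_t) s;
      /* If s + maxlen * sizeof (wchar_t) cannot be computed without
         wrapping, a well-defined call must have its terminator in
         readable memory, so plain wcslen gives the same answer.  */
      if (maxlen > (SIZE_MAX - addr) / sizeof (wchar_t))
        return wcslen (s);
      size_t i;
      for (i = 0; i < maxlen && s[i] != L'\0'; ++i)
        ;
      return i;
    }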

 string/test-memchr.c  | 39 ++++++++++++++++++++++++---
 string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
 string/test-strnlen.c | 33 +++++++++++++++++++++++
 3 files changed, 130 insertions(+), 3 deletions(-)

diff --git a/string/test-memchr.c b/string/test-memchr.c
index 665edc32af..ce964284aa 100644
--- a/string/test-memchr.c
+++ b/string/test-memchr.c
@@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
   CHAR *res = CALL (impl, s, c, n);
   if (res != exp_res)
     {
-      error (0, 0, "Wrong result in function %s %p %p", impl->name,
-	     res, exp_res);
+      error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
+             impl->name, s, c, n, res, exp_res);
       ret = 1;
       return;
     }
@@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
     }
   buf[align + len] = 0;
 
-  if (pos < len)
+  if (pos < MIN(n, len))
     {
       buf[align + pos] = seek_char;
       buf[align + len] = -seek_char;
@@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
     do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  uintptr_t buf_addr = (uintptr_t) buf1;
+
+  for (i = 0; i < 750; ++i)
+    {
+        do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
+        do_test (0, i, 751, i - buf_addr, BIG_CHAR);
+        do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
+        do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
+        do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+      len = 0;
+      for (j = 8 * sizeof(size_t) - 1; j ; --j)
+        {
+          len |= one << j;
+          do_test (0, i, 751, len - i, BIG_CHAR);
+          do_test (0, i, 751, len + i, BIG_CHAR);
+          do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
+
+          do_test (0, i, 751, ~len - i, BIG_CHAR);
+          do_test (0, i, 751, ~len + i, BIG_CHAR);
+          do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
+        }
+    }
+}
+
 static void
 do_random_tests (void)
 {
@@ -221,6 +253,7 @@ test_main (void)
     do_test (page_size / 2 - i, i, i, 1, 0x9B);
 
   do_random_tests ();
+  do_overflow_tests ();
   return ret;
 }
 
diff --git a/string/test-strncat.c b/string/test-strncat.c
index 2ef917b820..37ea26ea05 100644
--- a/string/test-strncat.c
+++ b/string/test-strncat.c
@@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
     }
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  CHAR *s1, *s2;
+  uintptr_t s1_addr;
+  s1 = (CHAR *) buf1;
+  s2 = (CHAR *) buf2;
+  s1_addr = (uintptr_t)s1;
+ for (j = 0; j < 200; ++j)
+      s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
+ s2[200] = 0;
+  for (i = 0; i < 750; ++i) {
+    for (j = 0; j < i; ++j)
+      s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
+    s1[i] = '\0';
+
+       FOR_EACH_IMPL (impl, 0)
+    {
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, SIZE_MAX - i);
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, i - s1_addr);
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, -s1_addr - i);
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
+    }
+
+    len = 0;
+    for (j = 8 * sizeof(size_t) - 1; j ; --j)
+      {
+        len |= one << j;
+        FOR_EACH_IMPL (impl, 0)
+          {
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, len - i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, len + i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, len - s1_addr - i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, len - s1_addr + i);
+
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, ~len - i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, ~len + i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, ~len - s1_addr - i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, ~len - s1_addr + i);
+          }
+      }
+  }
+}
+
 static void
 do_random_tests (void)
 {
@@ -316,6 +376,7 @@ test_main (void)
     }
 
   do_random_tests ();
+  do_overflow_tests ();
   return ret;
 }
 
diff --git a/string/test-strnlen.c b/string/test-strnlen.c
index 920f58e97b..f53e09263f 100644
--- a/string/test-strnlen.c
+++ b/string/test-strnlen.c
@@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
     do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  uintptr_t buf_addr = (uintptr_t) buf1;
+
+  for (i = 0; i < 750; ++i)
+    {
+      do_test (0, i, SIZE_MAX - i, BIG_CHAR);
+      do_test (0, i, i - buf_addr, BIG_CHAR);
+      do_test (0, i, -buf_addr - i, BIG_CHAR);
+      do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
+      do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+      len = 0;
+      for (j = 8 * sizeof(size_t) - 1; j ; --j)
+        {
+          len |= one << j;
+          do_test (0, i, len - i, BIG_CHAR);
+          do_test (0, i, len + i, BIG_CHAR);
+          do_test (0, i, len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, len - buf_addr + i, BIG_CHAR);
+
+          do_test (0, i, ~len - i, BIG_CHAR);
+          do_test (0, i, ~len + i, BIG_CHAR);
+          do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
+        }
+    }
+}
+
 static void
 do_random_tests (void)
 {
@@ -283,6 +315,7 @@ test_main (void)
   do_random_tests ();
   do_page_tests ();
   do_page_2_tests ();
+  do_overflow_tests ();
   return ret;
 }
 
-- 
2.25.1



* [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ #27974]
  2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
                   ` (3 preceding siblings ...)
  2021-06-22 18:11 ` [PATCH v2 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974] Noah Goldstein
@ 2021-06-22 18:11 ` Noah Goldstein
  2021-06-22 21:24   ` H.J. Lu
  2021-06-22 18:11 ` [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-22 18:11 UTC (permalink / raw)
  To: libc-alpha

This commit fixes the bug mentioned in the previous commit.

The previous implementations of wmemchr in these files relied on
n * sizeof(wchar_t) not overflowing, which is not guaranteed by the
standard.

The new overflow tests added in the previous commit now pass (as do
all the other tests).
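
The essence of the fix is to keep the remaining count in wchar_t
units (CHAR_PER_VEC characters per vector) instead of converting n to
bytes up front. A scalar C sketch of that invariant (the name is
illustrative; the real code processes whole vectors):

    #include <stddef.h>
    #include <wchar.h>

    wchar_t *
    wmemchr_sketch (const wchar_t *s, wchar_t c, size_t n)
    {
      /* The count is only ever decremented in character units; it is
         never multiplied by sizeof (wchar_t), so it cannot wrap.  */
      while (n--)
        {
          if (*s == c)
            return (wchar_t *) s;
          ++s;
        }
      return NULL;
    }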

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/memchr.S                | 77 +++++++++++++++++++-------
 sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
 2 files changed, 98 insertions(+), 37 deletions(-)

diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index beff2708de..3ddc4655cf 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
 #ifdef USE_AS_WMEMCHR
 # define MEMCHR		wmemchr
 # define PCMPEQ		pcmpeqd
+# define CHAR_PER_VEC	4
 #else
 # define MEMCHR		memchr
 # define PCMPEQ		pcmpeqb
+# define CHAR_PER_VEC	16
 #endif
 
 /* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
 	movd	%esi, %xmm1
 	mov	%edi, %ecx
 
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#endif
 #ifdef USE_AS_WMEMCHR
 	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
-	shl	$2, %RDX_LP
 #else
-# ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%edx, %edx
-# endif
 	punpcklbw %xmm1, %xmm1
 	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
 	test	%eax, %eax
 
 	jnz	L(matches_1)
-	sub	$16, %rdx
+	sub	$CHAR_PER_VEC, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
 	and	$15, %ecx
 	and	$-16, %rdi
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	add	%rcx, %rdx
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	jmp	L(loop_prolog)
 
@@ -77,16 +81,21 @@ L(crosscache):
 	movdqa	(%rdi), %xmm0
 
 	PCMPEQ	%xmm1, %xmm0
-/* Check if there is a match.  */
+	/* Check if there is a match.  */
 	pmovmskb %xmm0, %eax
-/* Remove the leading bytes.  */
+	/* Remove the leading bytes.  */
 	sar	%cl, %eax
 	test	%eax, %eax
 	je	L(unaligned_no_match)
-/* Check which byte is a match.  */
+	/* Check which byte is a match.  */
 	bsf	%eax, %eax
-
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	add	%rdi, %rax
 	add	%rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):
 
 	.p2align 4
 L(unaligned_no_match):
-        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
 	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
 	   possible addition overflow.  */
 	neg	%rcx
 	add	$16, %rcx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	sub	%rcx, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 
 	.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
 	test	$0x3f, %rdi
 	jz	L(align64_loop)
 
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 
 	movdqa	(%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
 	mov	%rdi, %rcx
 	and	$-64, %rdi
 	and	$63, %ecx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	add	%rcx, %rdx
 
 	.p2align 4
 L(align64_loop):
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	movdqa	(%rdi), %xmm0
 	movdqa	16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):
 
 	.p2align 4
 L(exit_loop):
-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
 	jle	L(exit_loop_32)
 
 	movdqa	(%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32_1)
-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
 	jle	L(return_null)
 
 	PCMPEQ	48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):
 
 	.p2align 4
 L(exit_loop_32):
-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
 	movdqa	(%rdi), %xmm0
 	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches_1)
-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
 	jbe	L(return_null)
 
 	PCMPEQ	16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
 	.p2align 4
 L(matches_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	add	%rdi, %rax
 	ret
@@ -301,7 +322,13 @@ L(matches_1):
 	.p2align 4
 L(matches16_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	16(%rdi, %rax), %rax
 	ret
@@ -309,7 +336,13 @@ L(matches16_1):
 	.p2align 4
 L(matches32_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	32(%rdi, %rax), %rax
 	ret
@@ -317,7 +350,13 @@ L(matches32_1):
 	.p2align 4
 L(matches48_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	48(%rdi, %rax), %rax
 	ret
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 0d8758e3e7..afdb956502 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -54,21 +54,19 @@
 
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
-	test	%RDX_LP, %RDX_LP
-	jz	L(null)
-# endif
-# ifdef USE_AS_WMEMCHR
-	shl	$2, %RDX_LP
-# else
 #  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%edx, %edx
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+#  else
+	test	%RDX_LP, %RDX_LP
 #  endif
+	jz	L(null)
 # endif
 	/* Broadcast CHAR to YMMMATCH.  */
 	vmovd	%esi, %xmm0
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
 	vpmovmskb %ymm1, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* If length < CHAR_PER_VEC handle special.  */
-	cmpq	$VEC_SIZE, %rdx
+	cmpq	$CHAR_PER_VEC, %rdx
 	jbe	L(first_vec_x0)
 # endif
 	testl	%eax, %eax
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
 L(first_vec_x0):
 	/* Check if first match was before length.  */
 	tzcntl	%eax, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
 	xorl	%ecx, %ecx
 	cmpl	%eax, %edx
 	leaq	(%rdi, %rax), %rax
@@ -110,12 +112,12 @@ L(null):
 # endif
 	.p2align 4
 L(cross_page_boundary):
-	/* Save pointer before aligning as its original value is necessary
-	   for computer return address if byte is found or adjusting length
-	   if it is not and this is memchr.  */
+	/* Save pointer before aligning as its original value is
+	   necessary for computer return address if byte is found or
+	   adjusting length if it is not and this is memchr.  */
 	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
-	   rdi for rawmemchr.  */
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+	   and rdi for rawmemchr.  */
 	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
 	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
@@ -124,6 +126,10 @@ L(cross_page_boundary):
 	   match).  */
 	leaq	1(%ALGN_PTR_REG), %rsi
 	subq	%RRAW_PTR_REG, %rsi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %esi
+#  endif
 # endif
 	/* Remove the leading bytes.  */
 	sarxl	%ERAW_PTR_REG, %eax, %eax
@@ -181,6 +187,10 @@ L(cross_page_continue):
 	orq	$(VEC_SIZE - 1), %rdi
 	/* esi is for adjusting length to see if near the end.  */
 	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %esi
+#  endif
 # else
 	orq	$(VEC_SIZE - 1), %rdi
 L(cross_page_continue):
@@ -213,7 +223,7 @@ L(cross_page_continue):
 
 # ifndef USE_AS_RAWMEMCHR
 	/* Check if at last VEC_SIZE * 4 length.  */
-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(last_4x_vec_or_less_cmpeq)
 	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
 	   length.  */
@@ -221,6 +231,10 @@ L(cross_page_continue):
 	movl	%edi, %ecx
 	orq	$(VEC_SIZE * 4 - 1), %rdi
 	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
 	addq	%rcx, %rdx
 # else
 	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
@@ -250,15 +264,19 @@ L(loop_4x_vec):
 
 	subq	$-(VEC_SIZE * 4), %rdi
 
-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$(CHAR_PER_VEC * 4), %rdx
 	ja	L(loop_4x_vec)
 
-	/* Fall through into less than 4 remaining vectors of length case.
-	 */
+	/* Fall through into less than 4 remaining vectors of length
+	   case.  */
 	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	.p2align 4
 L(last_4x_vec_or_less):
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
 	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
 	jnz	L(first_vec_x1_check)
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
 L(last_4x_vec_or_less_cmpeq):
 	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
 	subq	$-(VEC_SIZE * 4), %rdi
 	/* Check first VEC regardless.  */
 	testl	%eax, %eax
-- 
2.25.1



* [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
  2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
                   ` (4 preceding siblings ...)
  2021-06-22 18:11 ` [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
@ 2021-06-22 18:11 ` Noah Goldstein
  2021-06-22 21:33   ` H.J. Lu
  2021-06-23  6:31 ` [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat " Noah Goldstein
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-22 18:11 UTC (permalink / raw)
  To: libc-alpha

This commit fixes the bug mentioned in the previous commit.

The previous implementations of wcsnlen in these files relied on
maxlen * sizeof(wchar_t) not overflowing, which is not guaranteed by
the standard.

The new overflow tests added in the previous commit now pass (as do
all the other tests).
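
A scalar C rendering of the guard added to sysdeps/x86_64/strlen.S
below (a sketch assuming LP64; the function name is illustrative and
wcslen stands in for the __wcslen_sse2 tail call):

    #include <stdint.h>
    #include <wchar.h>

    size_t
    wcsnlen_guarded (const wchar_t *s, size_t maxlen)
    {
      /* "sar $62": either of the top two bits set means that
         maxlen * sizeof (wchar_t) does not fit in 64 bits.  */
      if (maxlen >> 62 != 0)
        return wcslen (s);
      /* "add %RDI_LP, %RSI_LP" / "jbe": s + maxlen * 4 wrapped.  */
      if ((uintptr_t) s > UINTPTR_MAX - (maxlen << 2))
        return wcslen (s);
      /* Otherwise the bounded scan is safe.  */
      size_t i;
      for (i = 0; i < maxlen && s[i] != L'\0'; ++i)
        ;
      return i;
    }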

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
 sysdeps/x86_64/strlen.S                |  14 ++-
 2 files changed, 106 insertions(+), 38 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index bd2e6ee44a..b282a75613 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -44,21 +44,21 @@
 
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
 	/* Check zero length.  */
+#  ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RSI_LP, %RSI_LP
+#  else
 	test	%RSI_LP, %RSI_LP
+#  endif
 	jz	L(zero)
 	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
 	mov	%RSI_LP, %R8_LP
-#  ifdef USE_AS_WCSLEN
-	shl	$2, %RSI_LP
-#  elif defined __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-#  endif
 # endif
 	movl	%edi, %eax
 	movq	%rdi, %rdx
@@ -72,10 +72,10 @@ ENTRY (STRLEN)
 
 	/* Check the first VEC_SIZE bytes.  */
 	VPCMPEQ	(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 # ifdef USE_AS_STRNLEN
 	/* If length < VEC_SIZE handle special.  */
-	cmpq	$VEC_SIZE, %rsi
+	cmpq	$CHAR_PER_VEC, %rsi
 	jbe	L(first_vec_x0)
 # endif
 	/* If empty continue to aligned_more. Otherwise return bit
@@ -84,6 +84,7 @@ ENTRY (STRLEN)
 	jz	L(aligned_more)
 	tzcntl	%eax, %eax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -97,9 +98,14 @@ L(zero):
 L(first_vec_x0):
 	/* Set bit for max len so that tzcnt will return min of max len
 	   and position of first match.  */
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
 	btsq	%rsi, %rax
 	tzcntl	%eax, %eax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 #  endif
 	VZEROUPPER_RETURN
@@ -113,14 +119,19 @@ L(first_vec_x1):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE * 4 + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	incl	%edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -133,14 +144,19 @@ L(first_vec_x2):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE * 3 + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -153,14 +169,19 @@ L(first_vec_x3):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE * 2 + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE * 2 + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -173,14 +194,19 @@ L(first_vec_x4):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE * 3 + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -195,10 +221,14 @@ L(cross_page_continue):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
 # ifdef USE_AS_STRNLEN
-	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
-	   it simplies the logic in last_4x_vec_or_less.  */
+	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
+	   because it simplies the logic in last_4x_vec_or_less.  */
 	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
 	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
 # endif
 	/* Load first VEC regardless.  */
 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
@@ -207,34 +237,38 @@ L(cross_page_continue):
 	subq	%rcx, %rsi
 	jb	L(last_4x_vec_or_less)
 # endif
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x4)
 
 	/* Align data to VEC_SIZE * 4 - 1.  */
 # ifdef USE_AS_STRNLEN
 	/* Before adjusting length check if at last VEC_SIZE * 4.  */
-	cmpq	$(VEC_SIZE * 4 - 1), %rsi
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
 	jbe	L(last_4x_vec_or_less_load)
 	incq	%rdi
 	movl	%edi, %ecx
 	orq	$(VEC_SIZE * 4 - 1), %rdi
 	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
 	/* Readjust length.  */
 	addq	%rcx, %rsi
 # else
@@ -246,13 +280,13 @@ L(cross_page_continue):
 L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 	/* Break if at end of length.  */
-	subq	$(VEC_SIZE * 4), %rsi
+	subq	$(CHAR_PER_VEC * 4), %rsi
 	jb	L(last_4x_vec_or_less_cmpeq)
 # endif
-	/* Save some code size by microfusing VPMINU with the load. Since
-	   the matches in ymm2/ymm4 can only be returned if there where no
-	   matches in ymm1/ymm3 respectively there is no issue with overlap.
-	 */
+	/* Save some code size by microfusing VPMINU with the load.
+	   Since the matches in ymm2/ymm4 can only be returned if there
+	   where no matches in ymm1/ymm3 respectively there is no issue
+	   with overlap.  */
 	vmovdqa	1(%rdi), %ymm1
 	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
 	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
@@ -260,7 +294,7 @@ L(loop_4x_vec):
 
 	VPMINU	%ymm2, %ymm4, %ymm5
 	VPCMPEQ	%ymm5, %ymm0, %ymm5
-	vpmovmskb	%ymm5, %ecx
+	vpmovmskb %ymm5, %ecx
 
 	subq	$-(VEC_SIZE * 4), %rdi
 	testl	%ecx, %ecx
@@ -268,27 +302,28 @@ L(loop_4x_vec):
 
 
 	VPCMPEQ	%ymm1, %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	subq	%rdx, %rdi
 	testl	%eax, %eax
 	jnz	L(last_vec_return_x0)
 
 	VPCMPEQ	%ymm2, %ymm0, %ymm2
-	vpmovmskb	%ymm2, %eax
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_return_x1)
 
 	/* Combine last 2 VEC.  */
 	VPCMPEQ	%ymm3, %ymm0, %ymm3
-	vpmovmskb	%ymm3, %eax
-	/* rcx has combined result from all 4 VEC. It will only be used if
-	   the first 3 other VEC all did not contain a match.  */
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
 	salq	$32, %rcx
 	orq	%rcx, %rax
 	tzcntq	%rax, %rax
 	subq	$(VEC_SIZE * 2 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -297,15 +332,19 @@ L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 	.p2align 4
 L(last_4x_vec_or_less_load):
-	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
+	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
+	 */
 	subq	$-(VEC_SIZE * 4), %rdi
 L(last_4x_vec_or_less_cmpeq):
 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 L(last_4x_vec_or_less):
-
-	vpmovmskb	%ymm1, %eax
-	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
-	   VEC_SIZE * 4.  */
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
+	vpmovmskb %ymm1, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off
+	   by VEC_SIZE * 4.  */
 	testl	$(VEC_SIZE * 2), %esi
 	jnz	L(last_4x_vec)
 
@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
 	jb	L(max)
 
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 	cmpl	%eax, %esi
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
 	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
 	subq	$(VEC_SIZE * 4 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
 	subq	$(VEC_SIZE * 3 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
 	incl	%eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -381,14 +424,14 @@ L(last_4x_vec):
 	jnz	L(last_vec_x1)
 
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x2)
 
 	/* Normalize length.  */
 	andl	$(VEC_SIZE * 4 - 1), %esi
 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x3)
 
@@ -396,7 +439,7 @@ L(last_4x_vec):
 	jb	L(max)
 
 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 	cmpl	%eax, %esi
@@ -405,6 +448,7 @@ L(last_4x_vec):
 	addl	$(VEC_SIZE * 3 + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -419,6 +463,7 @@ L(last_vec_x1):
 	incl	%eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -432,6 +477,7 @@ L(last_vec_x2):
 	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -447,6 +493,7 @@ L(last_vec_x3):
 	addl	$(VEC_SIZE * 2 + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -455,13 +502,13 @@ L(max_end):
 	VZEROUPPER_RETURN
 # endif
 
-	/* Cold case for crossing page with first load.	 */
+	/* Cold case for crossing page with first load.  */
 	.p2align 4
 L(cross_page_boundary):
 	/* Align data to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
 	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
 	   so no need to manually mod rdx.  */
 	sarxl	%edx, %eax, %eax
@@ -470,6 +517,10 @@ L(cross_page_boundary):
 	jnz	L(cross_page_less_vec)
 	leaq	1(%rdi), %rcx
 	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %ecx
+#  endif
 	/* Check length.  */
 	cmpq	%rsi, %rcx
 	jb	L(cross_page_continue)
@@ -479,6 +530,7 @@ L(cross_page_boundary):
 	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide length by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 #  endif
 # endif
@@ -489,6 +541,10 @@ L(return_vzeroupper):
 	.p2align 4
 L(cross_page_less_vec):
 	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
 	cmpq	%rax, %rsi
 	cmovb	%esi, %eax
 #  ifdef USE_AS_WCSLEN
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index d223ea1700..3fc6734910 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -65,12 +65,24 @@ ENTRY(strlen)
 	ret
 L(n_nonzero):
 # ifdef AS_WCSLEN
-	shl	$2, %RSI_LP
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+   overflow the only way this program doesn't have undefined behavior 
+   is if there is a null terminator in valid memory so strlen will 
+   suffice.  */
+	mov	%RSI_LP, %R10_LP
+	sar	$62, %R10_LP
+	test	%R10_LP, %R10_LP
+	jnz	__wcslen_sse2
+	sal	$2, %RSI_LP
 # endif
 
 /* Initialize long lived registers.  */
 
 	add	%RDI_LP, %RSI_LP
+# ifdef AS_WCSLEN
+/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
+	jbe	__wcslen_sse2
+# endif
 	mov	%RSI_LP, %R10_LP
 	and	$-64, %R10_LP
 	mov	%RSI_LP, %R11_LP
-- 
2.25.1



* Re: [PATCH v1 1/3] String: Add additional overflow tests for strnlen,  memchr, and strncat
  2021-06-22 16:18       ` H.J. Lu
@ 2021-06-22 18:23         ` Noah Goldstein
  0 siblings, 0 replies; 27+ messages in thread
From: Noah Goldstein @ 2021-06-22 18:23 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jun 22, 2021 at 12:19 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> On Tue, Jun 22, 2021 at 8:43 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> >
> > On Wed, Jun 9, 2021 at 6:26 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >>
> >>
> >>
> >> On Wed, Jun 9, 2021 at 5:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>>
> >>> On Wed, Jun 9, 2021 at 1:53 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >>> >
> >>> > This commit adds tests for a bug in the wide char variant of the
> >>> > functions where the implementation may assume that maxlen for wcsnlen
> >>> > or n for wmemchr/strncat will not overflow when multiplied by
> >>> > sizeof(wchar_t).
> >>> >
> >>> > These tests show the following implementations failing on x86_64:
> >>> >
> >>> > wcsnlen-sse4_1
> >>> > wcsnlen-avx2
> >>> >
> >>> > wmemchr-sse2
> >>> > wmemchr-avx2
> >>> >
> >>> > strncat would fail as well if it were on a system that preferred
> >>> > either of the wcsnlen implementations that failed, as it relies on
> >>> > wcsnlen.
> >>>
> >>> Please open a bug report for each standard C function.   We need to
> >>> track them for backporting to release branches.
> >>
> >>
> >> Done: https://sourceware.org/bugzilla/show_bug.cgi?id=27974
> >>
> >>>
> >>>
> >>> Thanks.
> >>>
> >>> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> >>> > ---
> >>> >  string/test-memchr.c  | 39 ++++++++++++++++++++++++---
> >>> >  string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
> >>> >  string/test-strnlen.c | 33 +++++++++++++++++++++++
> >>> >  3 files changed, 130 insertions(+), 3 deletions(-)
> >>> >
> >>> > diff --git a/string/test-memchr.c b/string/test-memchr.c
> >>> > index 665edc32af..ce964284aa 100644
> >>> > --- a/string/test-memchr.c
> >>> > +++ b/string/test-memchr.c
> >>> > @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
> >>> >    CHAR *res = CALL (impl, s, c, n);
> >>> >    if (res != exp_res)
> >>> >      {
> >>> > -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
> >>> > -            res, exp_res);
> >>> > +      error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
> >>> > +             impl->name, s, c, n, res, exp_res);
> >>> >        ret = 1;
> >>> >        return;
> >>> >      }
> >>> > @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
> >>> >      }
> >>> >    buf[align + len] = 0;
> >>> >
> >>> > -  if (pos < len)
> >>> > +  if (pos < MIN(n, len))
> >>> >      {
> >>> >        buf[align + pos] = seek_char;
> >>> >        buf[align + len] = -seek_char;
> >>> > @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
> >>> >      do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
> >>> >  }
> >>> >
> >>> > +static void
> >>> > +do_overflow_tests (void)
> >>> > +{
> >>> > +  size_t i, j, len;
> >>> > +  const size_t one = 1;
> >>> > +  uintptr_t buf_addr = (uintptr_t) buf1;
> >>> > +
> >>> > +  for (i = 0; i < 750; ++i)
> >>> > +    {
> >>> > +        do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
> >>> > +        do_test (0, i, 751, i - buf_addr, BIG_CHAR);
> >>> > +        do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
> >>> > +        do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
> >>> > +        do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
> >>> > +
> >>> > +      len = 0;
> >>> > +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
> >>> > +        {
> >>> > +          len |= one << j;
> >>> > +          do_test (0, i, 751, len - i, BIG_CHAR);
> >>> > +          do_test (0, i, 751, len + i, BIG_CHAR);
> >>> > +          do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
> >>> > +          do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
> >>> > +
> >>> > +          do_test (0, i, 751, ~len - i, BIG_CHAR);
> >>> > +          do_test (0, i, 751, ~len + i, BIG_CHAR);
> >>> > +          do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
> >>> > +          do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
> >>> > +        }
> >>> > +    }
> >>> > +}
> >>> > +
> >>> >  static void
> >>> >  do_random_tests (void)
> >>> >  {
> >>> > @@ -221,6 +253,7 @@ test_main (void)
> >>> >      do_test (page_size / 2 - i, i, i, 1, 0x9B);
> >>> >
> >>> >    do_random_tests ();
> >>> > +  do_overflow_tests ();
> >>> >    return ret;
> >>> >  }
> >>> >
> >>> > diff --git a/string/test-strncat.c b/string/test-strncat.c
> >>> > index 2ef917b820..0ab7541d4e 100644
> >>> > --- a/string/test-strncat.c
> >>> > +++ b/string/test-strncat.c
> >>> > @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
> >>> >      }
> >>> >  }
> >>> >
> >>> > +static void
> >>> > +do_overflow_tests (void)
> >>> > +{
> >>> > +  size_t i, j, len;
> >>> > +  const size_t one = 1;
> >>> > +  CHAR *s1, *s2;
> >>> > +  uintptr_t s1_addr;
> >>> > +  s1 = (CHAR *) buf1;
> >>> > +  s2 = (CHAR *) buf2;
> >>> > +  s1_addr = (uintptr_t)s1;
> >>> > + for (j = 0; j < 200; ++j)
> >>> > +      s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
> >>> > + s2[200] = 0;
> >>> > +  for (i = 0; i < 750; ++i) {
> >>> > +    for (j = 0; j < i; ++j)
> >>> > +      s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
> >>> > +    s1[i] = '\0';
> >>> > +
> >>> > +       FOR_EACH_IMPL (impl, 0)
> >>> > +    {
> >>> > +      s2[0] = '\0';
> >>> > +      do_one_test (impl, s2, s1, SIZE_MAX - i);
> >>> > +      s2[0] = '\0';
> >>> > +      do_one_test (impl, s2, s1, i - s1_addr);
> >>> > +      s2[0] = '\0';
> >>> > +      do_one_test (impl, s2, s1, -s1_addr - i);
> >>> > +      s2[0] = '\0';
> >>> > +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
> >>> > +      s2[0] = '\0';
> >>> > +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
> >>> > +    }
> >>> > +
> >>> > +    len = 0;
> >>> > +    for (j = 8 * sizeof(size_t) - 1; j ; --j)
> >>> > +      {
> >>> > +        len |= one << j;
> >>> > +        FOR_EACH_IMPL (impl, 0)
> >>> > +          {
> >>> > +            s2[0] = '\0';
> >>> > +            do_one_test (impl, s2, s1, len - i);
> >>> > +            s2[0] = '\0';
> >>> > +            do_one_test (impl, s2, s1, len + i);
> >>> > +            s2[0] = '\0';
> >>> > +            do_one_test (impl, s2, s1, len - s1_addr - i);
> >>> > +            s2[0] = '\0';
> >>> > +            do_one_test (impl, s2, s1, len - s1_addr + i);
> >>> > +
> >>> > +            s2[0] = '\0';
> >>> > +            do_one_test (impl, s2, s1, ~len - i);
> >>> > +            s2[0] = '\0';
> >>> > +            do_one_test (impl, s2, s1, ~len + i);
> >>> > +            s2[0] = '\0';
> >>> > +            do_one_test (impl, s2, s1, ~len - s1_addr - i);
> >>> > +            s2[0] = '\0';
> >>> > +            do_one_test (impl, s2, s1, ~len - s1_addr + i);
> >>> > +          }
> >>> > +      }
> >>> > +  }
> >>> > +}
> >>> > +
> >>> >  static void
> >>> >  do_random_tests (void)
> >>> >  {
> >>> > @@ -316,6 +376,7 @@ test_main (void)
> >>> >      }
> >>> >
> >>> >    do_random_tests ();
> >>> > +  do_overflow_tests ();
> >>> >    return ret;
> >>> >  }
> >>> >
> >>> > diff --git a/string/test-strnlen.c b/string/test-strnlen.c
> >>> > index 920f58e97b..f53e09263f 100644
> >>> > --- a/string/test-strnlen.c
> >>> > +++ b/string/test-strnlen.c
> >>> > @@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
> >>> >      do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
> >>> >  }
> >>> >
> >>> > +static void
> >>> > +do_overflow_tests (void)
> >>> > +{
> >>> > +  size_t i, j, len;
> >>> > +  const size_t one = 1;
> >>> > +  uintptr_t buf_addr = (uintptr_t) buf1;
> >>> > +
> >>> > +  for (i = 0; i < 750; ++i)
> >>> > +    {
> >>> > +      do_test (0, i, SIZE_MAX - i, BIG_CHAR);
> >>> > +      do_test (0, i, i - buf_addr, BIG_CHAR);
> >>> > +      do_test (0, i, -buf_addr - i, BIG_CHAR);
> >>> > +      do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
> >>> > +      do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
> >>> > +
> >>> > +      len = 0;
> >>> > +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
> >>> > +        {
> >>> > +          len |= one << j;
> >>> > +          do_test (0, i, len - i, BIG_CHAR);
> >>> > +          do_test (0, i, len + i, BIG_CHAR);
> >>> > +          do_test (0, i, len - buf_addr - i, BIG_CHAR);
> >>> > +          do_test (0, i, len - buf_addr + i, BIG_CHAR);
> >>> > +
> >>> > +          do_test (0, i, ~len - i, BIG_CHAR);
> >>> > +          do_test (0, i, ~len + i, BIG_CHAR);
> >>> > +          do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
> >>> > +          do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
> >>> > +        }
> >>> > +    }
> >>> > +}
> >>> > +
> >>> >  static void
> >>> >  do_random_tests (void)
> >>> >  {
> >>> > @@ -283,6 +315,7 @@ test_main (void)
> >>> >    do_random_tests ();
> >>> >    do_page_tests ();
> >>> >    do_page_2_tests ();
> >>> > +  do_overflow_tests ();
> >>> >    return ret;
> >>> >  }
> >>> >
> >>> > --
> >>> > 2.25.1
> >>> >
> >>>
> >>>
> >>> --
> >>> H.J.
> >
> >
> > Ping if we want this in 2.34
>
> Can you repost the patches with BZ# in the commit log?
>

Done. (Not sure why the patch didn't come in as a reply to this one,
but it has just been posted.)

>
> Thanks.
>
> --
> H.J.
>


* Re: [PATCH v2 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974]
  2021-06-22 18:11 ` [PATCH v2 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974] Noah Goldstein
@ 2021-06-22 21:24   ` H.J. Lu
  0 siblings, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2021-06-22 21:24 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit adds tests for a bug in the wide char variant of the
> functions where the implementation may assume that maxlen for wcsnlen
> or n for wmemchr/strncat will not overflow when multiplied by
> sizeof(wchar_t).
>
> These tests show the following implementations failing on x86_64:
>
> wcsnlen-sse4_1
> wcsnlen-avx2
>
> wmemchr-sse2
> wmemchr-avx2
>
> strncat would fail as well if it were on a system that preferred
> either of the wcsnlen implementations that failed, as it relies on
> wcsnlen.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> Some notes:
>
> I only tested this patch (and the subsequent fixes) on a machine that
> prefers EVEX.
>
> The fix for wcsnlen-sse2 is possibly invalid. What it does is check
> whether the computation maxlen * sizeof(wchar_t) + s overflows, and if
> so just call wcslen. The rationale is that either the end of the
> string will be found in readable memory, or the user invoked undefined
> behavior by calling wcsnlen on a string that is not contained in valid
> memory and without a maxlen that bounds it within valid memory.
>
>  string/test-memchr.c  | 39 ++++++++++++++++++++++++---
>  string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
>  string/test-strnlen.c | 33 +++++++++++++++++++++++
>  3 files changed, 130 insertions(+), 3 deletions(-)
>
> diff --git a/string/test-memchr.c b/string/test-memchr.c
> index 665edc32af..ce964284aa 100644
> --- a/string/test-memchr.c
> +++ b/string/test-memchr.c
> @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
>    CHAR *res = CALL (impl, s, c, n);
>    if (res != exp_res)
>      {
> -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
> -            res, exp_res);
> +      error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
> +             impl->name, s, c, n, res, exp_res);
>        ret = 1;
>        return;
>      }
> @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
>      }
>    buf[align + len] = 0;
>
> -  if (pos < len)
> +  if (pos < MIN(n, len))
>      {
>        buf[align + pos] = seek_char;
>        buf[align + len] = -seek_char;
> @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
>      do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
>  }
>
> +static void
> +do_overflow_tests (void)
> +{
> +  size_t i, j, len;
> +  const size_t one = 1;
> +  uintptr_t buf_addr = (uintptr_t) buf1;
> +
> +  for (i = 0; i < 750; ++i)
> +    {
> +        do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
> +        do_test (0, i, 751, i - buf_addr, BIG_CHAR);
> +        do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
> +        do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
> +        do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
> +
> +      len = 0;
> +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
> +        {
> +          len |= one << j;
> +          do_test (0, i, 751, len - i, BIG_CHAR);
> +          do_test (0, i, 751, len + i, BIG_CHAR);
> +          do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
> +          do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
> +
> +          do_test (0, i, 751, ~len - i, BIG_CHAR);
> +          do_test (0, i, 751, ~len + i, BIG_CHAR);
> +          do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
> +          do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
> +        }
> +    }
> +}
> +
>  static void
>  do_random_tests (void)
>  {
> @@ -221,6 +253,7 @@ test_main (void)
>      do_test (page_size / 2 - i, i, i, 1, 0x9B);
>
>    do_random_tests ();
> +  do_overflow_tests ();
>    return ret;
>  }
>
> diff --git a/string/test-strncat.c b/string/test-strncat.c
> index 2ef917b820..37ea26ea05 100644
> --- a/string/test-strncat.c
> +++ b/string/test-strncat.c
> @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
>      }
>  }
>
> +static void
> +do_overflow_tests (void)
> +{
> +  size_t i, j, len;
> +  const size_t one = 1;
> +  CHAR *s1, *s2;
> +  uintptr_t s1_addr;
> +  s1 = (CHAR *) buf1;
> +  s2 = (CHAR *) buf2;
> +  s1_addr = (uintptr_t)s1;
> + for (j = 0; j < 200; ++j)
> +      s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
> + s2[200] = 0;
> +  for (i = 0; i < 750; ++i) {
> +    for (j = 0; j < i; ++j)
> +      s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
> +    s1[i] = '\0';
> +
> +       FOR_EACH_IMPL (impl, 0)
> +    {
> +      s2[200] = '\0';
> +      do_one_test (impl, s2, s1, SIZE_MAX - i);
> +      s2[200] = '\0';
> +      do_one_test (impl, s2, s1, i - s1_addr);
> +      s2[200] = '\0';
> +      do_one_test (impl, s2, s1, -s1_addr - i);
> +      s2[200] = '\0';
> +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
> +      s2[200] = '\0';
> +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
> +    }
> +
> +    len = 0;
> +    for (j = 8 * sizeof(size_t) - 1; j ; --j)
> +      {
> +        len |= one << j;
> +        FOR_EACH_IMPL (impl, 0)
> +          {
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, len - i);
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, len + i);
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, len - s1_addr - i);
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, len - s1_addr + i);
> +
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, ~len - i);
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, ~len + i);
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, ~len - s1_addr - i);
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, ~len - s1_addr + i);
> +          }
> +      }
> +  }
> +}
> +
>  static void
>  do_random_tests (void)
>  {
> @@ -316,6 +376,7 @@ test_main (void)
>      }
>
>    do_random_tests ();
> +  do_overflow_tests ();
>    return ret;
>  }
>
> diff --git a/string/test-strnlen.c b/string/test-strnlen.c
> index 920f58e97b..f53e09263f 100644
> --- a/string/test-strnlen.c
> +++ b/string/test-strnlen.c
> @@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
>      do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
>  }
>
> +static void
> +do_overflow_tests (void)
> +{
> +  size_t i, j, len;
> +  const size_t one = 1;
> +  uintptr_t buf_addr = (uintptr_t) buf1;
> +
> +  for (i = 0; i < 750; ++i)
> +    {
> +      do_test (0, i, SIZE_MAX - i, BIG_CHAR);
> +      do_test (0, i, i - buf_addr, BIG_CHAR);
> +      do_test (0, i, -buf_addr - i, BIG_CHAR);
> +      do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
> +      do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
> +
> +      len = 0;
> +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
> +        {
> +          len |= one << j;
> +          do_test (0, i, len - i, BIG_CHAR);
> +          do_test (0, i, len + i, BIG_CHAR);
> +          do_test (0, i, len - buf_addr - i, BIG_CHAR);
> +          do_test (0, i, len - buf_addr + i, BIG_CHAR);
> +
> +          do_test (0, i, ~len - i, BIG_CHAR);
> +          do_test (0, i, ~len + i, BIG_CHAR);
> +          do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
> +          do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
> +        }
> +    }
> +}
> +
>  static void
>  do_random_tests (void)
>  {
> @@ -283,6 +315,7 @@ test_main (void)
>    do_random_tests ();
>    do_page_tests ();
>    do_page_2_tests ();
> +  do_overflow_tests ();
>    return ret;
>  }
>
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.
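
To make the failure mode concrete, here is a minimal C sketch, assuming
64-bit size_t and sizeof (wchar_t) == 4; naive_end_wraps is a
hypothetical name and is not part of the patch or of glibc:

#include <stdint.h>
#include <wchar.h>

/* Hypothetical helper, not glibc code.  A wide-char implementation
   that forms s + n * sizeof (wchar_t) up front goes wrong exactly
   when the multiplication or the end-pointer addition wraps; counts
   like SIZE_MAX - i, SIZE_MAX - buf_addr + i and the len | (one << j)
   values in the tests above are chosen to provoke one of the two.  */
static int
naive_end_wraps (const wchar_t *s, size_t n)
{
  size_t bytes = n * sizeof (wchar_t);    /* can wrap modulo 2^64 */
  uintptr_t end = (uintptr_t) s + bytes;  /* can wrap past the address space */
  return bytes / sizeof (wchar_t) != n || end < (uintptr_t) s;
}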

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ #27974]
  2021-06-22 18:11 ` [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
@ 2021-06-22 21:24   ` H.J. Lu
  0 siblings, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2021-06-22 21:24 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit fixes the bug mentioned in the previous commit.
>
> The previous implementations of wmemchr in these files relied
> on n * sizeof(wchar_t) not overflowing, which is not guaranteed
> by the standard.
>
> The new overflow tests added in the previous commit now
> pass (as well as all the other tests).
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/memchr.S                | 77 +++++++++++++++++++-------
>  sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
>  2 files changed, 98 insertions(+), 37 deletions(-)
>
> diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
> index beff2708de..3ddc4655cf 100644
> --- a/sysdeps/x86_64/memchr.S
> +++ b/sysdeps/x86_64/memchr.S
> @@ -21,9 +21,11 @@
>  #ifdef USE_AS_WMEMCHR
>  # define MEMCHR                wmemchr
>  # define PCMPEQ                pcmpeqd
> +# define CHAR_PER_VEC  4
>  #else
>  # define MEMCHR                memchr
>  # define PCMPEQ                pcmpeqb
> +# define CHAR_PER_VEC  16
>  #endif
>
>  /* fast SSE2 version with using pmaxub and 64 byte loop */
> @@ -33,15 +35,14 @@ ENTRY(MEMCHR)
>         movd    %esi, %xmm1
>         mov     %edi, %ecx
>
> +#ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %edx, %edx
> +#endif
>  #ifdef USE_AS_WMEMCHR
>         test    %RDX_LP, %RDX_LP
>         jz      L(return_null)
> -       shl     $2, %RDX_LP
>  #else
> -# ifdef __ILP32__
> -       /* Clear the upper 32 bits.  */
> -       movl    %edx, %edx
> -# endif
>         punpcklbw %xmm1, %xmm1
>         test    %RDX_LP, %RDX_LP
>         jz      L(return_null)
> @@ -60,13 +61,16 @@ ENTRY(MEMCHR)
>         test    %eax, %eax
>
>         jnz     L(matches_1)
> -       sub     $16, %rdx
> +       sub     $CHAR_PER_VEC, %rdx
>         jbe     L(return_null)
>         add     $16, %rdi
>         and     $15, %ecx
>         and     $-16, %rdi
> +#ifdef USE_AS_WMEMCHR
> +       shr     $2, %ecx
> +#endif
>         add     %rcx, %rdx
> -       sub     $64, %rdx
> +       sub     $(CHAR_PER_VEC * 4), %rdx
>         jbe     L(exit_loop)
>         jmp     L(loop_prolog)
>
> @@ -77,16 +81,21 @@ L(crosscache):
>         movdqa  (%rdi), %xmm0
>
>         PCMPEQ  %xmm1, %xmm0
> -/* Check if there is a match.  */
> +       /* Check if there is a match.  */
>         pmovmskb %xmm0, %eax
> -/* Remove the leading bytes.  */
> +       /* Remove the leading bytes.  */
>         sar     %cl, %eax
>         test    %eax, %eax
>         je      L(unaligned_no_match)
> -/* Check which byte is a match.  */
> +       /* Check which byte is a match.  */
>         bsf     %eax, %eax
> -
> +#ifdef USE_AS_WMEMCHR
> +       mov     %eax, %esi
> +       shr     $2, %esi
> +       sub     %rsi, %rdx
> +#else
>         sub     %rax, %rdx
> +#endif
>         jbe     L(return_null)
>         add     %rdi, %rax
>         add     %rcx, %rax
> @@ -94,15 +103,18 @@ L(crosscache):
>
>         .p2align 4
>  L(unaligned_no_match):
> -        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
> +       /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
>            "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
>            possible addition overflow.  */
>         neg     %rcx
>         add     $16, %rcx
> +#ifdef USE_AS_WMEMCHR
> +       shr     $2, %ecx
> +#endif
>         sub     %rcx, %rdx
>         jbe     L(return_null)
>         add     $16, %rdi
> -       sub     $64, %rdx
> +       sub     $(CHAR_PER_VEC * 4), %rdx
>         jbe     L(exit_loop)
>
>         .p2align 4
> @@ -135,7 +147,7 @@ L(loop_prolog):
>         test    $0x3f, %rdi
>         jz      L(align64_loop)
>
> -       sub     $64, %rdx
> +       sub     $(CHAR_PER_VEC * 4), %rdx
>         jbe     L(exit_loop)
>
>         movdqa  (%rdi), %xmm0
> @@ -167,11 +179,14 @@ L(loop_prolog):
>         mov     %rdi, %rcx
>         and     $-64, %rdi
>         and     $63, %ecx
> +#ifdef USE_AS_WMEMCHR
> +       shr     $2, %ecx
> +#endif
>         add     %rcx, %rdx
>
>         .p2align 4
>  L(align64_loop):
> -       sub     $64, %rdx
> +       sub     $(CHAR_PER_VEC * 4), %rdx
>         jbe     L(exit_loop)
>         movdqa  (%rdi), %xmm0
>         movdqa  16(%rdi), %xmm2
> @@ -218,7 +233,7 @@ L(align64_loop):
>
>         .p2align 4
>  L(exit_loop):
> -       add     $32, %edx
> +       add     $(CHAR_PER_VEC * 2), %edx
>         jle     L(exit_loop_32)
>
>         movdqa  (%rdi), %xmm0
> @@ -238,7 +253,7 @@ L(exit_loop):
>         pmovmskb %xmm3, %eax
>         test    %eax, %eax
>         jnz     L(matches32_1)
> -       sub     $16, %edx
> +       sub     $CHAR_PER_VEC, %edx
>         jle     L(return_null)
>
>         PCMPEQ  48(%rdi), %xmm1
> @@ -250,13 +265,13 @@ L(exit_loop):
>
>         .p2align 4
>  L(exit_loop_32):
> -       add     $32, %edx
> +       add     $(CHAR_PER_VEC * 2), %edx
>         movdqa  (%rdi), %xmm0
>         PCMPEQ  %xmm1, %xmm0
>         pmovmskb %xmm0, %eax
>         test    %eax, %eax
>         jnz     L(matches_1)
> -       sub     $16, %edx
> +       sub     $CHAR_PER_VEC, %edx
>         jbe     L(return_null)
>
>         PCMPEQ  16(%rdi), %xmm1
> @@ -293,7 +308,13 @@ L(matches32):
>         .p2align 4
>  L(matches_1):
>         bsf     %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +       mov     %eax, %esi
> +       shr     $2, %esi
> +       sub     %rsi, %rdx
> +#else
>         sub     %rax, %rdx
> +#endif
>         jbe     L(return_null)
>         add     %rdi, %rax
>         ret
> @@ -301,7 +322,13 @@ L(matches_1):
>         .p2align 4
>  L(matches16_1):
>         bsf     %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +       mov     %eax, %esi
> +       shr     $2, %esi
> +       sub     %rsi, %rdx
> +#else
>         sub     %rax, %rdx
> +#endif
>         jbe     L(return_null)
>         lea     16(%rdi, %rax), %rax
>         ret
> @@ -309,7 +336,13 @@ L(matches16_1):
>         .p2align 4
>  L(matches32_1):
>         bsf     %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +       mov     %eax, %esi
> +       shr     $2, %esi
> +       sub     %rsi, %rdx
> +#else
>         sub     %rax, %rdx
> +#endif
>         jbe     L(return_null)
>         lea     32(%rdi, %rax), %rax
>         ret
> @@ -317,7 +350,13 @@ L(matches32_1):
>         .p2align 4
>  L(matches48_1):
>         bsf     %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +       mov     %eax, %esi
> +       shr     $2, %esi
> +       sub     %rsi, %rdx
> +#else
>         sub     %rax, %rdx
> +#endif
>         jbe     L(return_null)
>         lea     48(%rdi, %rax), %rax
>         ret
> diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> index 0d8758e3e7..afdb956502 100644
> --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> @@ -54,21 +54,19 @@
>
>  # define VEC_SIZE 32
>  # define PAGE_SIZE 4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
>         .section SECTION(.text),"ax",@progbits
>  ENTRY (MEMCHR)
>  # ifndef USE_AS_RAWMEMCHR
>         /* Check for zero length.  */
> -       test    %RDX_LP, %RDX_LP
> -       jz      L(null)
> -# endif
> -# ifdef USE_AS_WMEMCHR
> -       shl     $2, %RDX_LP
> -# else
>  #  ifdef __ILP32__
> -       /* Clear the upper 32 bits.  */
> -       movl    %edx, %edx
> +       /* Clear upper bits.  */
> +       and     %RDX_LP, %RDX_LP
> +#  else
> +       test    %RDX_LP, %RDX_LP
>  #  endif
> +       jz      L(null)
>  # endif
>         /* Broadcast CHAR to YMMMATCH.  */
>         vmovd   %esi, %xmm0
> @@ -84,7 +82,7 @@ ENTRY (MEMCHR)
>         vpmovmskb %ymm1, %eax
>  # ifndef USE_AS_RAWMEMCHR
>         /* If length < CHAR_PER_VEC handle special.  */
> -       cmpq    $VEC_SIZE, %rdx
> +       cmpq    $CHAR_PER_VEC, %rdx
>         jbe     L(first_vec_x0)
>  # endif
>         testl   %eax, %eax
> @@ -98,6 +96,10 @@ ENTRY (MEMCHR)
>  L(first_vec_x0):
>         /* Check if first match was before length.  */
>         tzcntl  %eax, %eax
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %edx
> +#  endif
>         xorl    %ecx, %ecx
>         cmpl    %eax, %edx
>         leaq    (%rdi, %rax), %rax
> @@ -110,12 +112,12 @@ L(null):
>  # endif
>         .p2align 4
>  L(cross_page_boundary):
> -       /* Save pointer before aligning as its original value is necessary
> -          for computer return address if byte is found or adjusting length
> -          if it is not and this is memchr.  */
> +       /* Save pointer before aligning as its original value is
> +          necessary for computer return address if byte is found or
> +          adjusting length if it is not and this is memchr.  */
>         movq    %rdi, %rcx
> -       /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
> -          rdi for rawmemchr.  */
> +       /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
> +          and rdi for rawmemchr.  */
>         orq     $(VEC_SIZE - 1), %ALGN_PTR_REG
>         VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
> @@ -124,6 +126,10 @@ L(cross_page_boundary):
>            match).  */
>         leaq    1(%ALGN_PTR_REG), %rsi
>         subq    %RRAW_PTR_REG, %rsi
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> +       shrl    $2, %esi
> +#  endif
>  # endif
>         /* Remove the leading bytes.  */
>         sarxl   %ERAW_PTR_REG, %eax, %eax
> @@ -181,6 +187,10 @@ L(cross_page_continue):
>         orq     $(VEC_SIZE - 1), %rdi
>         /* esi is for adjusting length to see if near the end.  */
>         leal    (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> +       sarl    $2, %esi
> +#  endif
>  # else
>         orq     $(VEC_SIZE - 1), %rdi
>  L(cross_page_continue):
> @@ -213,7 +223,7 @@ L(cross_page_continue):
>
>  # ifndef USE_AS_RAWMEMCHR
>         /* Check if at last VEC_SIZE * 4 length.  */
> -       subq    $(VEC_SIZE * 4), %rdx
> +       subq    $(CHAR_PER_VEC * 4), %rdx
>         jbe     L(last_4x_vec_or_less_cmpeq)
>         /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
>            length.  */
> @@ -221,6 +231,10 @@ L(cross_page_continue):
>         movl    %edi, %ecx
>         orq     $(VEC_SIZE * 4 - 1), %rdi
>         andl    $(VEC_SIZE * 4 - 1), %ecx
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> +       sarl    $2, %ecx
> +#  endif
>         addq    %rcx, %rdx
>  # else
>         /* Align data to VEC_SIZE * 4 - 1 for loop.  */
> @@ -250,15 +264,19 @@ L(loop_4x_vec):
>
>         subq    $-(VEC_SIZE * 4), %rdi
>
> -       subq    $(VEC_SIZE * 4), %rdx
> +       subq    $(CHAR_PER_VEC * 4), %rdx
>         ja      L(loop_4x_vec)
>
> -       /* Fall through into less than 4 remaining vectors of length case.
> -        */
> +       /* Fall through into less than 4 remaining vectors of length
> +          case.  */
>         VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
>         .p2align 4
>  L(last_4x_vec_or_less):
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %edx
> +#  endif
>         /* Check if first VEC contained match.  */
>         testl   %eax, %eax
>         jnz     L(first_vec_x1_check)
> @@ -355,6 +373,10 @@ L(last_vec_x2_return):
>  L(last_4x_vec_or_less_cmpeq):
>         VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %edx
> +#  endif
>         subq    $-(VEC_SIZE * 4), %rdi
>         /* Check first VEC regardless.  */
>         testl   %eax, %eax
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.
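
In C terms, the counting scheme after this change can be sketched as
follows, with the vector machinery stripped away; wmemchr_model is a
hypothetical name for illustration, not the committed code.  The point
is that n now stays in wchar_t units throughout (byte offsets are
converted down with shr $2 where needed), so n * sizeof (wchar_t) is
never formed and cannot overflow:

#include <stddef.h>
#include <wchar.h>

/* Sketch only: scalar model of the fixed scheme, which counts
   characters rather than bytes.  */
static wchar_t *
wmemchr_model (const wchar_t *s, wchar_t c, size_t n)
{
  for (; n != 0; --n, ++s)
    if (*s == c)
      return (wchar_t *) s;
  return NULL;
}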

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
  2021-06-22 18:11 ` [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
@ 2021-06-22 21:33   ` H.J. Lu
  2021-06-22 23:16     ` Noah Goldstein
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2021-06-22 21:33 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit fixes the bug mentioned in the previous commit.
>
> The previous implementations of wcsnlen in these files relied
> on maxlen * sizeof(wchar_t) not overflowing, which is not
> guaranteed by the standard.
>
> The new overflow tests added in the previous commit now
> pass (as well as all the other tests).
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
>  sysdeps/x86_64/strlen.S                |  14 ++-
>  2 files changed, 106 insertions(+), 38 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> index bd2e6ee44a..b282a75613 100644
> --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> @@ -44,21 +44,21 @@
>
>  # define VEC_SIZE 32
>  # define PAGE_SIZE 4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
>         .section SECTION(.text),"ax",@progbits
>  ENTRY (STRLEN)
>  # ifdef USE_AS_STRNLEN
>         /* Check zero length.  */
> +#  ifdef __ILP32__
> +       /* Clear upper bits.  */
> +       and     %RSI_LP, %RSI_LP
> +#  else
>         test    %RSI_LP, %RSI_LP
> +#  endif
>         jz      L(zero)
>         /* Store max len in R8_LP before adjusting if using WCSLEN.  */
>         mov     %RSI_LP, %R8_LP
> -#  ifdef USE_AS_WCSLEN
> -       shl     $2, %RSI_LP
> -#  elif defined __ILP32__
> -       /* Clear the upper 32 bits.  */
> -       movl    %esi, %esi
> -#  endif
>  # endif
>         movl    %edi, %eax
>         movq    %rdi, %rdx
> @@ -72,10 +72,10 @@ ENTRY (STRLEN)
>
>         /* Check the first VEC_SIZE bytes.  */
>         VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>  # ifdef USE_AS_STRNLEN
>         /* If length < VEC_SIZE handle special.  */
> -       cmpq    $VEC_SIZE, %rsi
> +       cmpq    $CHAR_PER_VEC, %rsi
>         jbe     L(first_vec_x0)
>  # endif
>         /* If empty continue to aligned_more. Otherwise return bit
> @@ -84,6 +84,7 @@ ENTRY (STRLEN)
>         jz      L(aligned_more)
>         tzcntl  %eax, %eax
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -97,9 +98,14 @@ L(zero):
>  L(first_vec_x0):
>         /* Set bit for max len so that tzcnt will return min of max len
>            and position of first match.  */
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %esi
> +#  endif
>         btsq    %rsi, %rax
>         tzcntl  %eax, %eax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -113,14 +119,19 @@ L(first_vec_x1):
>  # ifdef USE_AS_STRNLEN
>         /* Use ecx which was computed earlier to compute correct value.
>          */
> +#  ifdef USE_AS_WCSLEN
> +       leal    -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
> +#  else
>         subl    $(VEC_SIZE * 4 + 1), %ecx
>         addl    %ecx, %eax
> +#  endif
>  # else
>         subl    %edx, %edi
>         incl    %edi
>         addl    %edi, %eax
>  # endif
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -133,14 +144,19 @@ L(first_vec_x2):
>  # ifdef USE_AS_STRNLEN
>         /* Use ecx which was computed earlier to compute correct value.
>          */
> +#  ifdef USE_AS_WCSLEN
> +       leal    -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
> +#  else
>         subl    $(VEC_SIZE * 3 + 1), %ecx
>         addl    %ecx, %eax
> +#  endif
>  # else
>         subl    %edx, %edi
>         addl    $(VEC_SIZE + 1), %edi
>         addl    %edi, %eax
>  # endif
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -153,14 +169,19 @@ L(first_vec_x3):
>  # ifdef USE_AS_STRNLEN
>         /* Use ecx which was computed earlier to compute correct value.
>          */
> +#  ifdef USE_AS_WCSLEN
> +       leal    -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
> +#  else
>         subl    $(VEC_SIZE * 2 + 1), %ecx
>         addl    %ecx, %eax
> +#  endif
>  # else
>         subl    %edx, %edi
>         addl    $(VEC_SIZE * 2 + 1), %edi
>         addl    %edi, %eax
>  # endif
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -173,14 +194,19 @@ L(first_vec_x4):
>  # ifdef USE_AS_STRNLEN
>         /* Use ecx which was computed earlier to compute correct value.
>          */
> +#  ifdef USE_AS_WCSLEN
> +       leal    -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
> +#  else
>         subl    $(VEC_SIZE + 1), %ecx
>         addl    %ecx, %eax
> +#  endif
>  # else
>         subl    %edx, %edi
>         addl    $(VEC_SIZE * 3 + 1), %edi
>         addl    %edi, %eax
>  # endif
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -195,10 +221,14 @@ L(cross_page_continue):
>         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>            since data is only aligned to VEC_SIZE.  */
>  # ifdef USE_AS_STRNLEN
> -       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> -          it simplies the logic in last_4x_vec_or_less.  */
> +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> +          because it simplies the logic in last_4x_vec_or_less.  */
>         leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
>         subq    %rdx, %rcx
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> +       sarl    $2, %ecx
> +#  endif
>  # endif
>         /* Load first VEC regardless.  */
>         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> @@ -207,34 +237,38 @@ L(cross_page_continue):
>         subq    %rcx, %rsi
>         jb      L(last_4x_vec_or_less)
>  # endif
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x1)
>
>         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x2)
>
>         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x3)
>
>         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x4)
>
>         /* Align data to VEC_SIZE * 4 - 1.  */
>  # ifdef USE_AS_STRNLEN
>         /* Before adjusting length check if at last VEC_SIZE * 4.  */
> -       cmpq    $(VEC_SIZE * 4 - 1), %rsi
> +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
>         jbe     L(last_4x_vec_or_less_load)
>         incq    %rdi
>         movl    %edi, %ecx
>         orq     $(VEC_SIZE * 4 - 1), %rdi
>         andl    $(VEC_SIZE * 4 - 1), %ecx
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> +       sarl    $2, %ecx
> +#  endif
>         /* Readjust length.  */
>         addq    %rcx, %rsi
>  # else
> @@ -246,13 +280,13 @@ L(cross_page_continue):
>  L(loop_4x_vec):
>  # ifdef USE_AS_STRNLEN
>         /* Break if at end of length.  */
> -       subq    $(VEC_SIZE * 4), %rsi
> +       subq    $(CHAR_PER_VEC * 4), %rsi
>         jb      L(last_4x_vec_or_less_cmpeq)
>  # endif
> -       /* Save some code size by microfusing VPMINU with the load. Since
> -          the matches in ymm2/ymm4 can only be returned if there where no
> -          matches in ymm1/ymm3 respectively there is no issue with overlap.
> -        */
> +       /* Save some code size by microfusing VPMINU with the load.
> +          Since the matches in ymm2/ymm4 can only be returned if there
> +          where no matches in ymm1/ymm3 respectively there is no issue
> +          with overlap.  */
>         vmovdqa 1(%rdi), %ymm1
>         VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
>         vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> @@ -260,7 +294,7 @@ L(loop_4x_vec):
>
>         VPMINU  %ymm2, %ymm4, %ymm5
>         VPCMPEQ %ymm5, %ymm0, %ymm5
> -       vpmovmskb       %ymm5, %ecx
> +       vpmovmskb %ymm5, %ecx
>
>         subq    $-(VEC_SIZE * 4), %rdi
>         testl   %ecx, %ecx
> @@ -268,27 +302,28 @@ L(loop_4x_vec):
>
>
>         VPCMPEQ %ymm1, %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         subq    %rdx, %rdi
>         testl   %eax, %eax
>         jnz     L(last_vec_return_x0)
>
>         VPCMPEQ %ymm2, %ymm0, %ymm2
> -       vpmovmskb       %ymm2, %eax
> +       vpmovmskb %ymm2, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_return_x1)
>
>         /* Combine last 2 VEC.  */
>         VPCMPEQ %ymm3, %ymm0, %ymm3
> -       vpmovmskb       %ymm3, %eax
> -       /* rcx has combined result from all 4 VEC. It will only be used if
> -          the first 3 other VEC all did not contain a match.  */
> +       vpmovmskb %ymm3, %eax
> +       /* rcx has combined result from all 4 VEC. It will only be used
> +          if the first 3 other VEC all did not contain a match.  */
>         salq    $32, %rcx
>         orq     %rcx, %rax
>         tzcntq  %rax, %rax
>         subq    $(VEC_SIZE * 2 - 1), %rdi
>         addq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  # endif
>         VZEROUPPER_RETURN
> @@ -297,15 +332,19 @@ L(loop_4x_vec):
>  # ifdef USE_AS_STRNLEN
>         .p2align 4
>  L(last_4x_vec_or_less_load):
> -       /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> +       /* Depending on entry adjust rdi / prepare first VEC in ymm1.
> +        */
>         subq    $-(VEC_SIZE * 4), %rdi
>  L(last_4x_vec_or_less_cmpeq):
>         VPCMPEQ 1(%rdi), %ymm0, %ymm1
>  L(last_4x_vec_or_less):
> -
> -       vpmovmskb       %ymm1, %eax
> -       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> -          VEC_SIZE * 4.  */
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %esi
> +#  endif
> +       vpmovmskb %ymm1, %eax
> +       /* If remaining length > VEC_SIZE * 2. This works if esi is off
> +          by VEC_SIZE * 4.  */
>         testl   $(VEC_SIZE * 2), %esi
>         jnz     L(last_4x_vec)
>
> @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
>         jb      L(max)
>
>         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         tzcntl  %eax, %eax
>         /* Check the end of data.  */
>         cmpl    %eax, %esi
> @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
>         addl    $(VEC_SIZE + 1), %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -340,6 +380,7 @@ L(last_vec_return_x0):
>         subq    $(VEC_SIZE * 4 - 1), %rdi
>         addq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  # endif
>         VZEROUPPER_RETURN
> @@ -350,6 +391,7 @@ L(last_vec_return_x1):
>         subq    $(VEC_SIZE * 3 - 1), %rdi
>         addq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  # endif
>         VZEROUPPER_RETURN
> @@ -366,6 +408,7 @@ L(last_vec_x1_check):
>         incl    %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -381,14 +424,14 @@ L(last_4x_vec):
>         jnz     L(last_vec_x1)
>
>         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x2)
>
>         /* Normalize length.  */
>         andl    $(VEC_SIZE * 4 - 1), %esi
>         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x3)
>
> @@ -396,7 +439,7 @@ L(last_4x_vec):
>         jb      L(max)
>
>         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         tzcntl  %eax, %eax
>         /* Check the end of data.  */
>         cmpl    %eax, %esi
> @@ -405,6 +448,7 @@ L(last_4x_vec):
>         addl    $(VEC_SIZE * 3 + 1), %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -419,6 +463,7 @@ L(last_vec_x1):
>         incl    %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -432,6 +477,7 @@ L(last_vec_x2):
>         addl    $(VEC_SIZE + 1), %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -447,6 +493,7 @@ L(last_vec_x3):
>         addl    $(VEC_SIZE * 2 + 1), %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -455,13 +502,13 @@ L(max_end):
>         VZEROUPPER_RETURN
>  # endif
>
> -       /* Cold case for crossing page with first load.  */
> +       /* Cold case for crossing page with first load.  */
>         .p2align 4
>  L(cross_page_boundary):
>         /* Align data to VEC_SIZE - 1.  */
>         orq     $(VEC_SIZE - 1), %rdi
>         VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
>            so no need to manually mod rdx.  */
>         sarxl   %edx, %eax, %eax
> @@ -470,6 +517,10 @@ L(cross_page_boundary):
>         jnz     L(cross_page_less_vec)
>         leaq    1(%rdi), %rcx
>         subq    %rdx, %rcx
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> +       shrl    $2, %ecx
> +#  endif
>         /* Check length.  */
>         cmpq    %rsi, %rcx
>         jb      L(cross_page_continue)
> @@ -479,6 +530,7 @@ L(cross_page_boundary):
>         jz      L(cross_page_continue)
>         tzcntl  %eax, %eax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide length by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  #  endif
>  # endif
> @@ -489,6 +541,10 @@ L(return_vzeroupper):
>         .p2align 4
>  L(cross_page_less_vec):
>         tzcntl  %eax, %eax
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %esi
> +#  endif
>         cmpq    %rax, %rsi
>         cmovb   %esi, %eax
>  #  ifdef USE_AS_WCSLEN
> diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> index d223ea1700..3fc6734910 100644
> --- a/sysdeps/x86_64/strlen.S
> +++ b/sysdeps/x86_64/strlen.S
> @@ -65,12 +65,24 @@ ENTRY(strlen)
>         ret
>  L(n_nonzero):
>  # ifdef AS_WCSLEN
> -       shl     $2, %RSI_LP
> +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> +   overflow the only way this program doesn't have undefined behavior
> +   is if there is a null terminator in valid memory so strlen will
> +   suffice.  */
> +       mov     %RSI_LP, %R10_LP
> +       sar     $62, %R10_LP
> +       test    %R10_LP, %R10_LP
> +       jnz     __wcslen_sse2

Branch to __wcslen_sse2 is wrong for 2 reasons:

1.  __wcslen_sse2 is undefined with --disable-multi-arch.
2. You should skip ENDBR64 at function entry.

Please create a new label and branch to it.

> +       sal     $2, %RSI_LP
>  # endif
>
>  /* Initialize long lived registers.  */
>
>         add     %RDI_LP, %RSI_LP
> +# ifdef AS_WCSLEN
> +/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
> +       jbe     __wcslen_sse2
> +# endif
>         mov     %RSI_LP, %R10_LP
>         and     $-64, %R10_LP
>         mov     %RSI_LP, %R11_LP
> --
> 2.25.1
>

Thanks.


--
H.J.
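
For reference, the two overflow checks in the strlen.S hunk can be
modeled in C roughly as below, assuming 64-bit size_t and
sizeof (wchar_t) == 4.  needs_fallback is a hypothetical name and this
is a sketch of the logic only, not the committed fix, which per the
comments above must not jump to the __wcslen_sse2 symbol directly:

#include <stdint.h>
#include <wchar.h>

/* Hypothetical model: fall back to plain wcslen whenever
   maxlen * sizeof (wchar_t), or s plus that byte count, would wrap.
   In that case the program is only well defined if a terminator
   exists in valid memory, so wcslen returns the right answer.  */
static int
needs_fallback (const wchar_t *s, size_t maxlen)
{
  /* Models sar $62 / jnz: either of the top two bits of maxlen set
     means maxlen << 2 wraps.  */
  if ((maxlen >> 62) != 0)
    return 1;
  /* Models the jbe after the add: the end pointer wrapped around
     (maxlen == 0 was already handled at function entry).  */
  uintptr_t end = (uintptr_t) s + maxlen * sizeof (wchar_t);
  return end < (uintptr_t) s;
}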

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
  2021-06-22 21:33   ` H.J. Lu
@ 2021-06-22 23:16     ` Noah Goldstein
  2021-06-22 23:28       ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-22 23:16 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> > This commit fixes the bug mentioned in the previous commit.
> >
> > The previous implementations of wcsnlen in these files relied
> > on maxlen * sizeof(wchar_t) not overflowing, which is not
> > guaranteed by the standard.
> >
> > The new overflow tests added in the previous commit now
> > pass (as well as all the other tests).
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> >  sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
> >  sysdeps/x86_64/strlen.S                |  14 ++-
> >  2 files changed, 106 insertions(+), 38 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S
> b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > index bd2e6ee44a..b282a75613 100644
> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > @@ -44,21 +44,21 @@
> >
> >  # define VEC_SIZE 32
> >  # define PAGE_SIZE 4096
> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> >
> >         .section SECTION(.text),"ax",@progbits
> >  ENTRY (STRLEN)
> >  # ifdef USE_AS_STRNLEN
> >         /* Check zero length.  */
> > +#  ifdef __ILP32__
> > +       /* Clear upper bits.  */
> > +       and     %RSI_LP, %RSI_LP
> > +#  else
> >         test    %RSI_LP, %RSI_LP
> > +#  endif
> >         jz      L(zero)
> >         /* Store max len in R8_LP before adjusting if using WCSLEN.  */
> >         mov     %RSI_LP, %R8_LP
> > -#  ifdef USE_AS_WCSLEN
> > -       shl     $2, %RSI_LP
> > -#  elif defined __ILP32__
> > -       /* Clear the upper 32 bits.  */
> > -       movl    %esi, %esi
> > -#  endif
> >  # endif
> >         movl    %edi, %eax
> >         movq    %rdi, %rdx
> > @@ -72,10 +72,10 @@ ENTRY (STRLEN)
> >
> >         /* Check the first VEC_SIZE bytes.  */
> >         VPCMPEQ (%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >  # ifdef USE_AS_STRNLEN
> >         /* If length < VEC_SIZE handle special.  */
> > -       cmpq    $VEC_SIZE, %rsi
> > +       cmpq    $CHAR_PER_VEC, %rsi
> >         jbe     L(first_vec_x0)
> >  # endif
> >         /* If empty continue to aligned_more. Otherwise return bit
> > @@ -84,6 +84,7 @@ ENTRY (STRLEN)
> >         jz      L(aligned_more)
> >         tzcntl  %eax, %eax
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -97,9 +98,14 @@ L(zero):
> >  L(first_vec_x0):
> >         /* Set bit for max len so that tzcnt will return min of max len
> >            and position of first match.  */
> > +#  ifdef USE_AS_WCSLEN
> > +       /* NB: Multiply length by 4 to get byte count.  */
> > +       sall    $2, %esi
> > +#  endif
> >         btsq    %rsi, %rax
> >         tzcntl  %eax, %eax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -113,14 +119,19 @@ L(first_vec_x1):
> >  # ifdef USE_AS_STRNLEN
> >         /* Use ecx which was computed earlier to compute correct value.
> >          */
> > +#  ifdef USE_AS_WCSLEN
> > +       leal    -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
> > +#  else
> >         subl    $(VEC_SIZE * 4 + 1), %ecx
> >         addl    %ecx, %eax
> > +#  endif
> >  # else
> >         subl    %edx, %edi
> >         incl    %edi
> >         addl    %edi, %eax
> >  # endif
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -133,14 +144,19 @@ L(first_vec_x2):
> >  # ifdef USE_AS_STRNLEN
> >         /* Use ecx which was computed earlier to compute correct value.
> >          */
> > +#  ifdef USE_AS_WCSLEN
> > +       leal    -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
> > +#  else
> >         subl    $(VEC_SIZE * 3 + 1), %ecx
> >         addl    %ecx, %eax
> > +#  endif
> >  # else
> >         subl    %edx, %edi
> >         addl    $(VEC_SIZE + 1), %edi
> >         addl    %edi, %eax
> >  # endif
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -153,14 +169,19 @@ L(first_vec_x3):
> >  # ifdef USE_AS_STRNLEN
> >         /* Use ecx which was computed earlier to compute correct value.
> >          */
> > +#  ifdef USE_AS_WCSLEN
> > +       leal    -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
> > +#  else
> >         subl    $(VEC_SIZE * 2 + 1), %ecx
> >         addl    %ecx, %eax
> > +#  endif
> >  # else
> >         subl    %edx, %edi
> >         addl    $(VEC_SIZE * 2 + 1), %edi
> >         addl    %edi, %eax
> >  # endif
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -173,14 +194,19 @@ L(first_vec_x4):
> >  # ifdef USE_AS_STRNLEN
> >         /* Use ecx which was computed earlier to compute correct value.
> >          */
> > +#  ifdef USE_AS_WCSLEN
> > +       leal    -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
> > +#  else
> >         subl    $(VEC_SIZE + 1), %ecx
> >         addl    %ecx, %eax
> > +#  endif
> >  # else
> >         subl    %edx, %edi
> >         addl    $(VEC_SIZE * 3 + 1), %edi
> >         addl    %edi, %eax
> >  # endif
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -195,10 +221,14 @@ L(cross_page_continue):
> >         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> >            since data is only aligned to VEC_SIZE.  */
> >  # ifdef USE_AS_STRNLEN
> > -       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> because
> > -          it simplies the logic in last_4x_vec_or_less.  */
> > +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> > +          because it simplies the logic in last_4x_vec_or_less.  */
> >         leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> >         subq    %rdx, %rcx
> > +#  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> > +       sarl    $2, %ecx
> > +#  endif
> >  # endif
> >         /* Load first VEC regardless.  */
> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > @@ -207,34 +237,38 @@ L(cross_page_continue):
> >         subq    %rcx, %rsi
> >         jb      L(last_4x_vec_or_less)
> >  # endif
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x1)
> >
> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x2)
> >
> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x3)
> >
> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x4)
> >
> >         /* Align data to VEC_SIZE * 4 - 1.  */
> >  # ifdef USE_AS_STRNLEN
> >         /* Before adjusting length check if at last VEC_SIZE * 4.  */
> > -       cmpq    $(VEC_SIZE * 4 - 1), %rsi
> > +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
> >         jbe     L(last_4x_vec_or_less_load)
> >         incq    %rdi
> >         movl    %edi, %ecx
> >         orq     $(VEC_SIZE * 4 - 1), %rdi
> >         andl    $(VEC_SIZE * 4 - 1), %ecx
> > +#  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> > +       sarl    $2, %ecx
> > +#  endif
> >         /* Readjust length.  */
> >         addq    %rcx, %rsi
> >  # else
> > @@ -246,13 +280,13 @@ L(cross_page_continue):
> >  L(loop_4x_vec):
> >  # ifdef USE_AS_STRNLEN
> >         /* Break if at end of length.  */
> > -       subq    $(VEC_SIZE * 4), %rsi
> > +       subq    $(CHAR_PER_VEC * 4), %rsi
> >         jb      L(last_4x_vec_or_less_cmpeq)
> >  # endif
> > -       /* Save some code size by microfusing VPMINU with the load. Since
> > -          the matches in ymm2/ymm4 can only be returned if there where
> no
> > -          matches in ymm1/ymm3 respectively there is no issue with
> overlap.
> > -        */
> > +       /* Save some code size by microfusing VPMINU with the load.
> > +          Since the matches in ymm2/ymm4 can only be returned if there
> > +          where no matches in ymm1/ymm3 respectively there is no issue
> > +          with overlap.  */
> >         vmovdqa 1(%rdi), %ymm1
> >         VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> >         vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> > @@ -260,7 +294,7 @@ L(loop_4x_vec):
> >
> >         VPMINU  %ymm2, %ymm4, %ymm5
> >         VPCMPEQ %ymm5, %ymm0, %ymm5
> > -       vpmovmskb       %ymm5, %ecx
> > +       vpmovmskb %ymm5, %ecx
> >
> >         subq    $-(VEC_SIZE * 4), %rdi
> >         testl   %ecx, %ecx
> > @@ -268,27 +302,28 @@ L(loop_4x_vec):
> >
> >
> >         VPCMPEQ %ymm1, %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         subq    %rdx, %rdi
> >         testl   %eax, %eax
> >         jnz     L(last_vec_return_x0)
> >
> >         VPCMPEQ %ymm2, %ymm0, %ymm2
> > -       vpmovmskb       %ymm2, %eax
> > +       vpmovmskb %ymm2, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_return_x1)
> >
> >         /* Combine last 2 VEC.  */
> >         VPCMPEQ %ymm3, %ymm0, %ymm3
> > -       vpmovmskb       %ymm3, %eax
> > -       /* rcx has combined result from all 4 VEC. It will only be used
> if
> > -          the first 3 other VEC all did not contain a match.  */
> > +       vpmovmskb %ymm3, %eax
> > +       /* rcx has combined result from all 4 VEC. It will only be used
> > +          if the first 3 other VEC all did not contain a match.  */
> >         salq    $32, %rcx
> >         orq     %rcx, %rax
> >         tzcntq  %rax, %rax
> >         subq    $(VEC_SIZE * 2 - 1), %rdi
> >         addq    %rdi, %rax
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -297,15 +332,19 @@ L(loop_4x_vec):
> >  # ifdef USE_AS_STRNLEN
> >         .p2align 4
> >  L(last_4x_vec_or_less_load):
> > -       /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> > +       /* Depending on entry adjust rdi / prepare first VEC in ymm1.
> > +        */
> >         subq    $-(VEC_SIZE * 4), %rdi
> >  L(last_4x_vec_or_less_cmpeq):
> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >  L(last_4x_vec_or_less):
> > -
> > -       vpmovmskb       %ymm1, %eax
> > -       /* If remaining length > VEC_SIZE * 2. This works if esi is off
> by
> > -          VEC_SIZE * 4.  */
> > +#  ifdef USE_AS_WCSLEN
> > +       /* NB: Multiply length by 4 to get byte count.  */
> > +       sall    $2, %esi
> > +#  endif
> > +       vpmovmskb %ymm1, %eax
> > +       /* If remaining length > VEC_SIZE * 2. This works if esi is off
> > +          by VEC_SIZE * 4.  */
> >         testl   $(VEC_SIZE * 2), %esi
> >         jnz     L(last_4x_vec)
> >
> > @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
> >         jb      L(max)
> >
> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         tzcntl  %eax, %eax
> >         /* Check the end of data.  */
> >         cmpl    %eax, %esi
> > @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
> >         addl    $(VEC_SIZE + 1), %eax
> >         addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -340,6 +380,7 @@ L(last_vec_return_x0):
> >         subq    $(VEC_SIZE * 4 - 1), %rdi
> >         addq    %rdi, %rax
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -350,6 +391,7 @@ L(last_vec_return_x1):
> >         subq    $(VEC_SIZE * 3 - 1), %rdi
> >         addq    %rdi, %rax
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -366,6 +408,7 @@ L(last_vec_x1_check):
> >         incl    %eax
> >         addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -381,14 +424,14 @@ L(last_4x_vec):
> >         jnz     L(last_vec_x1)
> >
> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_x2)
> >
> >         /* Normalize length.  */
> >         andl    $(VEC_SIZE * 4 - 1), %esi
> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_x3)
> >
> > @@ -396,7 +439,7 @@ L(last_4x_vec):
> >         jb      L(max)
> >
> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         tzcntl  %eax, %eax
> >         /* Check the end of data.  */
> >         cmpl    %eax, %esi
> > @@ -405,6 +448,7 @@ L(last_4x_vec):
> >         addl    $(VEC_SIZE * 3 + 1), %eax
> >         addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -419,6 +463,7 @@ L(last_vec_x1):
> >         incl    %eax
> >         addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -432,6 +477,7 @@ L(last_vec_x2):
> >         addl    $(VEC_SIZE + 1), %eax
> >         addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -447,6 +493,7 @@ L(last_vec_x3):
> >         addl    $(VEC_SIZE * 2 + 1), %eax
> >         addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -455,13 +502,13 @@ L(max_end):
> >         VZEROUPPER_RETURN
> >  # endif
> >
> > -       /* Cold case for crossing page with first load.  */
> > +       /* Cold case for crossing page with first load.  */
> >         .p2align 4
> >  L(cross_page_boundary):
> >         /* Align data to VEC_SIZE - 1.  */
> >         orq     $(VEC_SIZE - 1), %rdi
> >         VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> >            so no need to manually mod rdx.  */
> >         sarxl   %edx, %eax, %eax
> > @@ -470,6 +517,10 @@ L(cross_page_boundary):
> >         jnz     L(cross_page_less_vec)
> >         leaq    1(%rdi), %rcx
> >         subq    %rdx, %rcx
> > +#  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> > +       shrl    $2, %ecx
> > +#  endif
> >         /* Check length.  */
> >         cmpq    %rsi, %rcx
> >         jb      L(cross_page_continue)
> > @@ -479,6 +530,7 @@ L(cross_page_boundary):
> >         jz      L(cross_page_continue)
> >         tzcntl  %eax, %eax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide length by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  #  endif
> >  # endif
> > @@ -489,6 +541,10 @@ L(return_vzeroupper):
> >         .p2align 4
> >  L(cross_page_less_vec):
> >         tzcntl  %eax, %eax
> > +#  ifdef USE_AS_WCSLEN
> > +       /* NB: Multiply length by 4 to get byte count.  */
> > +       sall    $2, %esi
> > +#  endif
> >         cmpq    %rax, %rsi
> >         cmovb   %esi, %eax
> >  #  ifdef USE_AS_WCSLEN
> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> > index d223ea1700..3fc6734910 100644
> > --- a/sysdeps/x86_64/strlen.S
> > +++ b/sysdeps/x86_64/strlen.S
> > @@ -65,12 +65,24 @@ ENTRY(strlen)
> >         ret
> >  L(n_nonzero):
> >  # ifdef AS_WCSLEN
> > -       shl     $2, %RSI_LP
> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> > +   overflow the only way this program doesn't have undefined behavior
> > +   is if there is a null terminator in valid memory so strlen will
> > +   suffice.  */
> > +       mov     %RSI_LP, %R10_LP
> > +       sar     $62, %R10_LP
> > +       test    %R10_LP, %R10_LP
> > +       jnz     __wcslen_sse2
>
> Branch to __wcslen_sse2 is wrong for 2 reasons:
>
> 1.  __wcslen_sse2 is undefined with --disable-multi-arch.
>
Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?


> 2. You should skip ENDBR64 at function entry.
>
> Please create a new label and branch to it.
>
I am not quite sure how to do this. I am trying to use
strstr-sse2-unaligned.S as a template:
https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
which appears to make a direct call to the global label of __strchr_sse2
without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.

Is there an example in the code you know of I can follow?


> > +       sal     $2, %RSI_LP
> >  # endif
> >
> >  /* Initialize long lived registers.  */
> >
> >         add     %RDI_LP, %RSI_LP
> > +# ifdef AS_WCSLEN
> > +/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
> > +       jbe     __wcslen_sse2
> > +# endif
> >         mov     %RSI_LP, %R10_LP
> >         and     $-64, %R10_LP
> >         mov     %RSI_LP, %R11_LP
> > --
> > 2.25.1
> >
>
> Thanks.
>
>
> --
> H.J.
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
  2021-06-22 23:16     ` Noah Goldstein
@ 2021-06-22 23:28       ` H.J. Lu
  2021-06-23  3:11         ` Noah Goldstein
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2021-06-22 23:28 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jun 22, 2021 at 4:16 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
> On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >
>> > This commit fixes the bug mentioned in the previous commit.
>> >
>> > The previous implementations of wcsnlen in these files relied
>> > on maxlen * sizeof(wchar_t) not overflowing, which is not
>> > guaranteed by the standard.
>> >
>> > The new overflow tests added in the previous commit now
>> > pass (as well as all the other tests).
>> >
>> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
>> > ---
>> >  sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
>> >  sysdeps/x86_64/strlen.S                |  14 ++-
>> >  2 files changed, 106 insertions(+), 38 deletions(-)
>> >
>> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
>> > index bd2e6ee44a..b282a75613 100644
>> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
>> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
>> > @@ -44,21 +44,21 @@
>> >
>> >  # define VEC_SIZE 32
>> >  # define PAGE_SIZE 4096
>> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>> >
>> >         .section SECTION(.text),"ax",@progbits
>> >  ENTRY (STRLEN)
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Check zero length.  */
>> > +#  ifdef __ILP32__
>> > +       /* Clear upper bits.  */
>> > +       and     %RSI_LP, %RSI_LP
>> > +#  else
>> >         test    %RSI_LP, %RSI_LP
>> > +#  endif
>> >         jz      L(zero)
>> >         /* Store max len in R8_LP before adjusting if using WCSLEN.  */
>> >         mov     %RSI_LP, %R8_LP
>> > -#  ifdef USE_AS_WCSLEN
>> > -       shl     $2, %RSI_LP
>> > -#  elif defined __ILP32__
>> > -       /* Clear the upper 32 bits.  */
>> > -       movl    %esi, %esi
>> > -#  endif
>> >  # endif
>> >         movl    %edi, %eax
>> >         movq    %rdi, %rdx
>> > @@ -72,10 +72,10 @@ ENTRY (STRLEN)
>> >
>> >         /* Check the first VEC_SIZE bytes.  */
>> >         VPCMPEQ (%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >  # ifdef USE_AS_STRNLEN
>> >         /* If length < VEC_SIZE handle special.  */
>> > -       cmpq    $VEC_SIZE, %rsi
>> > +       cmpq    $CHAR_PER_VEC, %rsi
>> >         jbe     L(first_vec_x0)
>> >  # endif
>> >         /* If empty continue to aligned_more. Otherwise return bit
>> > @@ -84,6 +84,7 @@ ENTRY (STRLEN)
>> >         jz      L(aligned_more)
>> >         tzcntl  %eax, %eax
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -97,9 +98,14 @@ L(zero):
>> >  L(first_vec_x0):
>> >         /* Set bit for max len so that tzcnt will return min of max len
>> >            and position of first match.  */
>> > +#  ifdef USE_AS_WCSLEN
>> > +       /* NB: Multiply length by 4 to get byte count.  */
>> > +       sall    $2, %esi
>> > +#  endif
>> >         btsq    %rsi, %rax
>> >         tzcntl  %eax, %eax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -113,14 +119,19 @@ L(first_vec_x1):
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Use ecx which was computed earlier to compute correct value.
>> >          */
>> > +#  ifdef USE_AS_WCSLEN
>> > +       leal    -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
>> > +#  else
>> >         subl    $(VEC_SIZE * 4 + 1), %ecx
>> >         addl    %ecx, %eax
>> > +#  endif
>> >  # else
>> >         subl    %edx, %edi
>> >         incl    %edi
>> >         addl    %edi, %eax
>> >  # endif
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -133,14 +144,19 @@ L(first_vec_x2):
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Use ecx which was computed earlier to compute correct value.
>> >          */
>> > +#  ifdef USE_AS_WCSLEN
>> > +       leal    -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
>> > +#  else
>> >         subl    $(VEC_SIZE * 3 + 1), %ecx
>> >         addl    %ecx, %eax
>> > +#  endif
>> >  # else
>> >         subl    %edx, %edi
>> >         addl    $(VEC_SIZE + 1), %edi
>> >         addl    %edi, %eax
>> >  # endif
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -153,14 +169,19 @@ L(first_vec_x3):
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Use ecx which was computed earlier to compute correct value.
>> >          */
>> > +#  ifdef USE_AS_WCSLEN
>> > +       leal    -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
>> > +#  else
>> >         subl    $(VEC_SIZE * 2 + 1), %ecx
>> >         addl    %ecx, %eax
>> > +#  endif
>> >  # else
>> >         subl    %edx, %edi
>> >         addl    $(VEC_SIZE * 2 + 1), %edi
>> >         addl    %edi, %eax
>> >  # endif
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -173,14 +194,19 @@ L(first_vec_x4):
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Use ecx which was computed earlier to compute correct value.
>> >          */
>> > +#  ifdef USE_AS_WCSLEN
>> > +       leal    -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
>> > +#  else
>> >         subl    $(VEC_SIZE + 1), %ecx
>> >         addl    %ecx, %eax
>> > +#  endif
>> >  # else
>> >         subl    %edx, %edi
>> >         addl    $(VEC_SIZE * 3 + 1), %edi
>> >         addl    %edi, %eax
>> >  # endif
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -195,10 +221,14 @@ L(cross_page_continue):
>> >         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>> >            since data is only aligned to VEC_SIZE.  */
>> >  # ifdef USE_AS_STRNLEN
>> > -       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
>> > -          it simplies the logic in last_4x_vec_or_less.  */
>> > +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
>> > +          because it simplies the logic in last_4x_vec_or_less.  */
>> >         leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
>> >         subq    %rdx, %rcx
>> > +#  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
>> > +       sarl    $2, %ecx
>> > +#  endif
>> >  # endif
>> >         /* Load first VEC regardless.  */
>> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
>> > @@ -207,34 +237,38 @@ L(cross_page_continue):
>> >         subq    %rcx, %rsi
>> >         jb      L(last_4x_vec_or_less)
>> >  # endif
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(first_vec_x1)
>> >
>> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(first_vec_x2)
>> >
>> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(first_vec_x3)
>> >
>> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(first_vec_x4)
>> >
>> >         /* Align data to VEC_SIZE * 4 - 1.  */
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Before adjusting length check if at last VEC_SIZE * 4.  */
>> > -       cmpq    $(VEC_SIZE * 4 - 1), %rsi
>> > +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
>> >         jbe     L(last_4x_vec_or_less_load)
>> >         incq    %rdi
>> >         movl    %edi, %ecx
>> >         orq     $(VEC_SIZE * 4 - 1), %rdi
>> >         andl    $(VEC_SIZE * 4 - 1), %ecx
>> > +#  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
>> > +       sarl    $2, %ecx
>> > +#  endif
>> >         /* Readjust length.  */
>> >         addq    %rcx, %rsi
>> >  # else
>> > @@ -246,13 +280,13 @@ L(cross_page_continue):
>> >  L(loop_4x_vec):
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Break if at end of length.  */
>> > -       subq    $(VEC_SIZE * 4), %rsi
>> > +       subq    $(CHAR_PER_VEC * 4), %rsi
>> >         jb      L(last_4x_vec_or_less_cmpeq)
>> >  # endif
>> > -       /* Save some code size by microfusing VPMINU with the load. Since
>> > -          the matches in ymm2/ymm4 can only be returned if there were no
>> > -          matches in ymm1/ymm3 respectively there is no issue with overlap.
>> > -        */
>> > +       /* Save some code size by microfusing VPMINU with the load.
>> > +          Since the matches in ymm2/ymm4 can only be returned if there
>> > +          were no matches in ymm1/ymm3 respectively there is no issue
>> > +          with overlap.  */
>> >         vmovdqa 1(%rdi), %ymm1
>> >         VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
>> >         vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
>> > @@ -260,7 +294,7 @@ L(loop_4x_vec):
>> >
>> >         VPMINU  %ymm2, %ymm4, %ymm5
>> >         VPCMPEQ %ymm5, %ymm0, %ymm5
>> > -       vpmovmskb       %ymm5, %ecx
>> > +       vpmovmskb %ymm5, %ecx
>> >
>> >         subq    $-(VEC_SIZE * 4), %rdi
>> >         testl   %ecx, %ecx
>> > @@ -268,27 +302,28 @@ L(loop_4x_vec):
>> >
>> >
>> >         VPCMPEQ %ymm1, %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         subq    %rdx, %rdi
>> >         testl   %eax, %eax
>> >         jnz     L(last_vec_return_x0)
>> >
>> >         VPCMPEQ %ymm2, %ymm0, %ymm2
>> > -       vpmovmskb       %ymm2, %eax
>> > +       vpmovmskb %ymm2, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(last_vec_return_x1)
>> >
>> >         /* Combine last 2 VEC.  */
>> >         VPCMPEQ %ymm3, %ymm0, %ymm3
>> > -       vpmovmskb       %ymm3, %eax
>> > -       /* rcx has combined result from all 4 VEC. It will only be used if
>> > -          the first 3 other VEC all did not contain a match.  */
>> > +       vpmovmskb %ymm3, %eax
>> > +       /* rcx has combined result from all 4 VEC. It will only be used
>> > +          if the first 3 other VEC all did not contain a match.  */
>> >         salq    $32, %rcx
>> >         orq     %rcx, %rax
>> >         tzcntq  %rax, %rax
>> >         subq    $(VEC_SIZE * 2 - 1), %rdi
>> >         addq    %rdi, %rax
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -297,15 +332,19 @@ L(loop_4x_vec):
>> >  # ifdef USE_AS_STRNLEN
>> >         .p2align 4
>> >  L(last_4x_vec_or_less_load):
>> > -       /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
>> > +       /* Depending on entry adjust rdi / prepare first VEC in ymm1.
>> > +        */
>> >         subq    $-(VEC_SIZE * 4), %rdi
>> >  L(last_4x_vec_or_less_cmpeq):
>> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
>> >  L(last_4x_vec_or_less):
>> > -
>> > -       vpmovmskb       %ymm1, %eax
>> > -       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
>> > -          VEC_SIZE * 4.  */
>> > +#  ifdef USE_AS_WCSLEN
>> > +       /* NB: Multiply length by 4 to get byte count.  */
>> > +       sall    $2, %esi
>> > +#  endif
>> > +       vpmovmskb %ymm1, %eax
>> > +       /* If remaining length > VEC_SIZE * 2. This works if esi is off
>> > +          by VEC_SIZE * 4.  */
>> >         testl   $(VEC_SIZE * 2), %esi
>> >         jnz     L(last_4x_vec)
>> >
>> > @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
>> >         jb      L(max)
>> >
>> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         tzcntl  %eax, %eax
>> >         /* Check the end of data.  */
>> >         cmpl    %eax, %esi
>> > @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
>> >         addl    $(VEC_SIZE + 1), %eax
>> >         addq    %rdi, %rax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -340,6 +380,7 @@ L(last_vec_return_x0):
>> >         subq    $(VEC_SIZE * 4 - 1), %rdi
>> >         addq    %rdi, %rax
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -350,6 +391,7 @@ L(last_vec_return_x1):
>> >         subq    $(VEC_SIZE * 3 - 1), %rdi
>> >         addq    %rdi, %rax
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -366,6 +408,7 @@ L(last_vec_x1_check):
>> >         incl    %eax
>> >         addq    %rdi, %rax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -381,14 +424,14 @@ L(last_4x_vec):
>> >         jnz     L(last_vec_x1)
>> >
>> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(last_vec_x2)
>> >
>> >         /* Normalize length.  */
>> >         andl    $(VEC_SIZE * 4 - 1), %esi
>> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(last_vec_x3)
>> >
>> > @@ -396,7 +439,7 @@ L(last_4x_vec):
>> >         jb      L(max)
>> >
>> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         tzcntl  %eax, %eax
>> >         /* Check the end of data.  */
>> >         cmpl    %eax, %esi
>> > @@ -405,6 +448,7 @@ L(last_4x_vec):
>> >         addl    $(VEC_SIZE * 3 + 1), %eax
>> >         addq    %rdi, %rax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -419,6 +463,7 @@ L(last_vec_x1):
>> >         incl    %eax
>> >         addq    %rdi, %rax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -432,6 +477,7 @@ L(last_vec_x2):
>> >         addl    $(VEC_SIZE + 1), %eax
>> >         addq    %rdi, %rax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -447,6 +493,7 @@ L(last_vec_x3):
>> >         addl    $(VEC_SIZE * 2 + 1), %eax
>> >         addq    %rdi, %rax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -455,13 +502,13 @@ L(max_end):
>> >         VZEROUPPER_RETURN
>> >  # endif
>> >
>> > -       /* Cold case for crossing page with first load.  */
>> > +       /* Cold case for crossing page with first load.  */
>> >         .p2align 4
>> >  L(cross_page_boundary):
>> >         /* Align data to VEC_SIZE - 1.  */
>> >         orq     $(VEC_SIZE - 1), %rdi
>> >         VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
>> >            so no need to manually mod rdx.  */
>> >         sarxl   %edx, %eax, %eax
>> > @@ -470,6 +517,10 @@ L(cross_page_boundary):
>> >         jnz     L(cross_page_less_vec)
>> >         leaq    1(%rdi), %rcx
>> >         subq    %rdx, %rcx
>> > +#  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> > +       shrl    $2, %ecx
>> > +#  endif
>> >         /* Check length.  */
>> >         cmpq    %rsi, %rcx
>> >         jb      L(cross_page_continue)
>> > @@ -479,6 +530,7 @@ L(cross_page_boundary):
>> >         jz      L(cross_page_continue)
>> >         tzcntl  %eax, %eax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide length by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  #  endif
>> >  # endif
>> > @@ -489,6 +541,10 @@ L(return_vzeroupper):
>> >         .p2align 4
>> >  L(cross_page_less_vec):
>> >         tzcntl  %eax, %eax
>> > +#  ifdef USE_AS_WCSLEN
>> > +       /* NB: Multiply length by 4 to get byte count.  */
>> > +       sall    $2, %esi
>> > +#  endif
>> >         cmpq    %rax, %rsi
>> >         cmovb   %esi, %eax
>> >  #  ifdef USE_AS_WCSLEN
>> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
>> > index d223ea1700..3fc6734910 100644
>> > --- a/sysdeps/x86_64/strlen.S
>> > +++ b/sysdeps/x86_64/strlen.S
>> > @@ -65,12 +65,24 @@ ENTRY(strlen)
>> >         ret
>> >  L(n_nonzero):
>> >  # ifdef AS_WCSLEN
>> > -       shl     $2, %RSI_LP
>> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
>> > +   overflow the only way this program doesn't have undefined behavior
>> > +   is if there is a null terminator in valid memory so strlen will
>> > +   suffice.  */
>> > +       mov     %RSI_LP, %R10_LP
>> > +       sar     $62, %R10_LP
>> > +       test    %R10_LP, %R10_LP
>> > +       jnz     __wcslen_sse2
>>
>> Branch to  __wcslen_sse2 is wrong for 2 reasons:
>>
>> 1.  __wcslen_sse2 is undefined with --disable-multi-arch.
>
> Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?
>
>>
>> 2. You should skip ENDBR64 at function entry.
>>
>> Please create a new label and branch to it.
>>
> I am not quite sure how to do this. I am trying to use
> strstr-sse2-unaligned.S as a template:
> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
> which appears to make a direct call to the global label of __strchr_sse2
> without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.

This is different since all files are in sysdeps/x86_64/multiarch.

> Is there an example in the code you know of I can follow?

There is no existing code that does exactly the same thing.

memmove-vec-unaligned-erms.S has

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
        movq    %rdi, %rax
L(start):   <<<<<<<<<<<<<<  This is equivalent to __wcslen_sse2.
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl    %edx, %edx
# endif

>>
>> > +       sal     $2, %RSI_LP
>> >  # endif
>> >
>> >  /* Initialize long lived registers.  */
>> >
>> >         add     %RDI_LP, %RSI_LP
>> > +# ifdef AS_WCSLEN
>> > +/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
>> > +       jbe     __wcslen_sse2
>> > +# endif
>> >         mov     %RSI_LP, %R10_LP
>> >         and     $-64, %R10_LP
>> >         mov     %RSI_LP, %R11_LP
>> > --
>> > 2.25.1
>> >
>>
>> Thanks.
>>
>>
>> --
>> H.J.



-- 
H.J.
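
In C terms, the two checks added in the strlen.S hunk quoted above
amount to roughly the following (a sketch with hypothetical names, not
the glibc code; it assumes a 64-bit size_t and sizeof (wchar_t) == 4):

#include <stddef.h>
#include <stdint.h>

/* Sketch: nonzero if the wcsnlen fast path must fall back to a
   plain wcslen-style scan.  */
static inline int
must_fall_back (uintptr_t s, size_t maxlen)
{
  /* First check (the "sar $62" test): maxlen * sizeof (wchar_t)
     overflows size_t iff either of the top two bits is set.  */
  if (maxlen >> 62)
    return 1;
  /* Second check (the "jbe" after the add): the end pointer wraps
     past the top of the address space (carry) or lands at 0.  */
  uintptr_t end = s + maxlen * sizeof (wchar_t);
  return end < s || end == 0;
}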

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
  2021-06-22 23:28       ` H.J. Lu
@ 2021-06-23  3:11         ` Noah Goldstein
  2021-06-23  3:58           ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-23  3:11 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jun 22, 2021 at 7:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> On Tue, Jun 22, 2021 at 4:16 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> >
> >
> > On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >> >
> >> > This commit fixes the bug mentioned in the previous commit.
> >> >
> >> > The previous implementations of wmemchr in these files relied
> >> > on maxlen * sizeof(wchar_t) which was not guaranteed by the standard.
> >> >
> >> > The new overflow tests added in the previous commit now
> >> > pass (As well as all the other tests).
> >> >
> >> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> >> > ---
> >> >  sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++-------
> >> >  sysdeps/x86_64/strlen.S                |  14 ++-
> >> >  2 files changed, 106 insertions(+), 38 deletions(-)
> >> >
> >> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> > index bd2e6ee44a..b282a75613 100644
> >> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> > @@ -44,21 +44,21 @@
> >> >
> >> >  # define VEC_SIZE 32
> >> >  # define PAGE_SIZE 4096
> >> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> >> >
> >> >         .section SECTION(.text),"ax",@progbits
> >> >  ENTRY (STRLEN)
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Check zero length.  */
> >> > +#  ifdef __ILP32__
> >> > +       /* Clear upper bits.  */
> >> > +       and     %RSI_LP, %RSI_LP
> >> > +#  else
> >> >         test    %RSI_LP, %RSI_LP
> >> > +#  endif
> >> >         jz      L(zero)
> >> >         /* Store max len in R8_LP before adjusting if using WCSLEN.  */
> >> >         mov     %RSI_LP, %R8_LP
> >> > -#  ifdef USE_AS_WCSLEN
> >> > -       shl     $2, %RSI_LP
> >> > -#  elif defined __ILP32__
> >> > -       /* Clear the upper 32 bits.  */
> >> > -       movl    %esi, %esi
> >> > -#  endif
> >> >  # endif
> >> >         movl    %edi, %eax
> >> >         movq    %rdi, %rdx
> >> > @@ -72,10 +72,10 @@ ENTRY (STRLEN)
> >> >
> >> >         /* Check the first VEC_SIZE bytes.  */
> >> >         VPCMPEQ (%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* If length < VEC_SIZE handle special.  */
> >> > -       cmpq    $VEC_SIZE, %rsi
> >> > +       cmpq    $CHAR_PER_VEC, %rsi
> >> >         jbe     L(first_vec_x0)
> >> >  # endif
> >> >         /* If empty continue to aligned_more. Otherwise return bit
> >> > @@ -84,6 +84,7 @@ ENTRY (STRLEN)
> >> >         jz      L(aligned_more)
> >> >         tzcntl  %eax, %eax
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -97,9 +98,14 @@ L(zero):
> >> >  L(first_vec_x0):
> >> >         /* Set bit for max len so that tzcnt will return min of max len
> >> >            and position of first match.  */
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Multiply length by 4 to get byte count.  */
> >> > +       sall    $2, %esi
> >> > +#  endif
> >> >         btsq    %rsi, %rax
> >> >         tzcntl  %eax, %eax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -113,14 +119,19 @@ L(first_vec_x1):
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Use ecx which was computed earlier to compute correct value.
> >> >          */
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       leal    -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
> >> > +#  else
> >> >         subl    $(VEC_SIZE * 4 + 1), %ecx
> >> >         addl    %ecx, %eax
> >> > +#  endif
> >> >  # else
> >> >         subl    %edx, %edi
> >> >         incl    %edi
> >> >         addl    %edi, %eax
> >> >  # endif
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -133,14 +144,19 @@ L(first_vec_x2):
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Use ecx which was computed earlier to compute correct value.
> >> >          */
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       leal    -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
> >> > +#  else
> >> >         subl    $(VEC_SIZE * 3 + 1), %ecx
> >> >         addl    %ecx, %eax
> >> > +#  endif
> >> >  # else
> >> >         subl    %edx, %edi
> >> >         addl    $(VEC_SIZE + 1), %edi
> >> >         addl    %edi, %eax
> >> >  # endif
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -153,14 +169,19 @@ L(first_vec_x3):
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Use ecx which was computed earlier to compute correct value.
> >> >          */
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       leal    -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
> >> > +#  else
> >> >         subl    $(VEC_SIZE * 2 + 1), %ecx
> >> >         addl    %ecx, %eax
> >> > +#  endif
> >> >  # else
> >> >         subl    %edx, %edi
> >> >         addl    $(VEC_SIZE * 2 + 1), %edi
> >> >         addl    %edi, %eax
> >> >  # endif
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -173,14 +194,19 @@ L(first_vec_x4):
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Use ecx which was computed earlier to compute correct value.
> >> >          */
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       leal    -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
> >> > +#  else
> >> >         subl    $(VEC_SIZE + 1), %ecx
> >> >         addl    %ecx, %eax
> >> > +#  endif
> >> >  # else
> >> >         subl    %edx, %edi
> >> >         addl    $(VEC_SIZE * 3 + 1), %edi
> >> >         addl    %edi, %eax
> >> >  # endif
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -195,10 +221,14 @@ L(cross_page_continue):
> >> >         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> >> >            since data is only aligned to VEC_SIZE.  */
> >> >  # ifdef USE_AS_STRNLEN
> >> > -       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> >> > -          it simplifies the logic in last_4x_vec_or_less.  */
> >> > +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> >> > +          because it simplifies the logic in last_4x_vec_or_less.  */
> >> >         leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> >> >         subq    %rdx, %rcx
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> >> > +       sarl    $2, %ecx
> >> > +#  endif
> >> >  # endif
> >> >         /* Load first VEC regardless.  */
> >> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >> > @@ -207,34 +237,38 @@ L(cross_page_continue):
> >> >         subq    %rcx, %rsi
> >> >         jb      L(last_4x_vec_or_less)
> >> >  # endif
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(first_vec_x1)
> >> >
> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(first_vec_x2)
> >> >
> >> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(first_vec_x3)
> >> >
> >> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(first_vec_x4)
> >> >
> >> >         /* Align data to VEC_SIZE * 4 - 1.  */
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Before adjusting length check if at last VEC_SIZE * 4.  */
> >> > -       cmpq    $(VEC_SIZE * 4 - 1), %rsi
> >> > +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
> >> >         jbe     L(last_4x_vec_or_less_load)
> >> >         incq    %rdi
> >> >         movl    %edi, %ecx
> >> >         orq     $(VEC_SIZE * 4 - 1), %rdi
> >> >         andl    $(VEC_SIZE * 4 - 1), %ecx
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> >> > +       sarl    $2, %ecx
> >> > +#  endif
> >> >         /* Readjust length.  */
> >> >         addq    %rcx, %rsi
> >> >  # else
> >> > @@ -246,13 +280,13 @@ L(cross_page_continue):
> >> >  L(loop_4x_vec):
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Break if at end of length.  */
> >> > -       subq    $(VEC_SIZE * 4), %rsi
> >> > +       subq    $(CHAR_PER_VEC * 4), %rsi
> >> >         jb      L(last_4x_vec_or_less_cmpeq)
> >> >  # endif
> >> > -       /* Save some code size by microfusing VPMINU with the load. Since
> >> > -          the matches in ymm2/ymm4 can only be returned if there were no
> >> > -          matches in ymm1/ymm3 respectively there is no issue with overlap.
> >> > -        */
> >> > +       /* Save some code size by microfusing VPMINU with the load.
> >> > +          Since the matches in ymm2/ymm4 can only be returned if there
> >> > +          were no matches in ymm1/ymm3 respectively there is no issue
> >> > +          with overlap.  */
> >> >         vmovdqa 1(%rdi), %ymm1
> >> >         VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> >> >         vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> >> > @@ -260,7 +294,7 @@ L(loop_4x_vec):
> >> >
> >> >         VPMINU  %ymm2, %ymm4, %ymm5
> >> >         VPCMPEQ %ymm5, %ymm0, %ymm5
> >> > -       vpmovmskb       %ymm5, %ecx
> >> > +       vpmovmskb %ymm5, %ecx
> >> >
> >> >         subq    $-(VEC_SIZE * 4), %rdi
> >> >         testl   %ecx, %ecx
> >> > @@ -268,27 +302,28 @@ L(loop_4x_vec):
> >> >
> >> >
> >> >         VPCMPEQ %ymm1, %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         subq    %rdx, %rdi
> >> >         testl   %eax, %eax
> >> >         jnz     L(last_vec_return_x0)
> >> >
> >> >         VPCMPEQ %ymm2, %ymm0, %ymm2
> >> > -       vpmovmskb       %ymm2, %eax
> >> > +       vpmovmskb %ymm2, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(last_vec_return_x1)
> >> >
> >> >         /* Combine last 2 VEC.  */
> >> >         VPCMPEQ %ymm3, %ymm0, %ymm3
> >> > -       vpmovmskb       %ymm3, %eax
> >> > -       /* rcx has combined result from all 4 VEC. It will only be used if
> >> > -          the first 3 other VEC all did not contain a match.  */
> >> > +       vpmovmskb %ymm3, %eax
> >> > +       /* rcx has combined result from all 4 VEC. It will only be used
> >> > +          if the first 3 other VEC all did not contain a match.  */
> >> >         salq    $32, %rcx
> >> >         orq     %rcx, %rax
> >> >         tzcntq  %rax, %rax
> >> >         subq    $(VEC_SIZE * 2 - 1), %rdi
> >> >         addq    %rdi, %rax
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -297,15 +332,19 @@ L(loop_4x_vec):
> >> >  # ifdef USE_AS_STRNLEN
> >> >         .p2align 4
> >> >  L(last_4x_vec_or_less_load):
> >> > -       /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> >> > +       /* Depending on entry adjust rdi / prepare first VEC in ymm1.
> >> > +        */
> >> >         subq    $-(VEC_SIZE * 4), %rdi
> >> >  L(last_4x_vec_or_less_cmpeq):
> >> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >> >  L(last_4x_vec_or_less):
> >> > -
> >> > -       vpmovmskb       %ymm1, %eax
> >> > -       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> >> > -          VEC_SIZE * 4.  */
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Multiply length by 4 to get byte count.  */
> >> > +       sall    $2, %esi
> >> > +#  endif
> >> > +       vpmovmskb %ymm1, %eax
> >> > +       /* If remaining length > VEC_SIZE * 2. This works if esi is off
> >> > +          by VEC_SIZE * 4.  */
> >> >         testl   $(VEC_SIZE * 2), %esi
> >> >         jnz     L(last_4x_vec)
> >> >
> >> > @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
> >> >         jb      L(max)
> >> >
> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         tzcntl  %eax, %eax
> >> >         /* Check the end of data.  */
> >> >         cmpl    %eax, %esi
> >> > @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
> >> >         addl    $(VEC_SIZE + 1), %eax
> >> >         addq    %rdi, %rax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -340,6 +380,7 @@ L(last_vec_return_x0):
> >> >         subq    $(VEC_SIZE * 4 - 1), %rdi
> >> >         addq    %rdi, %rax
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -350,6 +391,7 @@ L(last_vec_return_x1):
> >> >         subq    $(VEC_SIZE * 3 - 1), %rdi
> >> >         addq    %rdi, %rax
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -366,6 +408,7 @@ L(last_vec_x1_check):
> >> >         incl    %eax
> >> >         addq    %rdi, %rax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -381,14 +424,14 @@ L(last_4x_vec):
> >> >         jnz     L(last_vec_x1)
> >> >
> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(last_vec_x2)
> >> >
> >> >         /* Normalize length.  */
> >> >         andl    $(VEC_SIZE * 4 - 1), %esi
> >> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(last_vec_x3)
> >> >
> >> > @@ -396,7 +439,7 @@ L(last_4x_vec):
> >> >         jb      L(max)
> >> >
> >> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         tzcntl  %eax, %eax
> >> >         /* Check the end of data.  */
> >> >         cmpl    %eax, %esi
> >> > @@ -405,6 +448,7 @@ L(last_4x_vec):
> >> >         addl    $(VEC_SIZE * 3 + 1), %eax
> >> >         addq    %rdi, %rax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -419,6 +463,7 @@ L(last_vec_x1):
> >> >         incl    %eax
> >> >         addq    %rdi, %rax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -432,6 +477,7 @@ L(last_vec_x2):
> >> >         addl    $(VEC_SIZE + 1), %eax
> >> >         addq    %rdi, %rax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -447,6 +493,7 @@ L(last_vec_x3):
> >> >         addl    $(VEC_SIZE * 2 + 1), %eax
> >> >         addq    %rdi, %rax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -455,13 +502,13 @@ L(max_end):
> >> >         VZEROUPPER_RETURN
> >> >  # endif
> >> >
> >> > -       /* Cold case for crossing page with first load.  */
> >> > +       /* Cold case for crossing page with first load.  */
> >> >         .p2align 4
> >> >  L(cross_page_boundary):
> >> >         /* Align data to VEC_SIZE - 1.  */
> >> >         orq     $(VEC_SIZE - 1), %rdi
> >> >         VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> >> >            so no need to manually mod rdx.  */
> >> >         sarxl   %edx, %eax, %eax
> >> > @@ -470,6 +517,10 @@ L(cross_page_boundary):
> >> >         jnz     L(cross_page_less_vec)
> >> >         leaq    1(%rdi), %rcx
> >> >         subq    %rdx, %rcx
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> > +       shrl    $2, %ecx
> >> > +#  endif
> >> >         /* Check length.  */
> >> >         cmpq    %rsi, %rcx
> >> >         jb      L(cross_page_continue)
> >> > @@ -479,6 +530,7 @@ L(cross_page_boundary):
> >> >         jz      L(cross_page_continue)
> >> >         tzcntl  %eax, %eax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide length by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  #  endif
> >> >  # endif
> >> > @@ -489,6 +541,10 @@ L(return_vzeroupper):
> >> >         .p2align 4
> >> >  L(cross_page_less_vec):
> >> >         tzcntl  %eax, %eax
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Multiply length by 4 to get byte count.  */
> >> > +       sall    $2, %esi
> >> > +#  endif
> >> >         cmpq    %rax, %rsi
> >> >         cmovb   %esi, %eax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> >> > index d223ea1700..3fc6734910 100644
> >> > --- a/sysdeps/x86_64/strlen.S
> >> > +++ b/sysdeps/x86_64/strlen.S
> >> > @@ -65,12 +65,24 @@ ENTRY(strlen)
> >> >         ret
> >> >  L(n_nonzero):
> >> >  # ifdef AS_WCSLEN
> >> > -       shl     $2, %RSI_LP
> >> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> >> > +   overflow the only way this program doesn't have undefined behavior
> >> > +   is if there is a null terminator in valid memory so strlen will
> >> > +   suffice.  */
> >> > +       mov     %RSI_LP, %R10_LP
> >> > +       sar     $62, %R10_LP
> >> > +       test    %R10_LP, %R10_LP
> >> > +       jnz     __wcslen_sse2
> >>
> >> Branch to  __wcslen_sse2 is wrong for 2 reasons:
> >>
> >> 1.  __wcslen_sse2 is undefined with --disable-multi-arch.
> >
> > Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?
> >
> >>
> >> 2. You should skip ENDBR64 at function entry.
> >>
> >> Please create a new label and branch to it.
> >>
> > I am not quite sure how to do this. I am trying to use
> > strstr-sse2-unaligned.S as a template:
> > https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
> > which appears to make a direct call to the global label of __strchr_sse2
> > without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.


> This is different since all files are in sysdeps/x86_64/multiarch.
>

I see. So it turns out we are missing wcslen_sse4_1 which strlen.S
can also implement (it passes all tests). Would jumping to that be
valid?
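
For any caller with defined behavior the two agree in the overflow
case, along the lines of this quick check (a sketch, not one of the
real tests):

#include <assert.h>
#include <stdint.h>
#include <wchar.h>

int
main (void)
{
  const wchar_t *s = L"abc";
  /* huge * sizeof (wchar_t) overflows size_t, yet the call is well
     defined because s is terminated, so the result must equal
     wcslen (s).  */
  size_t huge = (SIZE_MAX >> 2) + 1;
  assert (wcsnlen (s, huge) == wcslen (s));
  return 0;
}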

Otherwise I think the best bet is to add a target for wcslen_sse4_1
and define it and wcsnlen_sse4_1 in the same file so the label is visible.
The only issue is that the #defines in strlen.S all need to be protected,
which is a bit messy. If we don't want to define wcslen_sse4_1 for whatever
reason, I already have this approach working by defining
wcsnlen_sse4_1 in the same file as wcslen-sse2.S and entering from
a local label. But looking at the code it seems the strlen.S version is a bit
better optimized. Thoughts?
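
If wcslen_sse4_1 gets its own entry, the wrapper could mirror the
existing wcsnlen-sse4_1.S pattern (hypothetical file, sketch only):

/* sysdeps/x86_64/multiarch/wcslen-sse4_1.S  */
#define AS_WCSLEN
#define strlen  __wcslen_sse4_1
#include "../strlen.S"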


> > Is there an example in the code you know of I can follow?
>
> There are no exact same codes.
>
> memmove-vec-unaligned-erms.S has
>
> ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
>         movq    %rdi, %rax
> L(start):   <<<<<<<<<<<<<<  This is equivalent to __wcslen_sse2.
> # ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         movl    %edx, %edx
> # endif


> >>
> >> > +       sal     $2, %RSI_LP
> >> >  # endif
> >> >
> >> >  /* Initialize long lived registers.  */
> >> >
> >> >         add     %RDI_LP, %RSI_LP
> >> > +# ifdef AS_WCSLEN
> >> > +/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
> >> > +       jbe     __wcslen_sse2
> >> > +# endif
> >> >         mov     %RSI_LP, %R10_LP
> >> >         and     $-64, %R10_LP
> >> >         mov     %RSI_LP, %R11_LP
> >> > --
> >> > 2.25.1
> >> >
> >>
> >> Thanks.
> >>
> >>
> >> --
> >> H.J.
>
>
>
> --
> H.J.
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
  2021-06-23  3:11         ` Noah Goldstein
@ 2021-06-23  3:58           ` H.J. Lu
  2021-06-23  4:55             ` Noah Goldstein
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2021-06-23  3:58 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

 On Tue, Jun 22, 2021 at 8:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
>
> On Tue, Jun 22, 2021 at 7:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Tue, Jun 22, 2021 at 4:16 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >
>> >
>> >
>> > On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>> >>
>> >> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >> >
>> >> > This commit fixes the bug mentioned in the previous commit.
>> >> >
>> >> > The previous implementations of wmemchr in these files relied
>> >> > on maxlen * sizeof(wchar_t) which was not guaranteed by the standard.
>> >> >
>> >> > The new overflow tests added in the previous commit now
>> >> > pass (As well as all the other tests).
>> >> >
>> >> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
>> >> > ---
>> >> >  sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
>> >> >  sysdeps/x86_64/strlen.S                |  14 ++-
>> >> >  2 files changed, 106 insertions(+), 38 deletions(-)
>> >> >
>> >> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
>> >> > index bd2e6ee44a..b282a75613 100644
>> >> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
>> >> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
>> >> > @@ -44,21 +44,21 @@
>> >> >
>> >> >  # define VEC_SIZE 32
>> >> >  # define PAGE_SIZE 4096
>> >> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>> >> >
>> >> >         .section SECTION(.text),"ax",@progbits
>> >> >  ENTRY (STRLEN)
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Check zero length.  */
>> >> > +#  ifdef __ILP32__
>> >> > +       /* Clear upper bits.  */
>> >> > +       and     %RSI_LP, %RSI_LP
>> >> > +#  else
>> >> >         test    %RSI_LP, %RSI_LP
>> >> > +#  endif
>> >> >         jz      L(zero)
>> >> >         /* Store max len in R8_LP before adjusting if using WCSLEN.  */
>> >> >         mov     %RSI_LP, %R8_LP
>> >> > -#  ifdef USE_AS_WCSLEN
>> >> > -       shl     $2, %RSI_LP
>> >> > -#  elif defined __ILP32__
>> >> > -       /* Clear the upper 32 bits.  */
>> >> > -       movl    %esi, %esi
>> >> > -#  endif
>> >> >  # endif
>> >> >         movl    %edi, %eax
>> >> >         movq    %rdi, %rdx
>> >> > @@ -72,10 +72,10 @@ ENTRY (STRLEN)
>> >> >
>> >> >         /* Check the first VEC_SIZE bytes.  */
>> >> >         VPCMPEQ (%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* If length < VEC_SIZE handle special.  */
>> >> > -       cmpq    $VEC_SIZE, %rsi
>> >> > +       cmpq    $CHAR_PER_VEC, %rsi
>> >> >         jbe     L(first_vec_x0)
>> >> >  # endif
>> >> >         /* If empty continue to aligned_more. Otherwise return bit
>> >> > @@ -84,6 +84,7 @@ ENTRY (STRLEN)
>> >> >         jz      L(aligned_more)
>> >> >         tzcntl  %eax, %eax
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -97,9 +98,14 @@ L(zero):
>> >> >  L(first_vec_x0):
>> >> >         /* Set bit for max len so that tzcnt will return min of max len
>> >> >            and position of first match.  */
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Multiply length by 4 to get byte count.  */
>> >> > +       sall    $2, %esi
>> >> > +#  endif
>> >> >         btsq    %rsi, %rax
>> >> >         tzcntl  %eax, %eax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -113,14 +119,19 @@ L(first_vec_x1):
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Use ecx which was computed earlier to compute correct value.
>> >> >          */
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       leal    -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
>> >> > +#  else
>> >> >         subl    $(VEC_SIZE * 4 + 1), %ecx
>> >> >         addl    %ecx, %eax
>> >> > +#  endif
>> >> >  # else
>> >> >         subl    %edx, %edi
>> >> >         incl    %edi
>> >> >         addl    %edi, %eax
>> >> >  # endif
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -133,14 +144,19 @@ L(first_vec_x2):
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Use ecx which was computed earlier to compute correct value.
>> >> >          */
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       leal    -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
>> >> > +#  else
>> >> >         subl    $(VEC_SIZE * 3 + 1), %ecx
>> >> >         addl    %ecx, %eax
>> >> > +#  endif
>> >> >  # else
>> >> >         subl    %edx, %edi
>> >> >         addl    $(VEC_SIZE + 1), %edi
>> >> >         addl    %edi, %eax
>> >> >  # endif
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -153,14 +169,19 @@ L(first_vec_x3):
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Use ecx which was computed earlier to compute correct value.
>> >> >          */
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       leal    -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
>> >> > +#  else
>> >> >         subl    $(VEC_SIZE * 2 + 1), %ecx
>> >> >         addl    %ecx, %eax
>> >> > +#  endif
>> >> >  # else
>> >> >         subl    %edx, %edi
>> >> >         addl    $(VEC_SIZE * 2 + 1), %edi
>> >> >         addl    %edi, %eax
>> >> >  # endif
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -173,14 +194,19 @@ L(first_vec_x4):
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Use ecx which was computed earlier to compute correct value.
>> >> >          */
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       leal    -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
>> >> > +#  else
>> >> >         subl    $(VEC_SIZE + 1), %ecx
>> >> >         addl    %ecx, %eax
>> >> > +#  endif
>> >> >  # else
>> >> >         subl    %edx, %edi
>> >> >         addl    $(VEC_SIZE * 3 + 1), %edi
>> >> >         addl    %edi, %eax
>> >> >  # endif
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -195,10 +221,14 @@ L(cross_page_continue):
>> >> >         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>> >> >            since data is only aligned to VEC_SIZE.  */
>> >> >  # ifdef USE_AS_STRNLEN
>> >> > -       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
>> >> > -          it simplifies the logic in last_4x_vec_or_less.  */
>> >> > +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
>> >> > +          because it simplifies the logic in last_4x_vec_or_less.  */
>> >> >         leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
>> >> >         subq    %rdx, %rcx
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
>> >> > +       sarl    $2, %ecx
>> >> > +#  endif
>> >> >  # endif
>> >> >         /* Load first VEC regardless.  */
>> >> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
>> >> > @@ -207,34 +237,38 @@ L(cross_page_continue):
>> >> >         subq    %rcx, %rsi
>> >> >         jb      L(last_4x_vec_or_less)
>> >> >  # endif
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(first_vec_x1)
>> >> >
>> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(first_vec_x2)
>> >> >
>> >> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(first_vec_x3)
>> >> >
>> >> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(first_vec_x4)
>> >> >
>> >> >         /* Align data to VEC_SIZE * 4 - 1.  */
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Before adjusting length check if at last VEC_SIZE * 4.  */
>> >> > -       cmpq    $(VEC_SIZE * 4 - 1), %rsi
>> >> > +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
>> >> >         jbe     L(last_4x_vec_or_less_load)
>> >> >         incq    %rdi
>> >> >         movl    %edi, %ecx
>> >> >         orq     $(VEC_SIZE * 4 - 1), %rdi
>> >> >         andl    $(VEC_SIZE * 4 - 1), %ecx
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
>> >> > +       sarl    $2, %ecx
>> >> > +#  endif
>> >> >         /* Readjust length.  */
>> >> >         addq    %rcx, %rsi
>> >> >  # else
>> >> > @@ -246,13 +280,13 @@ L(cross_page_continue):
>> >> >  L(loop_4x_vec):
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Break if at end of length.  */
>> >> > -       subq    $(VEC_SIZE * 4), %rsi
>> >> > +       subq    $(CHAR_PER_VEC * 4), %rsi
>> >> >         jb      L(last_4x_vec_or_less_cmpeq)
>> >> >  # endif
>> >> > -       /* Save some code size by microfusing VPMINU with the load. Since
>> >> > -          the matches in ymm2/ymm4 can only be returned if there were no
>> >> > -          matches in ymm1/ymm3 respectively there is no issue with overlap.
>> >> > -        */
>> >> > +       /* Save some code size by microfusing VPMINU with the load.
>> >> > +          Since the matches in ymm2/ymm4 can only be returned if there
>> >> > +          were no matches in ymm1/ymm3 respectively there is no issue
>> >> > +          with overlap.  */
>> >> >         vmovdqa 1(%rdi), %ymm1
>> >> >         VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
>> >> >         vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
>> >> > @@ -260,7 +294,7 @@ L(loop_4x_vec):
>> >> >
>> >> >         VPMINU  %ymm2, %ymm4, %ymm5
>> >> >         VPCMPEQ %ymm5, %ymm0, %ymm5
>> >> > -       vpmovmskb       %ymm5, %ecx
>> >> > +       vpmovmskb %ymm5, %ecx
>> >> >
>> >> >         subq    $-(VEC_SIZE * 4), %rdi
>> >> >         testl   %ecx, %ecx
>> >> > @@ -268,27 +302,28 @@ L(loop_4x_vec):
>> >> >
>> >> >
>> >> >         VPCMPEQ %ymm1, %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         subq    %rdx, %rdi
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(last_vec_return_x0)
>> >> >
>> >> >         VPCMPEQ %ymm2, %ymm0, %ymm2
>> >> > -       vpmovmskb       %ymm2, %eax
>> >> > +       vpmovmskb %ymm2, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(last_vec_return_x1)
>> >> >
>> >> >         /* Combine last 2 VEC.  */
>> >> >         VPCMPEQ %ymm3, %ymm0, %ymm3
>> >> > -       vpmovmskb       %ymm3, %eax
>> >> > -       /* rcx has combined result from all 4 VEC. It will only be used if
>> >> > -          the first 3 other VEC all did not contain a match.  */
>> >> > +       vpmovmskb %ymm3, %eax
>> >> > +       /* rcx has combined result from all 4 VEC. It will only be used
>> >> > +          if the first 3 other VEC all did not contain a match.  */
>> >> >         salq    $32, %rcx
>> >> >         orq     %rcx, %rax
>> >> >         tzcntq  %rax, %rax
>> >> >         subq    $(VEC_SIZE * 2 - 1), %rdi
>> >> >         addq    %rdi, %rax
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -297,15 +332,19 @@ L(loop_4x_vec):
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         .p2align 4
>> >> >  L(last_4x_vec_or_less_load):
>> >> > -       /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
>> >> > +       /* Depending on entry adjust rdi / prepare first VEC in ymm1.
>> >> > +        */
>> >> >         subq    $-(VEC_SIZE * 4), %rdi
>> >> >  L(last_4x_vec_or_less_cmpeq):
>> >> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
>> >> >  L(last_4x_vec_or_less):
>> >> > -
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > -       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
>> >> > -          VEC_SIZE * 4.  */
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Multiply length by 4 to get byte count.  */
>> >> > +       sall    $2, %esi
>> >> > +#  endif
>> >> > +       vpmovmskb %ymm1, %eax
>> >> > +       /* If remaining length > VEC_SIZE * 2. This works if esi is off
>> >> > +          by VEC_SIZE * 4.  */
>> >> >         testl   $(VEC_SIZE * 2), %esi
>> >> >         jnz     L(last_4x_vec)
>> >> >
>> >> > @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
>> >> >         jb      L(max)
>> >> >
>> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         tzcntl  %eax, %eax
>> >> >         /* Check the end of data.  */
>> >> >         cmpl    %eax, %esi
>> >> > @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
>> >> >         addl    $(VEC_SIZE + 1), %eax
>> >> >         addq    %rdi, %rax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -340,6 +380,7 @@ L(last_vec_return_x0):
>> >> >         subq    $(VEC_SIZE * 4 - 1), %rdi
>> >> >         addq    %rdi, %rax
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -350,6 +391,7 @@ L(last_vec_return_x1):
>> >> >         subq    $(VEC_SIZE * 3 - 1), %rdi
>> >> >         addq    %rdi, %rax
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -366,6 +408,7 @@ L(last_vec_x1_check):
>> >> >         incl    %eax
>> >> >         addq    %rdi, %rax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -381,14 +424,14 @@ L(last_4x_vec):
>> >> >         jnz     L(last_vec_x1)
>> >> >
>> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(last_vec_x2)
>> >> >
>> >> >         /* Normalize length.  */
>> >> >         andl    $(VEC_SIZE * 4 - 1), %esi
>> >> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(last_vec_x3)
>> >> >
>> >> > @@ -396,7 +439,7 @@ L(last_4x_vec):
>> >> >         jb      L(max)
>> >> >
>> >> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         tzcntl  %eax, %eax
>> >> >         /* Check the end of data.  */
>> >> >         cmpl    %eax, %esi
>> >> > @@ -405,6 +448,7 @@ L(last_4x_vec):
>> >> >         addl    $(VEC_SIZE * 3 + 1), %eax
>> >> >         addq    %rdi, %rax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -419,6 +463,7 @@ L(last_vec_x1):
>> >> >         incl    %eax
>> >> >         addq    %rdi, %rax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -432,6 +477,7 @@ L(last_vec_x2):
>> >> >         addl    $(VEC_SIZE + 1), %eax
>> >> >         addq    %rdi, %rax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -447,6 +493,7 @@ L(last_vec_x3):
>> >> >         addl    $(VEC_SIZE * 2 + 1), %eax
>> >> >         addq    %rdi, %rax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -455,13 +502,13 @@ L(max_end):
>> >> >         VZEROUPPER_RETURN
>> >> >  # endif
>> >> >
>> >> > -       /* Cold case for crossing page with first load.  */
>> >> > +       /* Cold case for crossing page with first load.  */
>> >> >         .p2align 4
>> >> >  L(cross_page_boundary):
>> >> >         /* Align data to VEC_SIZE - 1.  */
>> >> >         orq     $(VEC_SIZE - 1), %rdi
>> >> >         VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
>> >> >            so no need to manually mod rdx.  */
>> >> >         sarxl   %edx, %eax, %eax
>> >> > @@ -470,6 +517,10 @@ L(cross_page_boundary):
>> >> >         jnz     L(cross_page_less_vec)
>> >> >         leaq    1(%rdi), %rcx
>> >> >         subq    %rdx, %rcx
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> > +       shrl    $2, %ecx
>> >> > +#  endif
>> >> >         /* Check length.  */
>> >> >         cmpq    %rsi, %rcx
>> >> >         jb      L(cross_page_continue)
>> >> > @@ -479,6 +530,7 @@ L(cross_page_boundary):
>> >> >         jz      L(cross_page_continue)
>> >> >         tzcntl  %eax, %eax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide length by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  #  endif
>> >> >  # endif
>> >> > @@ -489,6 +541,10 @@ L(return_vzeroupper):
>> >> >         .p2align 4
>> >> >  L(cross_page_less_vec):
>> >> >         tzcntl  %eax, %eax
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Multiply length by 4 to get byte count.  */
>> >> > +       sall    $2, %esi
>> >> > +#  endif
>> >> >         cmpq    %rax, %rsi
>> >> >         cmovb   %esi, %eax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
>> >> > index d223ea1700..3fc6734910 100644
>> >> > --- a/sysdeps/x86_64/strlen.S
>> >> > +++ b/sysdeps/x86_64/strlen.S
>> >> > @@ -65,12 +65,24 @@ ENTRY(strlen)
>> >> >         ret
>> >> >  L(n_nonzero):
>> >> >  # ifdef AS_WCSLEN
>> >> > -       shl     $2, %RSI_LP
>> >> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
>> >> > +   overflow the only way this program doesn't have undefined behavior
>> >> > +   is if there is a null terminator in valid memory so strlen will
>> >> > +   suffice.  */
>> >> > +       mov     %RSI_LP, %R10_LP
>> >> > +       sar     $62, %R10_LP
>> >> > +       test    %R10_LP, %R10_LP
>> >> > +       jnz     __wcslen_sse2
>> >>
>> >> Branch to  __wcslen_sse2 is wrong for 2 reasons:
>> >>
>> >> 1.  __wcslen_sse2 is undefined with --disable-multi-arch.
>> >
>> > Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?
>> >
>> >>
>> >> 2. You should skip ENDBR64 at function entry.
>> >>
>> >> Please create a new label and branch to it.
>> >>
>> > I am not quite sure how to do this. I am trying to use
>> > strstr-sse2-unaligned.S as a template:
>> > https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
>> > which appears to make a direct call to the global label of __strchr_sse2
>> > without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.
>>
>>
>> This is different since all files are in sysdeps/x86_64/multiarch.
>
>
> I see. So it turns out we are missing wcslen_sse4_1 which strlen.S
> can also implement (it passes all tests). Would jumping to that be
> valid?
>
> Otherwise I think the best bet is to add a target for wcslen_sse4_1
> and define it and wcsnlen_sse4_1 in the same file so the label is visible.
> The only issue is that the #defines in strlen.S all need to be protected,
> which is a bit messy. If we don't want to define wcslen_sse4_1 for whatever
> reason, I already have this approach working by defining
> wcsnlen_sse4_1 in the same file as wcslen-sse2.S and entering from
> a local label. But looking at the code it seems the strlen.S version is a bit
> better optimized. Thoughts?
>

I see what is going on.  I was confused by the SSE4 code in strlen.S.
I submitted a patch to move it to multiarch/strlen-vec.S.

Yes, we should add wcslen_sse4_1.  My question is why we need
to branch from __wcsnlen_sse4_1 to __strlen_sse2 on overflow.
Can you make __wcsnlen_sse4_1 handle it properly, directly?
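
One direct approach would be to saturate the byte count instead of
branching away (a rough C model with a hypothetical helper, not the
glibc code):

#include <stddef.h>
#include <stdint.h>

/* Convert a wide-character count to a byte count, saturating at
   SIZE_MAX instead of wrapping.  A caller with defined behavior
   must then hit a terminator before the (unreachable) limit, so
   the bounded scan degenerates to wcslen.  */
static inline size_t
wchars_to_bytes_sat (size_t maxlen)
{
  if (maxlen > SIZE_MAX / sizeof (wchar_t))
    return SIZE_MAX;
  return maxlen * sizeof (wchar_t);
}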

-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
  2021-06-23  3:58           ` H.J. Lu
@ 2021-06-23  4:55             ` Noah Goldstein
  0 siblings, 0 replies; 27+ messages in thread
From: Noah Goldstein @ 2021-06-23  4:55 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jun 22, 2021 at 11:59 PM H.J. Lu <hjl.tools@gmail.com> wrote:

>  On Tue, Jun 22, 2021 at 8:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> >
> >
> >
> > On Tue, Jun 22, 2021 at 7:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
>> On Tue, Jun 22, 2021 at 4:16 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >> >
> >> >
> >> >
> >> > On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >> >>
>> >> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >> >> >
> >> >> > This commit fixes the bug mentioned in the previous commit.
> >> >> >
> >> >> > The previous implementations of wmemchr in these files relied
>> >> > on maxlen * sizeof(wchar_t) which was not guaranteed by the standard.
> >> >> >
> >> >> > The new overflow tests added in the previous commit now
> >> >> > pass (As well as all the other tests).
> >> >> >
> >> >> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> >> >> > ---
> >> >> >  sysdeps/x86_64/multiarch/strlen-avx2.S | 130
> ++++++++++++++++++-------
> >> >> >  sysdeps/x86_64/strlen.S                |  14 ++-
> >> >> >  2 files changed, 106 insertions(+), 38 deletions(-)
> >> >> >
> >> >> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S
> b/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> >> > index bd2e6ee44a..b282a75613 100644
> >> >> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> >> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> >> > @@ -44,21 +44,21 @@
> >> >> >
> >> >> >  # define VEC_SIZE 32
> >> >> >  # define PAGE_SIZE 4096
> >> >> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> >> >> >
> >> >> >         .section SECTION(.text),"ax",@progbits
> >> >> >  ENTRY (STRLEN)
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Check zero length.  */
> >> >> > +#  ifdef __ILP32__
> >> >> > +       /* Clear upper bits.  */
> >> >> > +       and     %RSI_LP, %RSI_LP
> >> >> > +#  else
> >> >> >         test    %RSI_LP, %RSI_LP
> >> >> > +#  endif
> >> >> >         jz      L(zero)
> >> >> >         /* Store max len in R8_LP before adjusting if using
> WCSLEN.  */
> >> >> >         mov     %RSI_LP, %R8_LP
> >> >> > -#  ifdef USE_AS_WCSLEN
> >> >> > -       shl     $2, %RSI_LP
> >> >> > -#  elif defined __ILP32__
> >> >> > -       /* Clear the upper 32 bits.  */
> >> >> > -       movl    %esi, %esi
> >> >> > -#  endif
> >> >> >  # endif
> >> >> >         movl    %edi, %eax
> >> >> >         movq    %rdi, %rdx
> >> >> > @@ -72,10 +72,10 @@ ENTRY (STRLEN)
> >> >> >
> >> >> >         /* Check the first VEC_SIZE bytes.  */
> >> >> >         VPCMPEQ (%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* If length < VEC_SIZE handle special.  */
> >> >> > -       cmpq    $VEC_SIZE, %rsi
> >> >> > +       cmpq    $CHAR_PER_VEC, %rsi
> >> >> >         jbe     L(first_vec_x0)
> >> >> >  # endif
> >> >> >         /* If empty continue to aligned_more. Otherwise return bit
> >> >> > @@ -84,6 +84,7 @@ ENTRY (STRLEN)
> >> >> >         jz      L(aligned_more)
> >> >> >         tzcntl  %eax, %eax
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -97,9 +98,14 @@ L(zero):
> >> >> >  L(first_vec_x0):
> >> >> >         /* Set bit for max len so that tzcnt will return min of
> max len
> >> >> >            and position of first match.  */
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Multiply length by 4 to get byte count.  */
> >> >> > +       sall    $2, %esi
> >> >> > +#  endif
> >> >> >         btsq    %rsi, %rax
> >> >> >         tzcntl  %eax, %eax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -113,14 +119,19 @@ L(first_vec_x1):
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Use ecx which was computed earlier to compute correct
> value.
> >> >> >          */
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       leal    -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
> >> >> > +#  else
> >> >> >         subl    $(VEC_SIZE * 4 + 1), %ecx
> >> >> >         addl    %ecx, %eax
> >> >> > +#  endif
> >> >> >  # else
> >> >> >         subl    %edx, %edi
> >> >> >         incl    %edi
> >> >> >         addl    %edi, %eax
> >> >> >  # endif
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -133,14 +144,19 @@ L(first_vec_x2):
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Use ecx which was computed earlier to compute correct
> value.
> >> >> >          */
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       leal    -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
> >> >> > +#  else
> >> >> >         subl    $(VEC_SIZE * 3 + 1), %ecx
> >> >> >         addl    %ecx, %eax
> >> >> > +#  endif
> >> >> >  # else
> >> >> >         subl    %edx, %edi
> >> >> >         addl    $(VEC_SIZE + 1), %edi
> >> >> >         addl    %edi, %eax
> >> >> >  # endif
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -153,14 +169,19 @@ L(first_vec_x3):
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Use ecx which was computed earlier to compute correct
> value.
> >> >> >          */
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       leal    -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
> >> >> > +#  else
> >> >> >         subl    $(VEC_SIZE * 2 + 1), %ecx
> >> >> >         addl    %ecx, %eax
> >> >> > +#  endif
> >> >> >  # else
> >> >> >         subl    %edx, %edi
> >> >> >         addl    $(VEC_SIZE * 2 + 1), %edi
> >> >> >         addl    %edi, %eax
> >> >> >  # endif
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -173,14 +194,19 @@ L(first_vec_x4):
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Use ecx which was computed earlier to compute correct
> value.
> >> >> >          */
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       leal    -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
> >> >> > +#  else
> >> >> >         subl    $(VEC_SIZE + 1), %ecx
> >> >> >         addl    %ecx, %eax
> >> >> > +#  endif
> >> >> >  # else
> >> >> >         subl    %edx, %edi
> >> >> >         addl    $(VEC_SIZE * 3 + 1), %edi
> >> >> >         addl    %edi, %eax
> >> >> >  # endif
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -195,10 +221,14 @@ L(cross_page_continue):
> >> >> >         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a
> time
> >> >> >            since data is only aligned to VEC_SIZE.  */
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> > -       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> because
> >> >> > -          it simplies the logic in last_4x_vec_or_less.  */
> >> >> > +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> >> >> > +          because it simplies the logic in last_4x_vec_or_less.
> */
> >> >> >         leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> >> >> >         subq    %rdx, %rcx
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> >> >> > +       sarl    $2, %ecx
> >> >> > +#  endif
> >> >> >  # endif
> >> >> >         /* Load first VEC regardless.  */
> >> >> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >> >> > @@ -207,34 +237,38 @@ L(cross_page_continue):
> >> >> >         subq    %rcx, %rsi
> >> >> >         jb      L(last_4x_vec_or_less)
> >> >> >  # endif
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(first_vec_x1)
> >> >> >
> >> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(first_vec_x2)
> >> >> >
> >> >> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(first_vec_x3)
> >> >> >
> >> >> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(first_vec_x4)
> >> >> >
> >> >> >         /* Align data to VEC_SIZE * 4 - 1.  */
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Before adjusting length check if at last VEC_SIZE * 4.
> */
> >> >> > -       cmpq    $(VEC_SIZE * 4 - 1), %rsi
> >> >> > +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
> >> >> >         jbe     L(last_4x_vec_or_less_load)
> >> >> >         incq    %rdi
> >> >> >         movl    %edi, %ecx
> >> >> >         orq     $(VEC_SIZE * 4 - 1), %rdi
> >> >> >         andl    $(VEC_SIZE * 4 - 1), %ecx
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> >> >> > +       sarl    $2, %ecx
> >> >> > +#  endif
> >> >> >         /* Readjust length.  */
> >> >> >         addq    %rcx, %rsi
> >> >> >  # else
> >> >> > @@ -246,13 +280,13 @@ L(cross_page_continue):
> >> >> >  L(loop_4x_vec):
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Break if at end of length.  */
> >> >> > -       subq    $(VEC_SIZE * 4), %rsi
> >> >> > +       subq    $(CHAR_PER_VEC * 4), %rsi
> >> >> >         jb      L(last_4x_vec_or_less_cmpeq)
> >> >> >  # endif
> >> >> > -       /* Save some code size by microfusing VPMINU with the
> load. Since
> >> >> > -          the matches in ymm2/ymm4 can only be returned if there
> where no
> >> >> > -          matches in ymm1/ymm3 respectively there is no issue
> with overlap.
> >> >> > -        */
> >> >> > +       /* Save some code size by microfusing VPMINU with the load.
> >> >> > +          Since the matches in ymm2/ymm4 can only be returned if
> there
> >> >> > +          where no matches in ymm1/ymm3 respectively there is no
> issue
> >> >> > +          with overlap.  */
> >> >> >         vmovdqa 1(%rdi), %ymm1
> >> >> >         VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> >> >> >         vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> >> >> > @@ -260,7 +294,7 @@ L(loop_4x_vec):
> >> >> >
> >> >> >         VPMINU  %ymm2, %ymm4, %ymm5
> >> >> >         VPCMPEQ %ymm5, %ymm0, %ymm5
> >> >> > -       vpmovmskb       %ymm5, %ecx
> >> >> > +       vpmovmskb %ymm5, %ecx
> >> >> >
> >> >> >         subq    $-(VEC_SIZE * 4), %rdi
> >> >> >         testl   %ecx, %ecx
> >> >> > @@ -268,27 +302,28 @@ L(loop_4x_vec):
> >> >> >
> >> >> >
> >> >> >         VPCMPEQ %ymm1, %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         subq    %rdx, %rdi
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(last_vec_return_x0)
> >> >> >
> >> >> >         VPCMPEQ %ymm2, %ymm0, %ymm2
> >> >> > -       vpmovmskb       %ymm2, %eax
> >> >> > +       vpmovmskb %ymm2, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(last_vec_return_x1)
> >> >> >
> >> >> >         /* Combine last 2 VEC.  */
> >> >> >         VPCMPEQ %ymm3, %ymm0, %ymm3
> >> >> > -       vpmovmskb       %ymm3, %eax
> >> >> > -       /* rcx has combined result from all 4 VEC. It will only be
> used if
> >> >> > -          the first 3 other VEC all did not contain a match.  */
> >> >> > +       vpmovmskb %ymm3, %eax
> >> >> > +       /* rcx has combined result from all 4 VEC. It will only be
> used
> >> >> > +          if the first 3 other VEC all did not contain a match.
> */
> >> >> >         salq    $32, %rcx
> >> >> >         orq     %rcx, %rax
> >> >> >         tzcntq  %rax, %rax
> >> >> >         subq    $(VEC_SIZE * 2 - 1), %rdi
> >> >> >         addq    %rdi, %rax
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -297,15 +332,19 @@ L(loop_4x_vec):
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         .p2align 4
> >> >> >  L(last_4x_vec_or_less_load):
> >> >> > -       /* Depending on entry adjust rdi / prepare first VEC in
> ymm1.  */
> >> >> > +       /* Depending on entry adjust rdi / prepare first VEC in
> ymm1.
> >> >> > +        */
> >> >> >         subq    $-(VEC_SIZE * 4), %rdi
> >> >> >  L(last_4x_vec_or_less_cmpeq):
> >> >> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >> >> >  L(last_4x_vec_or_less):
> >> >> > -
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > -       /* If remaining length > VEC_SIZE * 2. This works if esi
> is off by
> >> >> > -          VEC_SIZE * 4.  */
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Multiply length by 4 to get byte count.  */
> >> >> > +       sall    $2, %esi
> >> >> > +#  endif
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> > +       /* If remaining length > VEC_SIZE * 2. This works if esi
> is off
> >> >> > +          by VEC_SIZE * 4.  */
> >> >> >         testl   $(VEC_SIZE * 2), %esi
> >> >> >         jnz     L(last_4x_vec)
> >> >> >
> >> >> > @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
> >> >> >         jb      L(max)
> >> >> >
> >> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         tzcntl  %eax, %eax
> >> >> >         /* Check the end of data.  */
> >> >> >         cmpl    %eax, %esi
> >> >> > @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
> >> >> >         addl    $(VEC_SIZE + 1), %eax
> >> >> >         addq    %rdi, %rax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -340,6 +380,7 @@ L(last_vec_return_x0):
> >> >> >         subq    $(VEC_SIZE * 4 - 1), %rdi
> >> >> >         addq    %rdi, %rax
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -350,6 +391,7 @@ L(last_vec_return_x1):
> >> >> >         subq    $(VEC_SIZE * 3 - 1), %rdi
> >> >> >         addq    %rdi, %rax
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -366,6 +408,7 @@ L(last_vec_x1_check):
> >> >> >         incl    %eax
> >> >> >         addq    %rdi, %rax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -381,14 +424,14 @@ L(last_4x_vec):
> >> >> >         jnz     L(last_vec_x1)
> >> >> >
> >> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(last_vec_x2)
> >> >> >
> >> >> >         /* Normalize length.  */
> >> >> >         andl    $(VEC_SIZE * 4 - 1), %esi
> >> >> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(last_vec_x3)
> >> >> >
> >> >> > @@ -396,7 +439,7 @@ L(last_4x_vec):
> >> >> >         jb      L(max)
> >> >> >
> >> >> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         tzcntl  %eax, %eax
> >> >> >         /* Check the end of data.  */
> >> >> >         cmpl    %eax, %esi
> >> >> > @@ -405,6 +448,7 @@ L(last_4x_vec):
> >> >> >         addl    $(VEC_SIZE * 3 + 1), %eax
> >> >> >         addq    %rdi, %rax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -419,6 +463,7 @@ L(last_vec_x1):
> >> >> >         incl    %eax
> >> >> >         addq    %rdi, %rax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -432,6 +477,7 @@ L(last_vec_x2):
> >> >> >         addl    $(VEC_SIZE + 1), %eax
> >> >> >         addq    %rdi, %rax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -447,6 +493,7 @@ L(last_vec_x3):
> >> >> >         addl    $(VEC_SIZE * 2 + 1), %eax
> >> >> >         addq    %rdi, %rax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -455,13 +502,13 @@ L(max_end):
> >> >> >         VZEROUPPER_RETURN
> >> >> >  # endif
> >> >> >
> >> >> > -       /* Cold case for crossing page with first load.  */
> >> >> > +       /* Cold case for crossing page with first load.  */
> >> >> >         .p2align 4
> >> >> >  L(cross_page_boundary):
> >> >> >         /* Align data to VEC_SIZE - 1.  */
> >> >> >         orq     $(VEC_SIZE - 1), %rdi
> >> >> >         VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         /* Remove the leading bytes. sarxl only uses bits [5:0] of
> COUNT
> >> >> >            so no need to manually mod rdx.  */
> >> >> >         sarxl   %edx, %eax, %eax
> >> >> > @@ -470,6 +517,10 @@ L(cross_page_boundary):
> >> >> >         jnz     L(cross_page_less_vec)
> >> >> >         leaq    1(%rdi), %rcx
> >> >> >         subq    %rdx, %rcx
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> > +       shrl    $2, %ecx
> >> >> > +#  endif
> >> >> >         /* Check length.  */
> >> >> >         cmpq    %rsi, %rcx
> >> >> >         jb      L(cross_page_continue)
> >> >> > @@ -479,6 +530,7 @@ L(cross_page_boundary):
> >> >> >         jz      L(cross_page_continue)
> >> >> >         tzcntl  %eax, %eax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide length by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  #  endif
> >> >> >  # endif
> >> >> > @@ -489,6 +541,10 @@ L(return_vzeroupper):
> >> >> >         .p2align 4
> >> >> >  L(cross_page_less_vec):
> >> >> >         tzcntl  %eax, %eax
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Multiply length by 4 to get byte count.  */
> >> >> > +       sall    $2, %esi
> >> >> > +#  endif
> >> >> >         cmpq    %rax, %rsi
> >> >> >         cmovb   %esi, %eax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> >> >> > index d223ea1700..3fc6734910 100644
> >> >> > --- a/sysdeps/x86_64/strlen.S
> >> >> > +++ b/sysdeps/x86_64/strlen.S
> >> >> > @@ -65,12 +65,24 @@ ENTRY(strlen)
> >> >> >         ret
> >> >> >  L(n_nonzero):
> >> >> >  # ifdef AS_WCSLEN
> >> >> > -       shl     $2, %RSI_LP
> >> >> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> >> >> > +   overflow the only way this program doesn't have undefined
> behavior
> >> >> > +   is if there is a null terminator in valid memory so strlen will
> >> >> > +   suffice.  */
> >> >> > +       mov     %RSI_LP, %R10_LP
> >> >> > +       sar     $62, %R10_LP
> >> >> > +       test    %R10_LP, %R10_LP
> >> >> > +       jnz     __wcslen_sse2
> >> >>
> >> >> Branch to  __wcslen_sse2 is wrong for 2 reasons:
> >> >>
> >> >> 1.  __wcslen_sse2 is undefined with --disable-multi-arch.
> >> >
> >> > Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?
> >> >
> >> >>
> >> >> 2. You should skip ENDBR64 at function entry.
> >> >>
> >> >> Please create a new label and branch to it.
> >> >>
> >> > I am not quite sure how to do this. I am trying to use
> >> > strstr-sse2-unaligned.S as a template:
> >> >
> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
> >> > which appears to make a direct call to the global label of
> __strchr_sse2
> >> > without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.
> >>
> >>
> >> This is different since all files are in sysdeps/x86_64/multiarch.
> >
> >
> > I see. So it turns out we are missing wcslen_sse4_1 which strlen.S
> > can also implement (it passes all tests). Would jumping to that be
> > valid?
> >
> > Otherwise I think the best bet is to add a target  for wcslen_sse4_1
> > and define it and wcsnlen_sse4_1 in the same file so the label is
> visible.
> > The only issue is the #defines in strlen.S need to all be protected which
> > is a bit messy. If we don't want to define wcslen_sse4_1 for whatever
> > reason, I already have this approach working with defining
> > wcsnlen_sse4_1 in the same file as wcslen-sse2.S and entering from
> > a local label. But looking at the code it seems the strlen.S file is a
> bit
> > better optimized. Thoughts?
> >
>
> I see what is going on.  I was confused by the SSE4 code in strlen.S.
> I submitted a patch to move it to multiarch/strlen-vec.S.
>
> Yes, we should add wcslen_sse4_1.  My question is: why do we need
> to branch from __wcsnlen_sse4_1 to __strlen_sse2 on overflow?
> Can you make __wcsnlen_sse4_1 handle it properly directly?

The current approach makes it non-trivial:

# define STRNLEN_PROLOG	\
	mov	%r11, %rsi;	\
	subq	%rax, %rsi;	\
	andq	$-64, %rax;	\
	testq	$-64, %rsi;	\
	je	L(strnlen_ret)

AFAICT this forces the length to be in bytes, and rewriting that
affects the entire file's logic.

I considered porting the avx2 solution, but I don't think it really
fits here: it relies on the results from all 4x vec fitting in a
64-bit register, and on the much-improved branch predictors of the
machines that run avx2.

I also think the overflow case is likely faster going through wcslen,
given that all the length bookkeeping / branches can be dropped,
although it definitely does pessimize the common no-overflow case.
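
In C terms, the guard being discussed is roughly the following (a
minimal sketch, not the glibc code; it assumes a 64-bit size_t and
4-byte wchar_t, which is what the sar $62 check encodes):

#include <stdint.h>
#include <wchar.h>

size_t
wcsnlen_sketch (const wchar_t *s, size_t maxlen)
{
  /* maxlen >> 62 != 0  <=>  maxlen >= 2^62  <=>  maxlen * 4 wraps.
     In that case any well-defined call must have a null terminator
     in valid memory, so plain wcslen suffices.  */
  if (maxlen > SIZE_MAX / sizeof (wchar_t))
    return wcslen (s);

  /* Stand-in for the vectorized byte-based scan over maxlen * 4.  */
  size_t i;
  for (i = 0; i < maxlen && s[i] != L'\0'; ++i)
    ;
  return i;
}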


> --
> H.J.
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974]
  2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
                   ` (5 preceding siblings ...)
  2021-06-22 18:11 ` [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
@ 2021-06-23  6:31 ` Noah Goldstein
  2021-06-23 17:30   ` H.J. Lu
  2021-06-23  6:31 ` [PATCH v3 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
  2021-06-23  6:31 ` [PATCH v3 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
  8 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-23  6:31 UTC (permalink / raw)
  To: libc-alpha

This commit adds tests for a bug in the wide char variant of the
functions where the implementation may assume that maxlen for wcsnlen
or n for wmemchr/strncat will not overflow when multiplied by
sizeof(wchar_t).

These tests show the following implementations failing on x86_64:

wcsnlen-sse4_1
wcsnlen-avx2

wmemchr-sse2
wmemchr-avx2

strncat would fail as well if it were on a system that preferred
either of the wcsnlen implementations that failed, as it relies on
wcsnlen.
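
To make the failure mode concrete, here is a minimal illustration
(assuming a 64-bit size_t and 4-byte wchar_t; not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <wchar.h>

int
main (void)
{
  /* The first value the (one << j) loops below generate.  */
  size_t maxlen = (size_t) 1 << 63;
  size_t bytes = maxlen * sizeof (wchar_t);  /* 2^65 wraps to 0.  */
  printf ("maxlen = %zu, maxlen * sizeof (wchar_t) = %zu\n",
          maxlen, bytes);
  /* An implementation that pre-multiplies scans 0 bytes and returns
     0, while wcsnlen must return min (actual length, maxlen).  */
  return 0;
}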

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
Rebased on: [PATCH v1 1/4] x86-64: Add wcslen optimize for sse4.1
 string/test-memchr.c  | 39 ++++++++++++++++++++++++---
 string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
 string/test-strnlen.c | 33 +++++++++++++++++++++++
 3 files changed, 130 insertions(+), 3 deletions(-)

diff --git a/string/test-memchr.c b/string/test-memchr.c
index 665edc32af..ce964284aa 100644
--- a/string/test-memchr.c
+++ b/string/test-memchr.c
@@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
   CHAR *res = CALL (impl, s, c, n);
   if (res != exp_res)
     {
-      error (0, 0, "Wrong result in function %s %p %p", impl->name,
-	     res, exp_res);
+      error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
+             impl->name, s, c, n, res, exp_res);
       ret = 1;
       return;
     }
@@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
     }
   buf[align + len] = 0;
 
-  if (pos < len)
+  if (pos < MIN(n, len))
     {
       buf[align + pos] = seek_char;
       buf[align + len] = -seek_char;
@@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
     do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  uintptr_t buf_addr = (uintptr_t) buf1;
+
+  for (i = 0; i < 750; ++i)
+    {
+        do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
+        do_test (0, i, 751, i - buf_addr, BIG_CHAR);
+        do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
+        do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
+        do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+      len = 0;
+      for (j = 8 * sizeof(size_t) - 1; j ; --j)
+        {
+          len |= one << j;
+          do_test (0, i, 751, len - i, BIG_CHAR);
+          do_test (0, i, 751, len + i, BIG_CHAR);
+          do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
+
+          do_test (0, i, 751, ~len - i, BIG_CHAR);
+          do_test (0, i, 751, ~len + i, BIG_CHAR);
+          do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
+        }
+    }
+}
+
 static void
 do_random_tests (void)
 {
@@ -221,6 +253,7 @@ test_main (void)
     do_test (page_size / 2 - i, i, i, 1, 0x9B);
 
   do_random_tests ();
+  do_overflow_tests ();
   return ret;
 }
 
diff --git a/string/test-strncat.c b/string/test-strncat.c
index 2ef917b820..37ea26ea05 100644
--- a/string/test-strncat.c
+++ b/string/test-strncat.c
@@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
     }
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  CHAR *s1, *s2;
+  uintptr_t s1_addr;
+  s1 = (CHAR *) buf1;
+  s2 = (CHAR *) buf2;
+  s1_addr = (uintptr_t)s1;
+ for (j = 0; j < 200; ++j)
+      s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
+ s2[200] = 0;
+  for (i = 0; i < 750; ++i) {
+    for (j = 0; j < i; ++j)
+      s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
+    s1[i] = '\0';
+
+       FOR_EACH_IMPL (impl, 0)
+    {
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, SIZE_MAX - i);
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, i - s1_addr);
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, -s1_addr - i);
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
+    }
+
+    len = 0;
+    for (j = 8 * sizeof(size_t) - 1; j ; --j)
+      {
+        len |= one << j;
+        FOR_EACH_IMPL (impl, 0)
+          {
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, len - i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, len + i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, len - s1_addr - i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, len - s1_addr + i);
+
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, ~len - i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, ~len + i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, ~len - s1_addr - i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, ~len - s1_addr + i);
+          }
+      }
+  }
+}
+
 static void
 do_random_tests (void)
 {
@@ -316,6 +376,7 @@ test_main (void)
     }
 
   do_random_tests ();
+  do_overflow_tests ();
   return ret;
 }
 
diff --git a/string/test-strnlen.c b/string/test-strnlen.c
index 920f58e97b..f53e09263f 100644
--- a/string/test-strnlen.c
+++ b/string/test-strnlen.c
@@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
     do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  uintptr_t buf_addr = (uintptr_t) buf1;
+
+  for (i = 0; i < 750; ++i)
+    {
+      do_test (0, i, SIZE_MAX - i, BIG_CHAR);
+      do_test (0, i, i - buf_addr, BIG_CHAR);
+      do_test (0, i, -buf_addr - i, BIG_CHAR);
+      do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
+      do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+      len = 0;
+      for (j = 8 * sizeof(size_t) - 1; j ; --j)
+        {
+          len |= one << j;
+          do_test (0, i, len - i, BIG_CHAR);
+          do_test (0, i, len + i, BIG_CHAR);
+          do_test (0, i, len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, len - buf_addr + i, BIG_CHAR);
+
+          do_test (0, i, ~len - i, BIG_CHAR);
+          do_test (0, i, ~len + i, BIG_CHAR);
+          do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
+        }
+    }
+}
+
 static void
 do_random_tests (void)
 {
@@ -283,6 +315,7 @@ test_main (void)
   do_random_tests ();
   do_page_tests ();
   do_page_2_tests ();
+  do_overflow_tests ();
   return ret;
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ #27974]
  2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
                   ` (6 preceding siblings ...)
  2021-06-23  6:31 ` [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat " Noah Goldstein
@ 2021-06-23  6:31 ` Noah Goldstein
  2021-06-23 17:30   ` H.J. Lu
  2021-06-23  6:31 ` [PATCH v3 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
  8 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-23  6:31 UTC (permalink / raw)
  To: libc-alpha

This commit fixes the bug mentioned in the previous commit.

The previous implementations of wmemchr in these files relied
on n * sizeof(wchar_t) not overflowing, which is not guaranteed
by the standard.

The new overflow tests added in the previous commit now
pass (as well as all the other tests).
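
In C terms, the invariant the fix restores is roughly the following
(an illustrative sketch, not the assembly below): n is kept in
wchar_t units for all bookkeeping, so n * sizeof(wchar_t) is never
formed; the assembly instead converts byte offsets back to character
counts (the CHAR_PER_VEC / shr $2 changes) only at the points where
the two units meet.

#include <stddef.h>
#include <wchar.h>

wchar_t *
wmemchr_sketch (const wchar_t *s, wchar_t c, size_t n)
{
  /* n stays in characters throughout; no byte count is computed,
     so nothing can overflow even for n near SIZE_MAX.  */
  for (size_t i = 0; i < n; ++i)
    if (s[i] == c)
      return (wchar_t *) &s[i];
  return NULL;
}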

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/memchr.S                | 77 +++++++++++++++++++-------
 sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
 2 files changed, 98 insertions(+), 37 deletions(-)

diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index beff2708de..3ddc4655cf 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
 #ifdef USE_AS_WMEMCHR
 # define MEMCHR		wmemchr
 # define PCMPEQ		pcmpeqd
+# define CHAR_PER_VEC	4
 #else
 # define MEMCHR		memchr
 # define PCMPEQ		pcmpeqb
+# define CHAR_PER_VEC	16
 #endif
 
 /* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
 	movd	%esi, %xmm1
 	mov	%edi, %ecx
 
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#endif
 #ifdef USE_AS_WMEMCHR
 	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
-	shl	$2, %RDX_LP
 #else
-# ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%edx, %edx
-# endif
 	punpcklbw %xmm1, %xmm1
 	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
 	test	%eax, %eax
 
 	jnz	L(matches_1)
-	sub	$16, %rdx
+	sub	$CHAR_PER_VEC, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
 	and	$15, %ecx
 	and	$-16, %rdi
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	add	%rcx, %rdx
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	jmp	L(loop_prolog)
 
@@ -77,16 +81,21 @@ L(crosscache):
 	movdqa	(%rdi), %xmm0
 
 	PCMPEQ	%xmm1, %xmm0
-/* Check if there is a match.  */
+	/* Check if there is a match.  */
 	pmovmskb %xmm0, %eax
-/* Remove the leading bytes.  */
+	/* Remove the leading bytes.  */
 	sar	%cl, %eax
 	test	%eax, %eax
 	je	L(unaligned_no_match)
-/* Check which byte is a match.  */
+	/* Check which byte is a match.  */
 	bsf	%eax, %eax
-
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	add	%rdi, %rax
 	add	%rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):
 
 	.p2align 4
 L(unaligned_no_match):
-        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
 	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
 	   possible addition overflow.  */
 	neg	%rcx
 	add	$16, %rcx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	sub	%rcx, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 
 	.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
 	test	$0x3f, %rdi
 	jz	L(align64_loop)
 
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 
 	movdqa	(%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
 	mov	%rdi, %rcx
 	and	$-64, %rdi
 	and	$63, %ecx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	add	%rcx, %rdx
 
 	.p2align 4
 L(align64_loop):
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	movdqa	(%rdi), %xmm0
 	movdqa	16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):
 
 	.p2align 4
 L(exit_loop):
-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
 	jle	L(exit_loop_32)
 
 	movdqa	(%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32_1)
-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
 	jle	L(return_null)
 
 	PCMPEQ	48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):
 
 	.p2align 4
 L(exit_loop_32):
-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
 	movdqa	(%rdi), %xmm0
 	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches_1)
-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
 	jbe	L(return_null)
 
 	PCMPEQ	16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
 	.p2align 4
 L(matches_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	add	%rdi, %rax
 	ret
@@ -301,7 +322,13 @@ L(matches_1):
 	.p2align 4
 L(matches16_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	16(%rdi, %rax), %rax
 	ret
@@ -309,7 +336,13 @@ L(matches16_1):
 	.p2align 4
 L(matches32_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	32(%rdi, %rax), %rax
 	ret
@@ -317,7 +350,13 @@ L(matches32_1):
 	.p2align 4
 L(matches48_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	48(%rdi, %rax), %rax
 	ret
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 0d8758e3e7..afdb956502 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -54,21 +54,19 @@
 
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
-	test	%RDX_LP, %RDX_LP
-	jz	L(null)
-# endif
-# ifdef USE_AS_WMEMCHR
-	shl	$2, %RDX_LP
-# else
 #  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%edx, %edx
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+#  else
+	test	%RDX_LP, %RDX_LP
 #  endif
+	jz	L(null)
 # endif
 	/* Broadcast CHAR to YMMMATCH.  */
 	vmovd	%esi, %xmm0
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
 	vpmovmskb %ymm1, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* If length < CHAR_PER_VEC handle special.  */
-	cmpq	$VEC_SIZE, %rdx
+	cmpq	$CHAR_PER_VEC, %rdx
 	jbe	L(first_vec_x0)
 # endif
 	testl	%eax, %eax
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
 L(first_vec_x0):
 	/* Check if first match was before length.  */
 	tzcntl	%eax, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
 	xorl	%ecx, %ecx
 	cmpl	%eax, %edx
 	leaq	(%rdi, %rax), %rax
@@ -110,12 +112,12 @@ L(null):
 # endif
 	.p2align 4
 L(cross_page_boundary):
-	/* Save pointer before aligning as its original value is necessary
-	   for computer return address if byte is found or adjusting length
-	   if it is not and this is memchr.  */
+	/* Save pointer before aligning as its original value is
+	   necessary for computer return address if byte is found or
+	   adjusting length if it is not and this is memchr.  */
 	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
-	   rdi for rawmemchr.  */
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+	   and rdi for rawmemchr.  */
 	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
 	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
@@ -124,6 +126,10 @@ L(cross_page_boundary):
 	   match).  */
 	leaq	1(%ALGN_PTR_REG), %rsi
 	subq	%RRAW_PTR_REG, %rsi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %esi
+#  endif
 # endif
 	/* Remove the leading bytes.  */
 	sarxl	%ERAW_PTR_REG, %eax, %eax
@@ -181,6 +187,10 @@ L(cross_page_continue):
 	orq	$(VEC_SIZE - 1), %rdi
 	/* esi is for adjusting length to see if near the end.  */
 	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %esi
+#  endif
 # else
 	orq	$(VEC_SIZE - 1), %rdi
 L(cross_page_continue):
@@ -213,7 +223,7 @@ L(cross_page_continue):
 
 # ifndef USE_AS_RAWMEMCHR
 	/* Check if at last VEC_SIZE * 4 length.  */
-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(last_4x_vec_or_less_cmpeq)
 	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
 	   length.  */
@@ -221,6 +231,10 @@ L(cross_page_continue):
 	movl	%edi, %ecx
 	orq	$(VEC_SIZE * 4 - 1), %rdi
 	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
 	addq	%rcx, %rdx
 # else
 	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
@@ -250,15 +264,19 @@ L(loop_4x_vec):
 
 	subq	$-(VEC_SIZE * 4), %rdi
 
-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$(CHAR_PER_VEC * 4), %rdx
 	ja	L(loop_4x_vec)
 
-	/* Fall through into less than 4 remaining vectors of length case.
-	 */
+	/* Fall through into less than 4 remaining vectors of length
+	   case.  */
 	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	.p2align 4
 L(last_4x_vec_or_less):
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
 	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
 	jnz	L(first_vec_x1_check)
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
 L(last_4x_vec_or_less_cmpeq):
 	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
 	subq	$-(VEC_SIZE * 4), %rdi
 	/* Check first VEC regardless.  */
 	testl	%eax, %eax
-- 
2.25.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
  2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
                   ` (7 preceding siblings ...)
  2021-06-23  6:31 ` [PATCH v3 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
@ 2021-06-23  6:31 ` Noah Goldstein
  2021-06-23 17:27   ` H.J. Lu
  8 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-23  6:31 UTC (permalink / raw)
  To: libc-alpha

This commit fixes the bug mentioned in the previous commit.

The previous implementations of wcsnlen in these files relied
on maxlen * sizeof(wchar_t) not overflowing, which is not
guaranteed by the standard.

The new overflow tests added in the previous commit now
pass (as well as all the other tests).
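
In C terms, the two fallbacks the strlen-vec.S hunk adds are roughly
the following (an illustrative sketch assuming a 64-bit size_t and
4-byte wchar_t; the uintptr_t test approximates the carry check done
by the jbe after the add):

#include <stdint.h>
#include <wchar.h>

size_t
wcsnlen_guards_sketch (const wchar_t *s, size_t maxlen)
{
  /* First guard (sar $62): maxlen * sizeof (wchar_t) must not
     overflow a size_t.  */
  if (maxlen > SIZE_MAX / sizeof (wchar_t))
    return wcslen (s);

  /* Second guard (jbe after the add): even when the byte count
     fits, s + maxlen * sizeof (wchar_t) can still wrap past the
     top of the address space.  Either way a well-defined call must
     terminate before maxlen, so wcslen gives the same answer.  */
  uintptr_t start = (uintptr_t) s;
  if (start + maxlen * sizeof (wchar_t) < start)
    return wcslen (s);

  /* Stand-in for the normal vectorized scan.  */
  return wcsnlen (s, maxlen);
}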

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
 sysdeps/x86_64/multiarch/strlen-vec.S  |  15 ++-
 2 files changed, 107 insertions(+), 38 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index bd2e6ee44a..b282a75613 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -44,21 +44,21 @@
 
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
 	/* Check zero length.  */
+#  ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RSI_LP, %RSI_LP
+#  else
 	test	%RSI_LP, %RSI_LP
+#  endif
 	jz	L(zero)
 	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
 	mov	%RSI_LP, %R8_LP
-#  ifdef USE_AS_WCSLEN
-	shl	$2, %RSI_LP
-#  elif defined __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-#  endif
 # endif
 	movl	%edi, %eax
 	movq	%rdi, %rdx
@@ -72,10 +72,10 @@ ENTRY (STRLEN)
 
 	/* Check the first VEC_SIZE bytes.  */
 	VPCMPEQ	(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 # ifdef USE_AS_STRNLEN
 	/* If length < VEC_SIZE handle special.  */
-	cmpq	$VEC_SIZE, %rsi
+	cmpq	$CHAR_PER_VEC, %rsi
 	jbe	L(first_vec_x0)
 # endif
 	/* If empty continue to aligned_more. Otherwise return bit
@@ -84,6 +84,7 @@ ENTRY (STRLEN)
 	jz	L(aligned_more)
 	tzcntl	%eax, %eax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -97,9 +98,14 @@ L(zero):
 L(first_vec_x0):
 	/* Set bit for max len so that tzcnt will return min of max len
 	   and position of first match.  */
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
 	btsq	%rsi, %rax
 	tzcntl	%eax, %eax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 #  endif
 	VZEROUPPER_RETURN
@@ -113,14 +119,19 @@ L(first_vec_x1):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE * 4 + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	incl	%edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -133,14 +144,19 @@ L(first_vec_x2):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE * 3 + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -153,14 +169,19 @@ L(first_vec_x3):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE * 2 + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE * 2 + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -173,14 +194,19 @@ L(first_vec_x4):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE * 3 + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -195,10 +221,14 @@ L(cross_page_continue):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
 # ifdef USE_AS_STRNLEN
-	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
-	   it simplies the logic in last_4x_vec_or_less.  */
+	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
+	   because it simplies the logic in last_4x_vec_or_less.  */
 	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
 	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
 # endif
 	/* Load first VEC regardless.  */
 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
@@ -207,34 +237,38 @@ L(cross_page_continue):
 	subq	%rcx, %rsi
 	jb	L(last_4x_vec_or_less)
 # endif
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x4)
 
 	/* Align data to VEC_SIZE * 4 - 1.  */
 # ifdef USE_AS_STRNLEN
 	/* Before adjusting length check if at last VEC_SIZE * 4.  */
-	cmpq	$(VEC_SIZE * 4 - 1), %rsi
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
 	jbe	L(last_4x_vec_or_less_load)
 	incq	%rdi
 	movl	%edi, %ecx
 	orq	$(VEC_SIZE * 4 - 1), %rdi
 	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
 	/* Readjust length.  */
 	addq	%rcx, %rsi
 # else
@@ -246,13 +280,13 @@ L(cross_page_continue):
 L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 	/* Break if at end of length.  */
-	subq	$(VEC_SIZE * 4), %rsi
+	subq	$(CHAR_PER_VEC * 4), %rsi
 	jb	L(last_4x_vec_or_less_cmpeq)
 # endif
-	/* Save some code size by microfusing VPMINU with the load. Since
-	   the matches in ymm2/ymm4 can only be returned if there where no
-	   matches in ymm1/ymm3 respectively there is no issue with overlap.
-	 */
+	/* Save some code size by microfusing VPMINU with the load.
+	   Since the matches in ymm2/ymm4 can only be returned if there
+	   where no matches in ymm1/ymm3 respectively there is no issue
+	   with overlap.  */
 	vmovdqa	1(%rdi), %ymm1
 	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
 	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
@@ -260,7 +294,7 @@ L(loop_4x_vec):
 
 	VPMINU	%ymm2, %ymm4, %ymm5
 	VPCMPEQ	%ymm5, %ymm0, %ymm5
-	vpmovmskb	%ymm5, %ecx
+	vpmovmskb %ymm5, %ecx
 
 	subq	$-(VEC_SIZE * 4), %rdi
 	testl	%ecx, %ecx
@@ -268,27 +302,28 @@ L(loop_4x_vec):
 
 
 	VPCMPEQ	%ymm1, %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	subq	%rdx, %rdi
 	testl	%eax, %eax
 	jnz	L(last_vec_return_x0)
 
 	VPCMPEQ	%ymm2, %ymm0, %ymm2
-	vpmovmskb	%ymm2, %eax
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_return_x1)
 
 	/* Combine last 2 VEC.  */
 	VPCMPEQ	%ymm3, %ymm0, %ymm3
-	vpmovmskb	%ymm3, %eax
-	/* rcx has combined result from all 4 VEC. It will only be used if
-	   the first 3 other VEC all did not contain a match.  */
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
 	salq	$32, %rcx
 	orq	%rcx, %rax
 	tzcntq	%rax, %rax
 	subq	$(VEC_SIZE * 2 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -297,15 +332,19 @@ L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 	.p2align 4
 L(last_4x_vec_or_less_load):
-	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
+	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
+	 */
 	subq	$-(VEC_SIZE * 4), %rdi
 L(last_4x_vec_or_less_cmpeq):
 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 L(last_4x_vec_or_less):
-
-	vpmovmskb	%ymm1, %eax
-	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
-	   VEC_SIZE * 4.  */
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
+	vpmovmskb %ymm1, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off
+	   by VEC_SIZE * 4.  */
 	testl	$(VEC_SIZE * 2), %esi
 	jnz	L(last_4x_vec)
 
@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
 	jb	L(max)
 
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 	cmpl	%eax, %esi
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
 	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
 	subq	$(VEC_SIZE * 4 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
 	subq	$(VEC_SIZE * 3 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
 	incl	%eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -381,14 +424,14 @@ L(last_4x_vec):
 	jnz	L(last_vec_x1)
 
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x2)
 
 	/* Normalize length.  */
 	andl	$(VEC_SIZE * 4 - 1), %esi
 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x3)
 
@@ -396,7 +439,7 @@ L(last_4x_vec):
 	jb	L(max)
 
 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 	cmpl	%eax, %esi
@@ -405,6 +448,7 @@ L(last_4x_vec):
 	addl	$(VEC_SIZE * 3 + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -419,6 +463,7 @@ L(last_vec_x1):
 	incl	%eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -432,6 +477,7 @@ L(last_vec_x2):
 	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -447,6 +493,7 @@ L(last_vec_x3):
 	addl	$(VEC_SIZE * 2 + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -455,13 +502,13 @@ L(max_end):
 	VZEROUPPER_RETURN
 # endif
 
-	/* Cold case for crossing page with first load.	 */
+	/* Cold case for crossing page with first load.  */
 	.p2align 4
 L(cross_page_boundary):
 	/* Align data to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
 	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
 	   so no need to manually mod rdx.  */
 	sarxl	%edx, %eax, %eax
@@ -470,6 +517,10 @@ L(cross_page_boundary):
 	jnz	L(cross_page_less_vec)
 	leaq	1(%rdi), %rcx
 	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %ecx
+#  endif
 	/* Check length.  */
 	cmpq	%rsi, %rcx
 	jb	L(cross_page_continue)
@@ -479,6 +530,7 @@ L(cross_page_boundary):
 	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide length by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 #  endif
 # endif
@@ -489,6 +541,10 @@ L(return_vzeroupper):
 	.p2align 4
 L(cross_page_less_vec):
 	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
 	cmpq	%rax, %rsi
 	cmovb	%esi, %eax
 #  ifdef USE_AS_WCSLEN
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
index 8f660bb9c7..439e486a43 100644
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -65,12 +65,25 @@ ENTRY(strlen)
 	ret
 L(n_nonzero):
 # ifdef AS_WCSLEN
-	shl	$2, %RSI_LP
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+   overflow the only way this program doesn't have undefined behavior 
+   is if there is a null terminator in valid memory so wcslen will 
+   suffice.  */
+	mov	%RSI_LP, %R10_LP
+	sar	$62, %R10_LP
+	test	%R10_LP, %R10_LP
+	jnz	__wcslen_sse4_1
+	sal	$2, %RSI_LP
 # endif
 
+
 /* Initialize long lived registers.  */
 
 	add	%RDI_LP, %RSI_LP
+# ifdef AS_WCSLEN
+/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
+	jbe	__wcslen_sse4_1
+# endif
 	mov	%RSI_LP, %R10_LP
 	and	$-64, %R10_LP
 	mov	%RSI_LP, %R11_LP
-- 
2.25.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v3 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
  2021-06-23  6:31 ` [PATCH v3 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
@ 2021-06-23 17:27   ` H.J. Lu
  0 siblings, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2021-06-23 17:27 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jun 22, 2021 at 11:32 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit fixes the bug mentioned in the previous commit.
>
> The previous implementations of wcsnlen in these files relied
> on maxlen * sizeof(wchar_t) not overflowing, which is not
> guaranteed by the standard.
>
> The new overflow tests added in the previous commit now
> pass (as well as all the other tests).
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
>  sysdeps/x86_64/multiarch/strlen-vec.S  |  15 ++-
>  2 files changed, 107 insertions(+), 38 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> index bd2e6ee44a..b282a75613 100644
> --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> @@ -44,21 +44,21 @@
>
>  # define VEC_SIZE 32
>  # define PAGE_SIZE 4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
>         .section SECTION(.text),"ax",@progbits
>  ENTRY (STRLEN)
>  # ifdef USE_AS_STRNLEN
>         /* Check zero length.  */
> +#  ifdef __ILP32__
> +       /* Clear upper bits.  */
> +       and     %RSI_LP, %RSI_LP
> +#  else
>         test    %RSI_LP, %RSI_LP
> +#  endif
>         jz      L(zero)
>         /* Store max len in R8_LP before adjusting if using WCSLEN.  */
>         mov     %RSI_LP, %R8_LP
> -#  ifdef USE_AS_WCSLEN
> -       shl     $2, %RSI_LP
> -#  elif defined __ILP32__
> -       /* Clear the upper 32 bits.  */
> -       movl    %esi, %esi
> -#  endif
>  # endif
>         movl    %edi, %eax
>         movq    %rdi, %rdx
> @@ -72,10 +72,10 @@ ENTRY (STRLEN)
>
>         /* Check the first VEC_SIZE bytes.  */
>         VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>  # ifdef USE_AS_STRNLEN
>         /* If length < VEC_SIZE handle special.  */
> -       cmpq    $VEC_SIZE, %rsi
> +       cmpq    $CHAR_PER_VEC, %rsi
>         jbe     L(first_vec_x0)
>  # endif
>         /* If empty continue to aligned_more. Otherwise return bit
> @@ -84,6 +84,7 @@ ENTRY (STRLEN)
>         jz      L(aligned_more)
>         tzcntl  %eax, %eax
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -97,9 +98,14 @@ L(zero):
>  L(first_vec_x0):
>         /* Set bit for max len so that tzcnt will return min of max len
>            and position of first match.  */
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %esi
> +#  endif
>         btsq    %rsi, %rax
>         tzcntl  %eax, %eax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -113,14 +119,19 @@ L(first_vec_x1):
>  # ifdef USE_AS_STRNLEN
>         /* Use ecx which was computed earlier to compute correct value.
>          */
> +#  ifdef USE_AS_WCSLEN
> +       leal    -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
> +#  else
>         subl    $(VEC_SIZE * 4 + 1), %ecx
>         addl    %ecx, %eax
> +#  endif
>  # else
>         subl    %edx, %edi
>         incl    %edi
>         addl    %edi, %eax
>  # endif
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -133,14 +144,19 @@ L(first_vec_x2):
>  # ifdef USE_AS_STRNLEN
>         /* Use ecx which was computed earlier to compute correct value.
>          */
> +#  ifdef USE_AS_WCSLEN
> +       leal    -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
> +#  else
>         subl    $(VEC_SIZE * 3 + 1), %ecx
>         addl    %ecx, %eax
> +#  endif
>  # else
>         subl    %edx, %edi
>         addl    $(VEC_SIZE + 1), %edi
>         addl    %edi, %eax
>  # endif
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -153,14 +169,19 @@ L(first_vec_x3):
>  # ifdef USE_AS_STRNLEN
>         /* Use ecx which was computed earlier to compute correct value.
>          */
> +#  ifdef USE_AS_WCSLEN
> +       leal    -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
> +#  else
>         subl    $(VEC_SIZE * 2 + 1), %ecx
>         addl    %ecx, %eax
> +#  endif
>  # else
>         subl    %edx, %edi
>         addl    $(VEC_SIZE * 2 + 1), %edi
>         addl    %edi, %eax
>  # endif
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -173,14 +194,19 @@ L(first_vec_x4):
>  # ifdef USE_AS_STRNLEN
>         /* Use ecx which was computed earlier to compute correct value.
>          */
> +#  ifdef USE_AS_WCSLEN
> +       leal    -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
> +#  else
>         subl    $(VEC_SIZE + 1), %ecx
>         addl    %ecx, %eax
> +#  endif
>  # else
>         subl    %edx, %edi
>         addl    $(VEC_SIZE * 3 + 1), %edi
>         addl    %edi, %eax
>  # endif
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -195,10 +221,14 @@ L(cross_page_continue):
>         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>            since data is only aligned to VEC_SIZE.  */
>  # ifdef USE_AS_STRNLEN
> -       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> -          it simplifies the logic in last_4x_vec_or_less.  */
> +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> +          because it simplifies the logic in last_4x_vec_or_less.  */
>         leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
>         subq    %rdx, %rcx
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> +       sarl    $2, %ecx
> +#  endif
>  # endif
>         /* Load first VEC regardless.  */
>         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> @@ -207,34 +237,38 @@ L(cross_page_continue):
>         subq    %rcx, %rsi
>         jb      L(last_4x_vec_or_less)
>  # endif
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x1)
>
>         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x2)
>
>         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x3)
>
>         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x4)
>
>         /* Align data to VEC_SIZE * 4 - 1.  */
>  # ifdef USE_AS_STRNLEN
>         /* Before adjusting length check if at last VEC_SIZE * 4.  */
> -       cmpq    $(VEC_SIZE * 4 - 1), %rsi
> +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
>         jbe     L(last_4x_vec_or_less_load)
>         incq    %rdi
>         movl    %edi, %ecx
>         orq     $(VEC_SIZE * 4 - 1), %rdi
>         andl    $(VEC_SIZE * 4 - 1), %ecx
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> +       sarl    $2, %ecx
> +#  endif
>         /* Readjust length.  */
>         addq    %rcx, %rsi
>  # else
> @@ -246,13 +280,13 @@ L(cross_page_continue):
>  L(loop_4x_vec):
>  # ifdef USE_AS_STRNLEN
>         /* Break if at end of length.  */
> -       subq    $(VEC_SIZE * 4), %rsi
> +       subq    $(CHAR_PER_VEC * 4), %rsi
>         jb      L(last_4x_vec_or_less_cmpeq)
>  # endif
> -       /* Save some code size by microfusing VPMINU with the load. Since
> -          the matches in ymm2/ymm4 can only be returned if there were no
> -          matches in ymm1/ymm3 respectively there is no issue with overlap.
> -        */
> +       /* Save some code size by microfusing VPMINU with the load.
> +          Since the matches in ymm2/ymm4 can only be returned if there
> +          were no matches in ymm1/ymm3 respectively there is no issue
> +          with overlap.  */
>         vmovdqa 1(%rdi), %ymm1
>         VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
>         vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> @@ -260,7 +294,7 @@ L(loop_4x_vec):
>
>         VPMINU  %ymm2, %ymm4, %ymm5
>         VPCMPEQ %ymm5, %ymm0, %ymm5
> -       vpmovmskb       %ymm5, %ecx
> +       vpmovmskb %ymm5, %ecx
>
>         subq    $-(VEC_SIZE * 4), %rdi
>         testl   %ecx, %ecx
> @@ -268,27 +302,28 @@ L(loop_4x_vec):
>
>
>         VPCMPEQ %ymm1, %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         subq    %rdx, %rdi
>         testl   %eax, %eax
>         jnz     L(last_vec_return_x0)
>
>         VPCMPEQ %ymm2, %ymm0, %ymm2
> -       vpmovmskb       %ymm2, %eax
> +       vpmovmskb %ymm2, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_return_x1)
>
>         /* Combine last 2 VEC.  */
>         VPCMPEQ %ymm3, %ymm0, %ymm3
> -       vpmovmskb       %ymm3, %eax
> -       /* rcx has combined result from all 4 VEC. It will only be used if
> -          the first 3 other VEC all did not contain a match.  */
> +       vpmovmskb %ymm3, %eax
> +       /* rcx has combined result from all 4 VEC. It will only be used
> +          if the first 3 other VEC all did not contain a match.  */
>         salq    $32, %rcx
>         orq     %rcx, %rax
>         tzcntq  %rax, %rax
>         subq    $(VEC_SIZE * 2 - 1), %rdi
>         addq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  # endif
>         VZEROUPPER_RETURN
> @@ -297,15 +332,19 @@ L(loop_4x_vec):
>  # ifdef USE_AS_STRNLEN
>         .p2align 4
>  L(last_4x_vec_or_less_load):
> -       /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> +       /* Depending on entry adjust rdi / prepare first VEC in ymm1.
> +        */
>         subq    $-(VEC_SIZE * 4), %rdi
>  L(last_4x_vec_or_less_cmpeq):
>         VPCMPEQ 1(%rdi), %ymm0, %ymm1
>  L(last_4x_vec_or_less):
> -
> -       vpmovmskb       %ymm1, %eax
> -       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> -          VEC_SIZE * 4.  */
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %esi
> +#  endif
> +       vpmovmskb %ymm1, %eax
> +       /* If remaining length > VEC_SIZE * 2. This works if esi is off
> +          by VEC_SIZE * 4.  */
>         testl   $(VEC_SIZE * 2), %esi
>         jnz     L(last_4x_vec)
>
> @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
>         jb      L(max)
>
>         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         tzcntl  %eax, %eax
>         /* Check the end of data.  */
>         cmpl    %eax, %esi
> @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
>         addl    $(VEC_SIZE + 1), %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -340,6 +380,7 @@ L(last_vec_return_x0):
>         subq    $(VEC_SIZE * 4 - 1), %rdi
>         addq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  # endif
>         VZEROUPPER_RETURN
> @@ -350,6 +391,7 @@ L(last_vec_return_x1):
>         subq    $(VEC_SIZE * 3 - 1), %rdi
>         addq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  # endif
>         VZEROUPPER_RETURN
> @@ -366,6 +408,7 @@ L(last_vec_x1_check):
>         incl    %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -381,14 +424,14 @@ L(last_4x_vec):
>         jnz     L(last_vec_x1)
>
>         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x2)
>
>         /* Normalize length.  */
>         andl    $(VEC_SIZE * 4 - 1), %esi
>         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x3)
>
> @@ -396,7 +439,7 @@ L(last_4x_vec):
>         jb      L(max)
>
>         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         tzcntl  %eax, %eax
>         /* Check the end of data.  */
>         cmpl    %eax, %esi
> @@ -405,6 +448,7 @@ L(last_4x_vec):
>         addl    $(VEC_SIZE * 3 + 1), %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -419,6 +463,7 @@ L(last_vec_x1):
>         incl    %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -432,6 +477,7 @@ L(last_vec_x2):
>         addl    $(VEC_SIZE + 1), %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -447,6 +493,7 @@ L(last_vec_x3):
>         addl    $(VEC_SIZE * 2 + 1), %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -455,13 +502,13 @@ L(max_end):
>         VZEROUPPER_RETURN
>  # endif
>
> -       /* Cold case for crossing page with first load.  */
> +       /* Cold case for crossing page with first load.  */
>         .p2align 4
>  L(cross_page_boundary):
>         /* Align data to VEC_SIZE - 1.  */
>         orq     $(VEC_SIZE - 1), %rdi
>         VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
>            so no need to manually mod rdx.  */
>         sarxl   %edx, %eax, %eax
> @@ -470,6 +517,10 @@ L(cross_page_boundary):
>         jnz     L(cross_page_less_vec)
>         leaq    1(%rdi), %rcx
>         subq    %rdx, %rcx
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> +       shrl    $2, %ecx
> +#  endif
>         /* Check length.  */
>         cmpq    %rsi, %rcx
>         jb      L(cross_page_continue)
> @@ -479,6 +530,7 @@ L(cross_page_boundary):
>         jz      L(cross_page_continue)
>         tzcntl  %eax, %eax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide length by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  #  endif
>  # endif
> @@ -489,6 +541,10 @@ L(return_vzeroupper):
>         .p2align 4
>  L(cross_page_less_vec):
>         tzcntl  %eax, %eax
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %esi
> +#  endif
>         cmpq    %rax, %rsi
>         cmovb   %esi, %eax
>  #  ifdef USE_AS_WCSLEN
> diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
> index 8f660bb9c7..439e486a43 100644
> --- a/sysdeps/x86_64/multiarch/strlen-vec.S
> +++ b/sysdeps/x86_64/multiarch/strlen-vec.S
> @@ -65,12 +65,25 @@ ENTRY(strlen)
>         ret
>  L(n_nonzero):
>  # ifdef AS_WCSLEN
> -       shl     $2, %RSI_LP
> +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> +   overflow the only way this program doesn't have undefined behavior
> +   is if there is a null terminator in valid memory so wcslen will
> +   suffice.  */
> +       mov     %RSI_LP, %R10_LP
> +       sar     $62, %R10_LP
> +       test    %R10_LP, %R10_LP
> +       jnz     __wcslen_sse4_1
> +       sal     $2, %RSI_LP
>  # endif
>
> +
>  /* Initialize long lived registers.  */
>
>         add     %RDI_LP, %RSI_LP
> +# ifdef AS_WCSLEN
> +/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
> +       jbe     __wcslen_sse4_1
> +# endif
>         mov     %RSI_LP, %R10_LP
>         and     $-64, %R10_LP
>         mov     %RSI_LP, %R11_LP
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread
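
Two details of the strlen-avx2.S change reviewed above are worth
spelling out.  First, the length counter (%rsi) now stays in wchar_t
units, consuming CHAR_PER_VEC = VEC_SIZE / CHAR_SIZE characters per
vector, so maxlen is never multiplied by 4; byte offsets produced by
vpmovmskb/tzcnt are divided by 4 only where a character count is
actually needed.  Second, btsq %rsi, %rax sets bit maxlen in the match
mask so a single tzcnt yields min(maxlen, position of first match).
A rough scalar model of the unit discipline follows; the names are
illustrative and it makes no attempt to mirror the real control flow.

#include <stddef.h>
#include <wchar.h>

enum { VEC_SIZE = 32, CHAR_PER_VEC = VEC_SIZE / sizeof (wchar_t) };

size_t
wcsnlen_units_model (const wchar_t *s, size_t maxlen)
{
  const wchar_t *p = s;           /* pointer arithmetic: bytes */
  size_t remaining = maxlen;      /* length bookkeeping: wchar_t units */

  while (remaining >= CHAR_PER_VEC)
    {
      /* One vector iteration inspects CHAR_PER_VEC characters.  */
      for (size_t i = 0; i < CHAR_PER_VEC; i++)
        if (p[i] == L'\0')
          return (size_t) (p - s) + i; /* asm: byte offset, then shrq $2 */
      p += CHAR_PER_VEC;               /* advance one vector, in bytes */
      remaining -= CHAR_PER_VEC;       /* asm: subq in character units */
    }
  /* Short tail, still counted in characters.  */
  for (size_t i = 0; i < remaining; i++)
    if (p[i] == L'\0')
      return (size_t) (p - s) + i;
  return maxlen;
}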

* Re: [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974]
  2021-06-23  6:31 ` [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat " Noah Goldstein
@ 2021-06-23 17:30   ` H.J. Lu
  2021-06-23 18:30     ` Noah Goldstein
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2021-06-23 17:30 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jun 22, 2021 at 11:32 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit adds tests for a bug in the wide char variant of the
> functions where the implementation may assume that maxlen for wcsnlen
> or n for wmemchr/strncat will not overflow when multiplied by
> sizeof(wchar_t).
>
> These tests show the following implementations failing on x86_64:
>
> wcsnlen-sse4_1
> wcsnlen-avx2
>
> wmemchr-sse2
> wmemchr-avx2
>
> strncat would fail as well if it were run on a system that preferred
> either of the failing wcsnlen implementations, as it relies on
> wcsnlen.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> Rebased on: [PATCH v1 1/4] x86-64: Add wcslen optimize for sse4.1
>  string/test-memchr.c  | 39 ++++++++++++++++++++++++---
>  string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
>  string/test-strnlen.c | 33 +++++++++++++++++++++++
>  3 files changed, 130 insertions(+), 3 deletions(-)
>
> diff --git a/string/test-memchr.c b/string/test-memchr.c
> index 665edc32af..ce964284aa 100644
> --- a/string/test-memchr.c
> +++ b/string/test-memchr.c
> @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
>    CHAR *res = CALL (impl, s, c, n);
>    if (res != exp_res)
>      {
> -      error (0, 0, "Wrong result in function %s %p %p", impl->name,
> -            res, exp_res);
> +      error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
> +             impl->name, s, c, n, res, exp_res);
>        ret = 1;
>        return;
>      }
> @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
>      }
>    buf[align + len] = 0;
>
> -  if (pos < len)
> +  if (pos < MIN(n, len))
>      {
>        buf[align + pos] = seek_char;
>        buf[align + len] = -seek_char;
> @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
>      do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
>  }
>
> +static void
> +do_overflow_tests (void)
> +{
> +  size_t i, j, len;
> +  const size_t one = 1;
> +  uintptr_t buf_addr = (uintptr_t) buf1;
> +
> +  for (i = 0; i < 750; ++i)
> +    {
> +        do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
> +        do_test (0, i, 751, i - buf_addr, BIG_CHAR);
> +        do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
> +        do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
> +        do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
> +
> +      len = 0;
> +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
> +        {
> +          len |= one << j;
> +          do_test (0, i, 751, len - i, BIG_CHAR);
> +          do_test (0, i, 751, len + i, BIG_CHAR);
> +          do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
> +          do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
> +
> +          do_test (0, i, 751, ~len - i, BIG_CHAR);
> +          do_test (0, i, 751, ~len + i, BIG_CHAR);
> +          do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
> +          do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
> +        }
> +    }
> +}
> +
>  static void
>  do_random_tests (void)
>  {
> @@ -221,6 +253,7 @@ test_main (void)
>      do_test (page_size / 2 - i, i, i, 1, 0x9B);
>
>    do_random_tests ();
> +  do_overflow_tests ();
>    return ret;
>  }
>
> diff --git a/string/test-strncat.c b/string/test-strncat.c
> index 2ef917b820..37ea26ea05 100644
> --- a/string/test-strncat.c
> +++ b/string/test-strncat.c
> @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
>      }
>  }
>
> +static void
> +do_overflow_tests (void)
> +{
> +  size_t i, j, len;
> +  const size_t one = 1;
> +  CHAR *s1, *s2;
> +  uintptr_t s1_addr;
> +  s1 = (CHAR *) buf1;
> +  s2 = (CHAR *) buf2;
> +  s1_addr = (uintptr_t)s1;
> + for (j = 0; j < 200; ++j)
> +      s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
> + s2[200] = 0;
> +  for (i = 0; i < 750; ++i) {
> +    for (j = 0; j < i; ++j)
> +      s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
> +    s1[i] = '\0';
> +
> +       FOR_EACH_IMPL (impl, 0)
> +    {
> +      s2[200] = '\0';
> +      do_one_test (impl, s2, s1, SIZE_MAX - i);
> +      s2[200] = '\0';
> +      do_one_test (impl, s2, s1, i - s1_addr);
> +      s2[200] = '\0';
> +      do_one_test (impl, s2, s1, -s1_addr - i);
> +      s2[200] = '\0';
> +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
> +      s2[200] = '\0';
> +      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
> +    }
> +
> +    len = 0;
> +    for (j = 8 * sizeof(size_t) - 1; j ; --j)
> +      {
> +        len |= one << j;
> +        FOR_EACH_IMPL (impl, 0)
> +          {
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, len - i);
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, len + i);
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, len - s1_addr - i);
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, len - s1_addr + i);
> +
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, ~len - i);
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, ~len + i);
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, ~len - s1_addr - i);
> +            s2[200] = '\0';
> +            do_one_test (impl, s2, s1, ~len - s1_addr + i);
> +          }
> +      }
> +  }
> +}
> +
>  static void
>  do_random_tests (void)
>  {
> @@ -316,6 +376,7 @@ test_main (void)
>      }
>
>    do_random_tests ();
> +  do_overflow_tests ();
>    return ret;
>  }
>
> diff --git a/string/test-strnlen.c b/string/test-strnlen.c
> index 920f58e97b..f53e09263f 100644
> --- a/string/test-strnlen.c
> +++ b/string/test-strnlen.c
> @@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
>      do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
>  }
>
> +static void
> +do_overflow_tests (void)
> +{
> +  size_t i, j, len;
> +  const size_t one = 1;
> +  uintptr_t buf_addr = (uintptr_t) buf1;
> +
> +  for (i = 0; i < 750; ++i)
> +    {
> +      do_test (0, i, SIZE_MAX - i, BIG_CHAR);
> +      do_test (0, i, i - buf_addr, BIG_CHAR);
> +      do_test (0, i, -buf_addr - i, BIG_CHAR);
> +      do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
> +      do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
> +
> +      len = 0;
> +      for (j = 8 * sizeof(size_t) - 1; j ; --j)
> +        {
> +          len |= one << j;
> +          do_test (0, i, len - i, BIG_CHAR);
> +          do_test (0, i, len + i, BIG_CHAR);
> +          do_test (0, i, len - buf_addr - i, BIG_CHAR);
> +          do_test (0, i, len - buf_addr + i, BIG_CHAR);
> +
> +          do_test (0, i, ~len - i, BIG_CHAR);
> +          do_test (0, i, ~len + i, BIG_CHAR);
> +          do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
> +          do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
> +        }
> +    }
> +}
> +
>  static void
>  do_random_tests (void)
>  {
> @@ -283,6 +315,7 @@ test_main (void)
>    do_random_tests ();
>    do_page_tests ();
>    do_page_2_tests ();
> +  do_overflow_tests ();
>    return ret;
>  }
>
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread
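
The maxlen/n values generated above are chosen so that either the
length itself sits near SIZE_MAX, or buf1 + n wraps around the address
space, or n * sizeof (wchar_t) wraps for the wide variants.  A short
standalone illustration of why (our example, mirroring the expressions
in the loop; not part of the test suite):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  char *buf = malloc (751);
  if (buf == NULL)
    return 1;
  uintptr_t buf_addr = (uintptr_t) buf;
  size_t i = 100;

  /* As in do_overflow_tests: this n makes buf + n wrap to 2^64 - i,
     i.e. the very top of the address space.  */
  size_t n = -buf_addr - i;
  printf ("buf + n == %#zx\n", (size_t) (buf_addr + n));

  /* For wcsnlen/wmemchr, the buggy n * sizeof (wchar_t) wraps as
     well, replacing the intended huge bound with an unrelated one.  */
  printf ("n * 4 == %#zx (n == %#zx)\n", n * 4, n);

  free (buf);
  return 0;
}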

* Re: [PATCH v3 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ #27974]
  2021-06-23  6:31 ` [PATCH v3 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
@ 2021-06-23 17:30   ` H.J. Lu
  0 siblings, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2021-06-23 17:30 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jun 22, 2021 at 11:32 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit fixes the bug mentioned in the previous commit.
>
> The previous implementations of wmemchr in these files relied
> on computing n * sizeof(wchar_t), a product the standard does not
> guarantee will fit in size_t.
>
> The new overflow tests added in the previous commit now
> pass (as well as all the other tests).
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/memchr.S                | 77 +++++++++++++++++++-------
>  sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
>  2 files changed, 98 insertions(+), 37 deletions(-)
>
> diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
> index beff2708de..3ddc4655cf 100644
> --- a/sysdeps/x86_64/memchr.S
> +++ b/sysdeps/x86_64/memchr.S
> @@ -21,9 +21,11 @@
>  #ifdef USE_AS_WMEMCHR
>  # define MEMCHR                wmemchr
>  # define PCMPEQ                pcmpeqd
> +# define CHAR_PER_VEC  4
>  #else
>  # define MEMCHR                memchr
>  # define PCMPEQ                pcmpeqb
> +# define CHAR_PER_VEC  16
>  #endif
>
>  /* fast SSE2 version with using pmaxub and 64 byte loop */
> @@ -33,15 +35,14 @@ ENTRY(MEMCHR)
>         movd    %esi, %xmm1
>         mov     %edi, %ecx
>
> +#ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %edx, %edx
> +#endif
>  #ifdef USE_AS_WMEMCHR
>         test    %RDX_LP, %RDX_LP
>         jz      L(return_null)
> -       shl     $2, %RDX_LP
>  #else
> -# ifdef __ILP32__
> -       /* Clear the upper 32 bits.  */
> -       movl    %edx, %edx
> -# endif
>         punpcklbw %xmm1, %xmm1
>         test    %RDX_LP, %RDX_LP
>         jz      L(return_null)
> @@ -60,13 +61,16 @@ ENTRY(MEMCHR)
>         test    %eax, %eax
>
>         jnz     L(matches_1)
> -       sub     $16, %rdx
> +       sub     $CHAR_PER_VEC, %rdx
>         jbe     L(return_null)
>         add     $16, %rdi
>         and     $15, %ecx
>         and     $-16, %rdi
> +#ifdef USE_AS_WMEMCHR
> +       shr     $2, %ecx
> +#endif
>         add     %rcx, %rdx
> -       sub     $64, %rdx
> +       sub     $(CHAR_PER_VEC * 4), %rdx
>         jbe     L(exit_loop)
>         jmp     L(loop_prolog)
>
> @@ -77,16 +81,21 @@ L(crosscache):
>         movdqa  (%rdi), %xmm0
>
>         PCMPEQ  %xmm1, %xmm0
> -/* Check if there is a match.  */
> +       /* Check if there is a match.  */
>         pmovmskb %xmm0, %eax
> -/* Remove the leading bytes.  */
> +       /* Remove the leading bytes.  */
>         sar     %cl, %eax
>         test    %eax, %eax
>         je      L(unaligned_no_match)
> -/* Check which byte is a match.  */
> +       /* Check which byte is a match.  */
>         bsf     %eax, %eax
> -
> +#ifdef USE_AS_WMEMCHR
> +       mov     %eax, %esi
> +       shr     $2, %esi
> +       sub     %rsi, %rdx
> +#else
>         sub     %rax, %rdx
> +#endif
>         jbe     L(return_null)
>         add     %rdi, %rax
>         add     %rcx, %rax
> @@ -94,15 +103,18 @@ L(crosscache):
>
>         .p2align 4
>  L(unaligned_no_match):
> -        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
> +       /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
>            "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
>            possible addition overflow.  */
>         neg     %rcx
>         add     $16, %rcx
> +#ifdef USE_AS_WMEMCHR
> +       shr     $2, %ecx
> +#endif
>         sub     %rcx, %rdx
>         jbe     L(return_null)
>         add     $16, %rdi
> -       sub     $64, %rdx
> +       sub     $(CHAR_PER_VEC * 4), %rdx
>         jbe     L(exit_loop)
>
>         .p2align 4
> @@ -135,7 +147,7 @@ L(loop_prolog):
>         test    $0x3f, %rdi
>         jz      L(align64_loop)
>
> -       sub     $64, %rdx
> +       sub     $(CHAR_PER_VEC * 4), %rdx
>         jbe     L(exit_loop)
>
>         movdqa  (%rdi), %xmm0
> @@ -167,11 +179,14 @@ L(loop_prolog):
>         mov     %rdi, %rcx
>         and     $-64, %rdi
>         and     $63, %ecx
> +#ifdef USE_AS_WMEMCHR
> +       shr     $2, %ecx
> +#endif
>         add     %rcx, %rdx
>
>         .p2align 4
>  L(align64_loop):
> -       sub     $64, %rdx
> +       sub     $(CHAR_PER_VEC * 4), %rdx
>         jbe     L(exit_loop)
>         movdqa  (%rdi), %xmm0
>         movdqa  16(%rdi), %xmm2
> @@ -218,7 +233,7 @@ L(align64_loop):
>
>         .p2align 4
>  L(exit_loop):
> -       add     $32, %edx
> +       add     $(CHAR_PER_VEC * 2), %edx
>         jle     L(exit_loop_32)
>
>         movdqa  (%rdi), %xmm0
> @@ -238,7 +253,7 @@ L(exit_loop):
>         pmovmskb %xmm3, %eax
>         test    %eax, %eax
>         jnz     L(matches32_1)
> -       sub     $16, %edx
> +       sub     $CHAR_PER_VEC, %edx
>         jle     L(return_null)
>
>         PCMPEQ  48(%rdi), %xmm1
> @@ -250,13 +265,13 @@ L(exit_loop):
>
>         .p2align 4
>  L(exit_loop_32):
> -       add     $32, %edx
> +       add     $(CHAR_PER_VEC * 2), %edx
>         movdqa  (%rdi), %xmm0
>         PCMPEQ  %xmm1, %xmm0
>         pmovmskb %xmm0, %eax
>         test    %eax, %eax
>         jnz     L(matches_1)
> -       sub     $16, %edx
> +       sub     $CHAR_PER_VEC, %edx
>         jbe     L(return_null)
>
>         PCMPEQ  16(%rdi), %xmm1
> @@ -293,7 +308,13 @@ L(matches32):
>         .p2align 4
>  L(matches_1):
>         bsf     %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +       mov     %eax, %esi
> +       shr     $2, %esi
> +       sub     %rsi, %rdx
> +#else
>         sub     %rax, %rdx
> +#endif
>         jbe     L(return_null)
>         add     %rdi, %rax
>         ret
> @@ -301,7 +322,13 @@ L(matches_1):
>         .p2align 4
>  L(matches16_1):
>         bsf     %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +       mov     %eax, %esi
> +       shr     $2, %esi
> +       sub     %rsi, %rdx
> +#else
>         sub     %rax, %rdx
> +#endif
>         jbe     L(return_null)
>         lea     16(%rdi, %rax), %rax
>         ret
> @@ -309,7 +336,13 @@ L(matches16_1):
>         .p2align 4
>  L(matches32_1):
>         bsf     %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +       mov     %eax, %esi
> +       shr     $2, %esi
> +       sub     %rsi, %rdx
> +#else
>         sub     %rax, %rdx
> +#endif
>         jbe     L(return_null)
>         lea     32(%rdi, %rax), %rax
>         ret
> @@ -317,7 +350,13 @@ L(matches32_1):
>         .p2align 4
>  L(matches48_1):
>         bsf     %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +       mov     %eax, %esi
> +       shr     $2, %esi
> +       sub     %rsi, %rdx
> +#else
>         sub     %rax, %rdx
> +#endif
>         jbe     L(return_null)
>         lea     48(%rdi, %rax), %rax
>         ret
> diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> index 0d8758e3e7..afdb956502 100644
> --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> @@ -54,21 +54,19 @@
>
>  # define VEC_SIZE 32
>  # define PAGE_SIZE 4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
>         .section SECTION(.text),"ax",@progbits
>  ENTRY (MEMCHR)
>  # ifndef USE_AS_RAWMEMCHR
>         /* Check for zero length.  */
> -       test    %RDX_LP, %RDX_LP
> -       jz      L(null)
> -# endif
> -# ifdef USE_AS_WMEMCHR
> -       shl     $2, %RDX_LP
> -# else
>  #  ifdef __ILP32__
> -       /* Clear the upper 32 bits.  */
> -       movl    %edx, %edx
> +       /* Clear upper bits.  */
> +       and     %RDX_LP, %RDX_LP
> +#  else
> +       test    %RDX_LP, %RDX_LP
>  #  endif
> +       jz      L(null)
>  # endif
>         /* Broadcast CHAR to YMMMATCH.  */
>         vmovd   %esi, %xmm0
> @@ -84,7 +82,7 @@ ENTRY (MEMCHR)
>         vpmovmskb %ymm1, %eax
>  # ifndef USE_AS_RAWMEMCHR
>         /* If length < CHAR_PER_VEC handle special.  */
> -       cmpq    $VEC_SIZE, %rdx
> +       cmpq    $CHAR_PER_VEC, %rdx
>         jbe     L(first_vec_x0)
>  # endif
>         testl   %eax, %eax
> @@ -98,6 +96,10 @@ ENTRY (MEMCHR)
>  L(first_vec_x0):
>         /* Check if first match was before length.  */
>         tzcntl  %eax, %eax
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %edx
> +#  endif
>         xorl    %ecx, %ecx
>         cmpl    %eax, %edx
>         leaq    (%rdi, %rax), %rax
> @@ -110,12 +112,12 @@ L(null):
>  # endif
>         .p2align 4
>  L(cross_page_boundary):
> -       /* Save pointer before aligning as its original value is necessary
> -          for computing the return address if a byte is found or adjusting
> -          the length if it is not and this is memchr.  */
> +       /* Save pointer before aligning as its original value is
> +          necessary for computing the return address if a byte is
> +          found, or for adjusting the length if it is not and this
> +          is memchr.  */
>         movq    %rdi, %rcx
> -       /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
> -          rdi for rawmemchr.  */
> +       /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
> +          and rdi for rawmemchr.  */
>         orq     $(VEC_SIZE - 1), %ALGN_PTR_REG
>         VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
> @@ -124,6 +126,10 @@ L(cross_page_boundary):
>            match).  */
>         leaq    1(%ALGN_PTR_REG), %rsi
>         subq    %RRAW_PTR_REG, %rsi
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> +       shrl    $2, %esi
> +#  endif
>  # endif
>         /* Remove the leading bytes.  */
>         sarxl   %ERAW_PTR_REG, %eax, %eax
> @@ -181,6 +187,10 @@ L(cross_page_continue):
>         orq     $(VEC_SIZE - 1), %rdi
>         /* esi is for adjusting length to see if near the end.  */
>         leal    (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> +       sarl    $2, %esi
> +#  endif
>  # else
>         orq     $(VEC_SIZE - 1), %rdi
>  L(cross_page_continue):
> @@ -213,7 +223,7 @@ L(cross_page_continue):
>
>  # ifndef USE_AS_RAWMEMCHR
>         /* Check if at last VEC_SIZE * 4 length.  */
> -       subq    $(VEC_SIZE * 4), %rdx
> +       subq    $(CHAR_PER_VEC * 4), %rdx
>         jbe     L(last_4x_vec_or_less_cmpeq)
>         /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
>            length.  */
> @@ -221,6 +231,10 @@ L(cross_page_continue):
>         movl    %edi, %ecx
>         orq     $(VEC_SIZE * 4 - 1), %rdi
>         andl    $(VEC_SIZE * 4 - 1), %ecx
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> +       sarl    $2, %ecx
> +#  endif
>         addq    %rcx, %rdx
>  # else
>         /* Align data to VEC_SIZE * 4 - 1 for loop.  */
> @@ -250,15 +264,19 @@ L(loop_4x_vec):
>
>         subq    $-(VEC_SIZE * 4), %rdi
>
> -       subq    $(VEC_SIZE * 4), %rdx
> +       subq    $(CHAR_PER_VEC * 4), %rdx
>         ja      L(loop_4x_vec)
>
> -       /* Fall through into less than 4 remaining vectors of length case.
> -        */
> +       /* Fall through into less than 4 remaining vectors of length
> +          case.  */
>         VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
>         .p2align 4
>  L(last_4x_vec_or_less):
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %edx
> +#  endif
>         /* Check if first VEC contained match.  */
>         testl   %eax, %eax
>         jnz     L(first_vec_x1_check)
> @@ -355,6 +373,10 @@ L(last_vec_x2_return):
>  L(last_4x_vec_or_less_cmpeq):
>         VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %edx
> +#  endif
>         subq    $-(VEC_SIZE * 4), %rdi
>         /* Check first VEC regardless.  */
>         testl   %eax, %eax
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread
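
The recurring hunk in memchr.S above (mov %eax, %esi; shr $2, %esi;
sub %rsi, %rdx) converts the byte offset of a match into a wchar_t
count before charging it against the remaining length, which the patch
now keeps in characters rather than bytes.  As a scalar sketch (one
reading of the code, with illustrative names):

#include <stddef.h>
#include <wchar.h>

/* Model of L(matches_1): byte_offset comes from bsf on the pcmpeqd
   mask; remaining counts wchar_t elements not yet consumed.  */
wchar_t *
match_tail_model (const wchar_t *p, size_t remaining,
                  unsigned byte_offset)
{
  size_t char_offset = byte_offset / sizeof (wchar_t); /* shr $2 */
  if (remaining <= char_offset)                        /* sub; jbe */
    return NULL;                                       /* return_null */
  return (wchar_t *) ((const char *) p + byte_offset); /* add %rdi */
}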

* Re: [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974]
  2021-06-23 17:30   ` H.J. Lu
@ 2021-06-23 18:30     ` Noah Goldstein
  2022-01-27 21:06       ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-23 18:30 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Wed, Jun 23, 2021 at 1:30 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> On Tue, Jun 22, 2021 at 11:32 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> > [...]
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
>

Pushed and closed the bug report (left a comment there with the
commits).

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974]
  2021-06-23 18:30     ` Noah Goldstein
@ 2022-01-27 21:06       ` H.J. Lu
  0 siblings, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2022-01-27 21:06 UTC (permalink / raw)
  To: Noah Goldstein, Libc-stable Mailing List
  Cc: GNU C Library, Carlos O'Donell

On Wed, Jun 23, 2021 at 11:30 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
> On Wed, Jun 23, 2021 at 1:30 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Tue, Jun 22, 2021 at 11:32 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >
>> > [...]
>> >
>>
>> LGTM.
>>
>> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>>
>> Thanks.
>>
>> --
>> H.J.
>
>
> Pushed and closed the bug report (left comment in bug report with the commits).

I am backporting this patch set to release branches, including their dependency
patches.

-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2022-01-27 21:06 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
2021-06-09 20:52 ` [PATCH v1 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 Noah Goldstein
2021-06-09 20:52 ` [PATCH v1 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 Noah Goldstein
2021-06-09 21:53 ` [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat H.J. Lu
2021-06-09 22:26   ` Noah Goldstein
2021-06-22 15:43     ` Noah Goldstein
2021-06-22 16:18       ` H.J. Lu
2021-06-22 18:23         ` Noah Goldstein
2021-06-22 18:11 ` [PATCH v2 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974] Noah Goldstein
2021-06-22 21:24   ` H.J. Lu
2021-06-22 18:11 ` [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
2021-06-22 21:24   ` H.J. Lu
2021-06-22 18:11 ` [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
2021-06-22 21:33   ` H.J. Lu
2021-06-22 23:16     ` Noah Goldstein
2021-06-22 23:28       ` H.J. Lu
2021-06-23  3:11         ` Noah Goldstein
2021-06-23  3:58           ` H.J. Lu
2021-06-23  4:55             ` Noah Goldstein
2021-06-23  6:31 ` [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat " Noah Goldstein
2021-06-23 17:30   ` H.J. Lu
2021-06-23 18:30     ` Noah Goldstein
2022-01-27 21:06       ` H.J. Lu
2021-06-23  6:31 ` [PATCH v3 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
2021-06-23 17:30   ` H.J. Lu
2021-06-23  6:31 ` [PATCH v3 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
2021-06-23 17:27   ` H.J. Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).