* [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat
@ 2021-06-09 20:52 Noah Goldstein
2021-06-09 20:52 ` [PATCH v1 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 Noah Goldstein
` (8 more replies)
0 siblings, 9 replies; 27+ messages in thread
From: Noah Goldstein @ 2021-06-09 20:52 UTC (permalink / raw)
To: libc-alpha
This commit adds tests for a bug in the wide char variant of the
functions where the implementation may assume that maxlen for wcsnlen
or n for wmemchr/strncat will not overflow when multiplied by
sizeof(wchar_t).
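
As a minimal sketch of the failure mode (illustrative only, not part
of the patch; it assumes a 64-bit size_t and a 4-byte wchar_t):

    #include <stddef.h>
    #include <wchar.h>

    /* A length that is valid for wcsnlen/wmemchr but whose byte
       count wraps when pre-multiplied by sizeof (wchar_t).  */
    static size_t
    wrapped_byte_count (void)
    {
      size_t maxlen = (size_t) 1 << 62;   /* huge but legal maxlen */
      return maxlen * sizeof (wchar_t);   /* wraps to 0 mod 2^64 */
    }

An implementation that scans that many bytes examines nothing and
misses a terminator or match that is actually in range, which is why
the tests below use lengths near SIZE_MAX and near -buf_addr (so that
ptr + len can wrap as well).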
These tests show the following implementations failing on x86_64:
wcsnlen-sse4_1
wcsnlen-avx2
wmemchr-sse2
wmemchr-avx2
strncat would fail as well if it were run on a system that preferred
either of the wcsnlen implementations that failed, as it relies on
wcsnlen.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
string/test-memchr.c | 39 ++++++++++++++++++++++++---
string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
string/test-strnlen.c | 33 +++++++++++++++++++++++
3 files changed, 130 insertions(+), 3 deletions(-)
diff --git a/string/test-memchr.c b/string/test-memchr.c
index 665edc32af..ce964284aa 100644
--- a/string/test-memchr.c
+++ b/string/test-memchr.c
@@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
CHAR *res = CALL (impl, s, c, n);
if (res != exp_res)
{
- error (0, 0, "Wrong result in function %s %p %p", impl->name,
- res, exp_res);
+ error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
+ impl->name, s, c, n, res, exp_res);
ret = 1;
return;
}
@@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
}
buf[align + len] = 0;
- if (pos < len)
+ if (pos < MIN(n, len))
{
buf[align + pos] = seek_char;
buf[align + len] = -seek_char;
@@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
}
+static void
+do_overflow_tests (void)
+{
+ size_t i, j, len;
+ const size_t one = 1;
+ uintptr_t buf_addr = (uintptr_t) buf1;
+
+ for (i = 0; i < 750; ++i)
+ {
+ do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
+ do_test (0, i, 751, i - buf_addr, BIG_CHAR);
+ do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
+ do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
+ do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+ len = 0;
+ for (j = 8 * sizeof(size_t) - 1; j ; --j)
+ {
+ len |= one << j;
+ do_test (0, i, 751, len - i, BIG_CHAR);
+ do_test (0, i, 751, len + i, BIG_CHAR);
+ do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
+ do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
+
+ do_test (0, i, 751, ~len - i, BIG_CHAR);
+ do_test (0, i, 751, ~len + i, BIG_CHAR);
+ do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
+ do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
+ }
+ }
+}
+
static void
do_random_tests (void)
{
@@ -221,6 +253,7 @@ test_main (void)
do_test (page_size / 2 - i, i, i, 1, 0x9B);
do_random_tests ();
+ do_overflow_tests ();
return ret;
}
diff --git a/string/test-strncat.c b/string/test-strncat.c
index 2ef917b820..0ab7541d4e 100644
--- a/string/test-strncat.c
+++ b/string/test-strncat.c
@@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
}
}
+static void
+do_overflow_tests (void)
+{
+ size_t i, j, len;
+ const size_t one = 1;
+ CHAR *s1, *s2;
+ uintptr_t s1_addr;
+ s1 = (CHAR *) buf1;
+ s2 = (CHAR *) buf2;
+ s1_addr = (uintptr_t)s1;
+ for (j = 0; j < 200; ++j)
+ s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
+ s2[200] = 0;
+ for (i = 0; i < 750; ++i) {
+ for (j = 0; j < i; ++j)
+ s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
+ s1[i] = '\0';
+
+ FOR_EACH_IMPL (impl, 0)
+ {
+ s2[0] = '\0';
+ do_one_test (impl, s2, s1, SIZE_MAX - i);
+ s2[0] = '\0';
+ do_one_test (impl, s2, s1, i - s1_addr);
+ s2[0] = '\0';
+ do_one_test (impl, s2, s1, -s1_addr - i);
+ s2[0] = '\0';
+ do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
+ s2[0] = '\0';
+ do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
+ }
+
+ len = 0;
+ for (j = 8 * sizeof(size_t) - 1; j ; --j)
+ {
+ len |= one << j;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ s2[0] = '\0';
+ do_one_test (impl, s2, s1, len - i);
+ s2[0] = '\0';
+ do_one_test (impl, s2, s1, len + i);
+ s2[0] = '\0';
+ do_one_test (impl, s2, s1, len - s1_addr - i);
+ s2[0] = '\0';
+ do_one_test (impl, s2, s1, len - s1_addr + i);
+
+ s2[0] = '\0';
+ do_one_test (impl, s2, s1, ~len - i);
+ s2[0] = '\0';
+ do_one_test (impl, s2, s1, ~len + i);
+ s2[0] = '\0';
+ do_one_test (impl, s2, s1, ~len - s1_addr - i);
+ s2[0] = '\0';
+ do_one_test (impl, s2, s1, ~len - s1_addr + i);
+ }
+ }
+ }
+}
+
static void
do_random_tests (void)
{
@@ -316,6 +376,7 @@ test_main (void)
}
do_random_tests ();
+ do_overflow_tests ();
return ret;
}
diff --git a/string/test-strnlen.c b/string/test-strnlen.c
index 920f58e97b..f53e09263f 100644
--- a/string/test-strnlen.c
+++ b/string/test-strnlen.c
@@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
}
+static void
+do_overflow_tests (void)
+{
+ size_t i, j, len;
+ const size_t one = 1;
+ uintptr_t buf_addr = (uintptr_t) buf1;
+
+ for (i = 0; i < 750; ++i)
+ {
+ do_test (0, i, SIZE_MAX - i, BIG_CHAR);
+ do_test (0, i, i - buf_addr, BIG_CHAR);
+ do_test (0, i, -buf_addr - i, BIG_CHAR);
+ do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
+ do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+ len = 0;
+ for (j = 8 * sizeof(size_t) - 1; j ; --j)
+ {
+ len |= one << j;
+ do_test (0, i, len - i, BIG_CHAR);
+ do_test (0, i, len + i, BIG_CHAR);
+ do_test (0, i, len - buf_addr - i, BIG_CHAR);
+ do_test (0, i, len - buf_addr + i, BIG_CHAR);
+
+ do_test (0, i, ~len - i, BIG_CHAR);
+ do_test (0, i, ~len + i, BIG_CHAR);
+ do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
+ do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
+ }
+ }
+}
+
static void
do_random_tests (void)
{
@@ -283,6 +315,7 @@ test_main (void)
do_random_tests ();
do_page_tests ();
do_page_2_tests ();
+ do_overflow_tests ();
return ret;
}
--
2.25.1
* [PATCH v1 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2
2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
@ 2021-06-09 20:52 ` Noah Goldstein
2021-06-09 20:52 ` [PATCH v1 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 Noah Goldstein
` (7 subsequent siblings)
8 siblings, 0 replies; 27+ messages in thread
From: Noah Goldstein @ 2021-06-09 20:52 UTC (permalink / raw)
To: libc-alpha
This commit fixes the bug mentioned in the previous commit.
The previous implementations of wmemchr in these files relied
on n * sizeof(wchar_t) not overflowing, which is not guaranteed
by the standard.
The new overflow tests added in the previous commit now
pass (as well as all the other tests).
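
In C terms the fix amounts to something like the following sketch
(illustrative only; the real change is the CHAR_PER_VEC counting in
the assembly below, and wmemchr_sketch is a made-up name):

    #include <stddef.h>
    #include <wchar.h>

    /* Counting in wchar_t units never forms n * sizeof (wchar_t),
       so no wraparound is possible.  */
    static wchar_t *
    wmemchr_sketch (const wchar_t *s, wchar_t c, size_t n)
    {
      for (size_t i = 0; i < n; i++)
        if (s[i] == c)
          return (wchar_t *) (s + i);
      return NULL;
    }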
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++-------
sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
2 files changed, 98 insertions(+), 37 deletions(-)
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index beff2708de..3ddc4655cf 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
#ifdef USE_AS_WMEMCHR
# define MEMCHR wmemchr
# define PCMPEQ pcmpeqd
+# define CHAR_PER_VEC 4
#else
# define MEMCHR memchr
# define PCMPEQ pcmpeqb
+# define CHAR_PER_VEC 16
#endif
/* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
movd %esi, %xmm1
mov %edi, %ecx
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+#endif
#ifdef USE_AS_WMEMCHR
test %RDX_LP, %RDX_LP
jz L(return_null)
- shl $2, %RDX_LP
#else
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
-# endif
punpcklbw %xmm1, %xmm1
test %RDX_LP, %RDX_LP
jz L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
test %eax, %eax
jnz L(matches_1)
- sub $16, %rdx
+ sub $CHAR_PER_VEC, %rdx
jbe L(return_null)
add $16, %rdi
and $15, %ecx
and $-16, %rdi
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
jmp L(loop_prolog)
@@ -77,16 +81,21 @@ L(crosscache):
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
-/* Check if there is a match. */
+ /* Check if there is a match. */
pmovmskb %xmm0, %eax
-/* Remove the leading bytes. */
+ /* Remove the leading bytes. */
sar %cl, %eax
test %eax, %eax
je L(unaligned_no_match)
-/* Check which byte is a match. */
+ /* Check which byte is a match. */
bsf %eax, %eax
-
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
add %rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):
.p2align 4
L(unaligned_no_match):
- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
+ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
"rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
possible addition overflow. */
neg %rcx
add $16, %rcx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
sub %rcx, %rdx
jbe L(return_null)
add $16, %rdi
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
test $0x3f, %rdi
jz L(align64_loop)
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
mov %rdi, %rcx
and $-64, %rdi
and $63, %ecx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
.p2align 4
L(align64_loop):
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
movdqa 16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):
.p2align 4
L(exit_loop):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
jle L(exit_loop_32)
movdqa (%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jle L(return_null)
PCMPEQ 48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):
.p2align 4
L(exit_loop_32):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jbe L(return_null)
PCMPEQ 16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
.p2align 4
L(matches_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
ret
@@ -301,7 +322,13 @@ L(matches_1):
.p2align 4
L(matches16_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 16(%rdi, %rax), %rax
ret
@@ -309,7 +336,13 @@ L(matches16_1):
.p2align 4
L(matches32_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 32(%rdi, %rax), %rax
ret
@@ -317,7 +350,13 @@ L(matches32_1):
.p2align 4
L(matches48_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 48(%rdi, %rax), %rax
ret
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 0d8758e3e7..afdb956502 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -54,21 +54,19 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
- test %RDX_LP, %RDX_LP
- jz L(null)
-# endif
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
+ /* Clear upper bits. */
+ and %RDX_LP, %RDX_LP
+# else
+ test %RDX_LP, %RDX_LP
# endif
+ jz L(null)
# endif
/* Broadcast CHAR to YMMMATCH. */
vmovd %esi, %xmm0
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
/* If length < CHAR_PER_VEC handle special. */
- cmpq $VEC_SIZE, %rdx
+ cmpq $CHAR_PER_VEC, %rdx
jbe L(first_vec_x0)
# endif
testl %eax, %eax
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
L(first_vec_x0):
/* Check if first match was before length. */
tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
xorl %ecx, %ecx
cmpl %eax, %edx
leaq (%rdi, %rax), %rax
@@ -110,12 +112,12 @@ L(null):
# endif
.p2align 4
L(cross_page_boundary):
- /* Save pointer before aligning as its original value is necessary
for computing the return address if byte is found or adjusting length
if it is not and this is memchr. */
+ /* Save pointer before aligning as its original value is
necessary for computing the return address if byte is found or
adjusting length if it is not and this is memchr. */
movq %rdi, %rcx
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
- rdi for rawmemchr. */
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+ and rdi for rawmemchr. */
orq $(VEC_SIZE - 1), %ALGN_PTR_REG
VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
@@ -124,6 +126,10 @@ L(cross_page_boundary):
match). */
leaq 1(%ALGN_PTR_REG), %rsi
subq %RRAW_PTR_REG, %rsi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %esi
+# endif
# endif
/* Remove the leading bytes. */
sarxl %ERAW_PTR_REG, %eax, %eax
@@ -181,6 +187,10 @@ L(cross_page_continue):
orq $(VEC_SIZE - 1), %rdi
/* esi is for adjusting length to see if near the end. */
leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
# else
orq $(VEC_SIZE - 1), %rdi
L(cross_page_continue):
@@ -213,7 +223,7 @@ L(cross_page_continue):
# ifndef USE_AS_RAWMEMCHR
/* Check if at last VEC_SIZE * 4 length. */
- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
jbe L(last_4x_vec_or_less_cmpeq)
/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
length. */
@@ -221,6 +231,10 @@ L(cross_page_continue):
movl %edi, %ecx
orq $(VEC_SIZE * 4 - 1), %rdi
andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
addq %rcx, %rdx
# else
/* Align data to VEC_SIZE * 4 - 1 for loop. */
@@ -250,15 +264,19 @@ L(loop_4x_vec):
subq $-(VEC_SIZE * 4), %rdi
- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_4x_vec)
- /* Fall through into less than 4 remaining vectors of length case.
- */
+ /* Fall through into less than 4 remaining vectors of length
+ case. */
VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
.p2align 4
L(last_4x_vec_or_less):
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
/* Check if first VEC contained match. */
testl %eax, %eax
jnz L(first_vec_x1_check)
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
L(last_4x_vec_or_less_cmpeq):
VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
subq $-(VEC_SIZE * 4), %rdi
/* Check first VEC regardless. */
testl %eax, %eax
--
2.25.1
* [PATCH v1 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2
2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
2021-06-09 20:52 ` [PATCH v1 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 Noah Goldstein
@ 2021-06-09 20:52 ` Noah Goldstein
2021-06-09 21:53 ` [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat H.J. Lu
` (6 subsequent siblings)
8 siblings, 0 replies; 27+ messages in thread
From: Noah Goldstein @ 2021-06-09 20:52 UTC (permalink / raw)
To: libc-alpha
This commit fixes the bug mentioned in the previous commit.
The previous implementations of wcsnlen in these files relied
on maxlen * sizeof(wchar_t) not overflowing, which is not
guaranteed by the standard.
The new overflow tests added in the previous commit now
pass (as well as all the other tests).
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
It's possible there is room for a speedup in strnlen-avx2
and strnlen-evex if we check for overflow first and jump to
strlen. This allows end pointers to be used instead of
tracking length, which will save some ALU work / code size.
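
In C terms the overflow check added to strlen.S below amounts to the
following sketch (illustrative only; wcsnlen_sketch is a made-up
name):

    #include <stddef.h>
    #include <stdint.h>
    #include <wchar.h>

    static size_t
    wcsnlen_sketch (const wchar_t *s, size_t maxlen)
    {
      /* If maxlen * sizeof (wchar_t) would overflow, no object can
         be that large, so any well-defined call must have a null
         terminator in valid memory and plain wcslen is correct.  */
      if (maxlen > SIZE_MAX / sizeof (wchar_t))
        return wcslen (s);
      size_t i = 0;
      while (i < maxlen && s[i] != L'\0')
        i++;
      return i;
    }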
sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
sysdeps/x86_64/strlen.S | 14 ++-
2 files changed, 106 insertions(+), 38 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index bd2e6ee44a..b282a75613 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -44,21 +44,21 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
/* Check zero length. */
+# ifdef __ILP32__
+ /* Clear upper bits. */
+ and %RSI_LP, %RSI_LP
+# else
test %RSI_LP, %RSI_LP
+# endif
jz L(zero)
/* Store max len in R8_LP before adjusting if using WCSLEN. */
mov %RSI_LP, %R8_LP
-# ifdef USE_AS_WCSLEN
- shl $2, %RSI_LP
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- movl %esi, %esi
-# endif
# endif
movl %edi, %eax
movq %rdi, %rdx
@@ -72,10 +72,10 @@ ENTRY (STRLEN)
/* Check the first VEC_SIZE bytes. */
VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
# ifdef USE_AS_STRNLEN
/* If length < VEC_SIZE handle special. */
- cmpq $VEC_SIZE, %rsi
+ cmpq $CHAR_PER_VEC, %rsi
jbe L(first_vec_x0)
# endif
/* If empty continue to aligned_more. Otherwise return bit
@@ -84,6 +84,7 @@ ENTRY (STRLEN)
jz L(aligned_more)
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -97,9 +98,14 @@ L(zero):
L(first_vec_x0):
/* Set bit for max len so that tzcnt will return min of max len
and position of first match. */
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
btsq %rsi, %rax
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -113,14 +119,19 @@ L(first_vec_x1):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 4 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
incl %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -133,14 +144,19 @@ L(first_vec_x2):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 3 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -153,14 +169,19 @@ L(first_vec_x3):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 2 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE * 2 + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -173,14 +194,19 @@ L(first_vec_x4):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE * 3 + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -195,10 +221,14 @@ L(cross_page_continue):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
# ifdef USE_AS_STRNLEN
- /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
it simplifies the logic in last_4x_vec_or_less. */
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
because it simplifies the logic in last_4x_vec_or_less. */
leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
# endif
/* Load first VEC regardless. */
VPCMPEQ 1(%rdi), %ymm0, %ymm1
@@ -207,34 +237,38 @@ L(cross_page_continue):
subq %rcx, %rsi
jb L(last_4x_vec_or_less)
# endif
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x1)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x4)
/* Align data to VEC_SIZE * 4 - 1. */
# ifdef USE_AS_STRNLEN
/* Before adjusting length check if at last VEC_SIZE * 4. */
- cmpq $(VEC_SIZE * 4 - 1), %rsi
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
jbe L(last_4x_vec_or_less_load)
incq %rdi
movl %edi, %ecx
orq $(VEC_SIZE * 4 - 1), %rdi
andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
/* Readjust length. */
addq %rcx, %rsi
# else
@@ -246,13 +280,13 @@ L(cross_page_continue):
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
/* Break if at end of length. */
- subq $(VEC_SIZE * 4), %rsi
+ subq $(CHAR_PER_VEC * 4), %rsi
jb L(last_4x_vec_or_less_cmpeq)
# endif
- /* Save some code size by microfusing VPMINU with the load. Since
the matches in ymm2/ymm4 can only be returned if there were no
matches in ymm1/ymm3 respectively, there is no issue with overlap.
- */
+ /* Save some code size by microfusing VPMINU with the load.
Since the matches in ymm2/ymm4 can only be returned if there
were no matches in ymm1/ymm3 respectively, there is no issue
with overlap. */
vmovdqa 1(%rdi), %ymm1
VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
@@ -260,7 +294,7 @@ L(loop_4x_vec):
VPMINU %ymm2, %ymm4, %ymm5
VPCMPEQ %ymm5, %ymm0, %ymm5
- vpmovmskb %ymm5, %ecx
+ vpmovmskb %ymm5, %ecx
subq $-(VEC_SIZE * 4), %rdi
testl %ecx, %ecx
@@ -268,27 +302,28 @@ L(loop_4x_vec):
VPCMPEQ %ymm1, %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
subq %rdx, %rdi
testl %eax, %eax
jnz L(last_vec_return_x0)
VPCMPEQ %ymm2, %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(last_vec_return_x1)
/* Combine last 2 VEC. */
VPCMPEQ %ymm3, %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
- /* rcx has combined result from all 4 VEC. It will only be used if
- the first 3 other VEC all did not contain a match. */
+ vpmovmskb %ymm3, %eax
+ /* rcx has combined result from all 4 VEC. It will only be used
+ if the first 3 other VEC all did not contain a match. */
salq $32, %rcx
orq %rcx, %rax
tzcntq %rax, %rax
subq $(VEC_SIZE * 2 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -297,15 +332,19 @@ L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
.p2align 4
L(last_4x_vec_or_less_load):
- /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1.
+ */
subq $-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
VPCMPEQ 1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
-
- vpmovmskb %ymm1, %eax
- /* If remaining length > VEC_SIZE * 2. This works if esi is off by
- VEC_SIZE * 4. */
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
+ vpmovmskb %ymm1, %eax
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off
+ by VEC_SIZE * 4. */
testl $(VEC_SIZE * 2), %esi
jnz L(last_4x_vec)
@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
jb L(max)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
cmpl %eax, %esi
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
subq $(VEC_SIZE * 4 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
subq $(VEC_SIZE * 3 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
incl %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -381,14 +424,14 @@ L(last_4x_vec):
jnz L(last_vec_x1)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(last_vec_x2)
/* Normalize length. */
andl $(VEC_SIZE * 4 - 1), %esi
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(last_vec_x3)
@@ -396,7 +439,7 @@ L(last_4x_vec):
jb L(max)
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
cmpl %eax, %esi
@@ -405,6 +448,7 @@ L(last_4x_vec):
addl $(VEC_SIZE * 3 + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -419,6 +463,7 @@ L(last_vec_x1):
incl %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -432,6 +477,7 @@ L(last_vec_x2):
addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -447,6 +493,7 @@ L(last_vec_x3):
addl $(VEC_SIZE * 2 + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -455,13 +502,13 @@ L(max_end):
VZEROUPPER_RETURN
# endif
- /* Cold case for crossing page with first load. */
+ /* Cold case for crossing page with first load. */
.p2align 4
L(cross_page_boundary):
/* Align data to VEC_SIZE - 1. */
orq $(VEC_SIZE - 1), %rdi
VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
so no need to manually mod rdx. */
sarxl %edx, %eax, %eax
@@ -470,6 +517,10 @@ L(cross_page_boundary):
jnz L(cross_page_less_vec)
leaq 1(%rdi), %rcx
subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %ecx
+# endif
/* Check length. */
cmpq %rsi, %rcx
jb L(cross_page_continue)
@@ -479,6 +530,7 @@ L(cross_page_boundary):
jz L(cross_page_continue)
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide length by 4 to get wchar_t count. */
shrl $2, %eax
# endif
# endif
@@ -489,6 +541,10 @@ L(return_vzeroupper):
.p2align 4
L(cross_page_less_vec):
tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
cmpq %rax, %rsi
cmovb %esi, %eax
# ifdef USE_AS_WCSLEN
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index d223ea1700..3fc6734910 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -65,12 +65,24 @@ ENTRY(strlen)
ret
L(n_nonzero):
# ifdef AS_WCSLEN
- shl $2, %RSI_LP
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
overflow, the only way this program doesn't have undefined behavior
is if there is a null terminator in valid memory, so strlen will
suffice. */
+ mov %RSI_LP, %R10_LP
+ sar $62, %R10_LP
+ test %R10_LP, %R10_LP
+ jnz __wcslen_sse2
+ sal $2, %RSI_LP
# endif
/* Initialize long lived registers. */
add %RDI_LP, %RSI_LP
+# ifdef AS_WCSLEN
+/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
+ jbe __wcslen_sse2
+# endif
mov %RSI_LP, %R10_LP
and $-64, %R10_LP
mov %RSI_LP, %R11_LP
--
2.25.1
* Re: [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat
2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
2021-06-09 20:52 ` [PATCH v1 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 Noah Goldstein
2021-06-09 20:52 ` [PATCH v1 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 Noah Goldstein
@ 2021-06-09 21:53 ` H.J. Lu
2021-06-09 22:26 ` Noah Goldstein
2021-06-22 18:11 ` [PATCH v2 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974] Noah Goldstein
` (5 subsequent siblings)
8 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2021-06-09 21:53 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Wed, Jun 9, 2021 at 1:53 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit adds tests for a bug in the wide char variant of the
> functions where the implementation may assume that maxlen for wcsnlen
> or n for wmemchr/strncat will not overflow when multiplied by
> sizeof(wchar_t).
>
> These tests show the following implementations failing on x86_64:
>
> wcsnlen-sse4_1
> wcsnlen-avx2
>
> wmemchr-sse2
> wmemchr-avx2
>
> strncat would fail as well if it were run on a system that preferred
> either of the wcsnlen implementations that failed, as it relies on
> wcsnlen.
Please open a bug report for each standard C function. We need to
track them for backporting to release branches.
Thanks.
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> string/test-memchr.c | 39 ++++++++++++++++++++++++---
> string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
> string/test-strnlen.c | 33 +++++++++++++++++++++++
> 3 files changed, 130 insertions(+), 3 deletions(-)
>
> diff --git a/string/test-memchr.c b/string/test-memchr.c
> index 665edc32af..ce964284aa 100644
> --- a/string/test-memchr.c
> +++ b/string/test-memchr.c
> @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
> CHAR *res = CALL (impl, s, c, n);
> if (res != exp_res)
> {
> - error (0, 0, "Wrong result in function %s %p %p", impl->name,
> - res, exp_res);
> + error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
> + impl->name, s, c, n, res, exp_res);
> ret = 1;
> return;
> }
> @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
> }
> buf[align + len] = 0;
>
> - if (pos < len)
> + if (pos < MIN(n, len))
> {
> buf[align + pos] = seek_char;
> buf[align + len] = -seek_char;
> @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
> do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
> }
>
> +static void
> +do_overflow_tests (void)
> +{
> + size_t i, j, len;
> + const size_t one = 1;
> + uintptr_t buf_addr = (uintptr_t) buf1;
> +
> + for (i = 0; i < 750; ++i)
> + {
> + do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
> + do_test (0, i, 751, i - buf_addr, BIG_CHAR);
> + do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
> + do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
> + do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
> +
> + len = 0;
> + for (j = 8 * sizeof(size_t) - 1; j ; --j)
> + {
> + len |= one << j;
> + do_test (0, i, 751, len - i, BIG_CHAR);
> + do_test (0, i, 751, len + i, BIG_CHAR);
> + do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
> + do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
> +
> + do_test (0, i, 751, ~len - i, BIG_CHAR);
> + do_test (0, i, 751, ~len + i, BIG_CHAR);
> + do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
> + do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
> + }
> + }
> +}
> +
> static void
> do_random_tests (void)
> {
> @@ -221,6 +253,7 @@ test_main (void)
> do_test (page_size / 2 - i, i, i, 1, 0x9B);
>
> do_random_tests ();
> + do_overflow_tests ();
> return ret;
> }
>
> diff --git a/string/test-strncat.c b/string/test-strncat.c
> index 2ef917b820..0ab7541d4e 100644
> --- a/string/test-strncat.c
> +++ b/string/test-strncat.c
> @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
> }
> }
>
> +static void
> +do_overflow_tests (void)
> +{
> + size_t i, j, len;
> + const size_t one = 1;
> + CHAR *s1, *s2;
> + uintptr_t s1_addr;
> + s1 = (CHAR *) buf1;
> + s2 = (CHAR *) buf2;
> + s1_addr = (uintptr_t)s1;
> + for (j = 0; j < 200; ++j)
> + s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
> + s2[200] = 0;
> + for (i = 0; i < 750; ++i) {
> + for (j = 0; j < i; ++j)
> + s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
> + s1[i] = '\0';
> +
> + FOR_EACH_IMPL (impl, 0)
> + {
> + s2[0] = '\0';
> + do_one_test (impl, s2, s1, SIZE_MAX - i);
> + s2[0] = '\0';
> + do_one_test (impl, s2, s1, i - s1_addr);
> + s2[0] = '\0';
> + do_one_test (impl, s2, s1, -s1_addr - i);
> + s2[0] = '\0';
> + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
> + s2[0] = '\0';
> + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
> + }
> +
> + len = 0;
> + for (j = 8 * sizeof(size_t) - 1; j ; --j)
> + {
> + len |= one << j;
> + FOR_EACH_IMPL (impl, 0)
> + {
> + s2[0] = '\0';
> + do_one_test (impl, s2, s1, len - i);
> + s2[0] = '\0';
> + do_one_test (impl, s2, s1, len + i);
> + s2[0] = '\0';
> + do_one_test (impl, s2, s1, len - s1_addr - i);
> + s2[0] = '\0';
> + do_one_test (impl, s2, s1, len - s1_addr + i);
> +
> + s2[0] = '\0';
> + do_one_test (impl, s2, s1, ~len - i);
> + s2[0] = '\0';
> + do_one_test (impl, s2, s1, ~len + i);
> + s2[0] = '\0';
> + do_one_test (impl, s2, s1, ~len - s1_addr - i);
> + s2[0] = '\0';
> + do_one_test (impl, s2, s1, ~len - s1_addr + i);
> + }
> + }
> + }
> +}
> +
> static void
> do_random_tests (void)
> {
> @@ -316,6 +376,7 @@ test_main (void)
> }
>
> do_random_tests ();
> + do_overflow_tests ();
> return ret;
> }
>
> diff --git a/string/test-strnlen.c b/string/test-strnlen.c
> index 920f58e97b..f53e09263f 100644
> --- a/string/test-strnlen.c
> +++ b/string/test-strnlen.c
> @@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
> do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
> }
>
> +static void
> +do_overflow_tests (void)
> +{
> + size_t i, j, len;
> + const size_t one = 1;
> + uintptr_t buf_addr = (uintptr_t) buf1;
> +
> + for (i = 0; i < 750; ++i)
> + {
> + do_test (0, i, SIZE_MAX - i, BIG_CHAR);
> + do_test (0, i, i - buf_addr, BIG_CHAR);
> + do_test (0, i, -buf_addr - i, BIG_CHAR);
> + do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
> + do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
> +
> + len = 0;
> + for (j = 8 * sizeof(size_t) - 1; j ; --j)
> + {
> + len |= one << j;
> + do_test (0, i, len - i, BIG_CHAR);
> + do_test (0, i, len + i, BIG_CHAR);
> + do_test (0, i, len - buf_addr - i, BIG_CHAR);
> + do_test (0, i, len - buf_addr + i, BIG_CHAR);
> +
> + do_test (0, i, ~len - i, BIG_CHAR);
> + do_test (0, i, ~len + i, BIG_CHAR);
> + do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
> + do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
> + }
> + }
> +}
> +
> static void
> do_random_tests (void)
> {
> @@ -283,6 +315,7 @@ test_main (void)
> do_random_tests ();
> do_page_tests ();
> do_page_2_tests ();
> + do_overflow_tests ();
> return ret;
> }
>
> --
> 2.25.1
>
--
H.J.
* Re: [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat
2021-06-09 21:53 ` [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat H.J. Lu
@ 2021-06-09 22:26 ` Noah Goldstein
2021-06-22 15:43 ` Noah Goldstein
0 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-09 22:26 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Wed, Jun 9, 2021 at 5:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> On Wed, Jun 9, 2021 at 1:53 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> > This commit adds tests for a bug in the wide char variant of the
> > functions where the implementation may assume that maxlen for wcsnlen
> > or n for wmemchr/strncat will not overflow when multiplied by
> > sizeof(wchar_t).
> >
> > These tests show the following implementations failing on x86_64:
> >
> > wcsnlen-sse4_1
> > wcsnlen-avx2
> >
> > wmemchr-sse2
> > wmemchr-avx2
> >
> > strncat would fail as well if it were run on a system that preferred
> > either of the wcsnlen implementations that failed, as it relies on
> > wcsnlen.
>
> Please open a bug report for each standard C function. We need to
> track them for backporting to release branches.
>
Done: https://sourceware.org/bugzilla/show_bug.cgi?id=27974
>
> Thanks.
>
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> > string/test-memchr.c | 39 ++++++++++++++++++++++++---
> > string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
> > string/test-strnlen.c | 33 +++++++++++++++++++++++
> > 3 files changed, 130 insertions(+), 3 deletions(-)
> >
> > diff --git a/string/test-memchr.c b/string/test-memchr.c
> > index 665edc32af..ce964284aa 100644
> > --- a/string/test-memchr.c
> > +++ b/string/test-memchr.c
> > @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c,
> size_t n, CHAR *exp_res)
> > CHAR *res = CALL (impl, s, c, n);
> > if (res != exp_res)
> > {
> > - error (0, 0, "Wrong result in function %s %p %p", impl->name,
> > - res, exp_res);
> > + error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p !=
> %p",
> > + impl->name, s, c, n, res, exp_res);
> > ret = 1;
> > return;
> > }
> > @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t
> n, int seek_char)
> > }
> > buf[align + len] = 0;
> >
> > - if (pos < len)
> > + if (pos < MIN(n, len))
> > {
> > buf[align + pos] = seek_char;
> > buf[align + len] = -seek_char;
> > @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len,
> size_t n, int seek_char)
> > do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
> > }
> >
> > +static void
> > +do_overflow_tests (void)
> > +{
> > + size_t i, j, len;
> > + const size_t one = 1;
> > + uintptr_t buf_addr = (uintptr_t) buf1;
> > +
> > + for (i = 0; i < 750; ++i)
> > + {
> > + do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
> > + do_test (0, i, 751, i - buf_addr, BIG_CHAR);
> > + do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
> > + do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
> > + do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
> > +
> > + len = 0;
> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
> > + {
> > + len |= one << j;
> > + do_test (0, i, 751, len - i, BIG_CHAR);
> > + do_test (0, i, 751, len + i, BIG_CHAR);
> > + do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
> > + do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
> > +
> > + do_test (0, i, 751, ~len - i, BIG_CHAR);
> > + do_test (0, i, 751, ~len + i, BIG_CHAR);
> > + do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
> > + do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
> > + }
> > + }
> > +}
> > +
> > static void
> > do_random_tests (void)
> > {
> > @@ -221,6 +253,7 @@ test_main (void)
> > do_test (page_size / 2 - i, i, i, 1, 0x9B);
> >
> > do_random_tests ();
> > + do_overflow_tests ();
> > return ret;
> > }
> >
> > diff --git a/string/test-strncat.c b/string/test-strncat.c
> > index 2ef917b820..0ab7541d4e 100644
> > --- a/string/test-strncat.c
> > +++ b/string/test-strncat.c
> > @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1,
> size_t len2,
> > }
> > }
> >
> > +static void
> > +do_overflow_tests (void)
> > +{
> > + size_t i, j, len;
> > + const size_t one = 1;
> > + CHAR *s1, *s2;
> > + uintptr_t s1_addr;
> > + s1 = (CHAR *) buf1;
> > + s2 = (CHAR *) buf2;
> > + s1_addr = (uintptr_t)s1;
> > + for (j = 0; j < 200; ++j)
> > + s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
> > + s2[200] = 0;
> > + for (i = 0; i < 750; ++i) {
> > + for (j = 0; j < i; ++j)
> > + s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
> > + s1[i] = '\0';
> > +
> > + FOR_EACH_IMPL (impl, 0)
> > + {
> > + s2[0] = '\0';
> > + do_one_test (impl, s2, s1, SIZE_MAX - i);
> > + s2[0] = '\0';
> > + do_one_test (impl, s2, s1, i - s1_addr);
> > + s2[0] = '\0';
> > + do_one_test (impl, s2, s1, -s1_addr - i);
> > + s2[0] = '\0';
> > + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
> > + s2[0] = '\0';
> > + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
> > + }
> > +
> > + len = 0;
> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
> > + {
> > + len |= one << j;
> > + FOR_EACH_IMPL (impl, 0)
> > + {
> > + s2[0] = '\0';
> > + do_one_test (impl, s2, s1, len - i);
> > + s2[0] = '\0';
> > + do_one_test (impl, s2, s1, len + i);
> > + s2[0] = '\0';
> > + do_one_test (impl, s2, s1, len - s1_addr - i);
> > + s2[0] = '\0';
> > + do_one_test (impl, s2, s1, len - s1_addr + i);
> > +
> > + s2[0] = '\0';
> > + do_one_test (impl, s2, s1, ~len - i);
> > + s2[0] = '\0';
> > + do_one_test (impl, s2, s1, ~len + i);
> > + s2[0] = '\0';
> > + do_one_test (impl, s2, s1, ~len - s1_addr - i);
> > + s2[0] = '\0';
> > + do_one_test (impl, s2, s1, ~len - s1_addr + i);
> > + }
> > + }
> > + }
> > +}
> > +
> > static void
> > do_random_tests (void)
> > {
> > @@ -316,6 +376,7 @@ test_main (void)
> > }
> >
> > do_random_tests ();
> > + do_overflow_tests ();
> > return ret;
> > }
> >
> > diff --git a/string/test-strnlen.c b/string/test-strnlen.c
> > index 920f58e97b..f53e09263f 100644
> > --- a/string/test-strnlen.c
> > +++ b/string/test-strnlen.c
> > @@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int
> max_char)
> > do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len,
> maxlen));
> > }
> >
> > +static void
> > +do_overflow_tests (void)
> > +{
> > + size_t i, j, len;
> > + const size_t one = 1;
> > + uintptr_t buf_addr = (uintptr_t) buf1;
> > +
> > + for (i = 0; i < 750; ++i)
> > + {
> > + do_test (0, i, SIZE_MAX - i, BIG_CHAR);
> > + do_test (0, i, i - buf_addr, BIG_CHAR);
> > + do_test (0, i, -buf_addr - i, BIG_CHAR);
> > + do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
> > + do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
> > +
> > + len = 0;
> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
> > + {
> > + len |= one << j;
> > + do_test (0, i, len - i, BIG_CHAR);
> > + do_test (0, i, len + i, BIG_CHAR);
> > + do_test (0, i, len - buf_addr - i, BIG_CHAR);
> > + do_test (0, i, len - buf_addr + i, BIG_CHAR);
> > +
> > + do_test (0, i, ~len - i, BIG_CHAR);
> > + do_test (0, i, ~len + i, BIG_CHAR);
> > + do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
> > + do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
> > + }
> > + }
> > +}
> > +
> > static void
> > do_random_tests (void)
> > {
> > @@ -283,6 +315,7 @@ test_main (void)
> > do_random_tests ();
> > do_page_tests ();
> > do_page_2_tests ();
> > + do_overflow_tests ();
> > return ret;
> > }
> >
> > --
> > 2.25.1
> >
>
>
> --
> H.J.
>
* Re: [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat
2021-06-09 22:26 ` Noah Goldstein
@ 2021-06-22 15:43 ` Noah Goldstein
2021-06-22 16:18 ` H.J. Lu
0 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-22 15:43 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Wed, Jun 9, 2021 at 6:26 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:
>
>
> On Wed, Jun 9, 2021 at 5:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
>> On Wed, Jun 9, 2021 at 1:53 PM Noah Goldstein <goldstein.w.n@gmail.com>
>> wrote:
>> >
>> > This commit adds tests for a bug in the wide char variant of the
>> > functions where the implementation may assume that maxlen for wcsnlen
>> > or n for wmemchr/strncat will not overflow when multiplied by
>> > sizeof(wchar_t).
>> >
>> > These tests show the following implementations failing on x86_64:
>> >
>> > wcsnlen-sse4_1
>> > wcsnlen-avx2
>> >
>> > wmemchr-sse2
>> > wmemchr-avx2
>> >
>> > strncat would fail as well if it were run on a system that preferred
>> > either of the wcsnlen implementations that failed, as it relies on
>> > wcsnlen.
>>
>> Please open a bug report for each standard C function. We need to
>> track them for backporting to release branches.
>>
>
> Done: https://sourceware.org/bugzilla/show_bug.cgi?id=27974
>
>
>>
>> Thanks.
>>
>> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
>> > ---
>> > string/test-memchr.c | 39 ++++++++++++++++++++++++---
>> > string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
>> > string/test-strnlen.c | 33 +++++++++++++++++++++++
>> > 3 files changed, 130 insertions(+), 3 deletions(-)
>> >
>> > diff --git a/string/test-memchr.c b/string/test-memchr.c
>> > index 665edc32af..ce964284aa 100644
>> > --- a/string/test-memchr.c
>> > +++ b/string/test-memchr.c
>> > @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c,
>> size_t n, CHAR *exp_res)
>> > CHAR *res = CALL (impl, s, c, n);
>> > if (res != exp_res)
>> > {
>> > - error (0, 0, "Wrong result in function %s %p %p", impl->name,
>> > - res, exp_res);
>> > + error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p !=
>> %p",
>> > + impl->name, s, c, n, res, exp_res);
>> > ret = 1;
>> > return;
>> > }
>> > @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t
>> n, int seek_char)
>> > }
>> > buf[align + len] = 0;
>> >
>> > - if (pos < len)
>> > + if (pos < MIN(n, len))
>> > {
>> > buf[align + pos] = seek_char;
>> > buf[align + len] = -seek_char;
>> > @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len,
>> size_t n, int seek_char)
>> > do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
>> > }
>> >
>> > +static void
>> > +do_overflow_tests (void)
>> > +{
>> > + size_t i, j, len;
>> > + const size_t one = 1;
>> > + uintptr_t buf_addr = (uintptr_t) buf1;
>> > +
>> > + for (i = 0; i < 750; ++i)
>> > + {
>> > + do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
>> > + do_test (0, i, 751, i - buf_addr, BIG_CHAR);
>> > + do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
>> > +
>> > + len = 0;
>> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
>> > + {
>> > + len |= one << j;
>> > + do_test (0, i, 751, len - i, BIG_CHAR);
>> > + do_test (0, i, 751, len + i, BIG_CHAR);
>> > + do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
>> > +
>> > + do_test (0, i, 751, ~len - i, BIG_CHAR);
>> > + do_test (0, i, 751, ~len + i, BIG_CHAR);
>> > + do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
>> > + }
>> > + }
>> > +}
>> > +
>> > static void
>> > do_random_tests (void)
>> > {
>> > @@ -221,6 +253,7 @@ test_main (void)
>> > do_test (page_size / 2 - i, i, i, 1, 0x9B);
>> >
>> > do_random_tests ();
>> > + do_overflow_tests ();
>> > return ret;
>> > }
>> >
>> > diff --git a/string/test-strncat.c b/string/test-strncat.c
>> > index 2ef917b820..0ab7541d4e 100644
>> > --- a/string/test-strncat.c
>> > +++ b/string/test-strncat.c
>> > @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t
>> len1, size_t len2,
>> > }
>> > }
>> >
>> > +static void
>> > +do_overflow_tests (void)
>> > +{
>> > + size_t i, j, len;
>> > + const size_t one = 1;
>> > + CHAR *s1, *s2;
>> > + uintptr_t s1_addr;
>> > + s1 = (CHAR *) buf1;
>> > + s2 = (CHAR *) buf2;
>> > + s1_addr = (uintptr_t)s1;
>> > + for (j = 0; j < 200; ++j)
>> > + s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
>> > + s2[200] = 0;
>> > + for (i = 0; i < 750; ++i) {
>> > + for (j = 0; j < i; ++j)
>> > + s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
>> > + s1[i] = '\0';
>> > +
>> > + FOR_EACH_IMPL (impl, 0)
>> > + {
>> > + s2[0] = '\0';
>> > + do_one_test (impl, s2, s1, SIZE_MAX - i);
>> > + s2[0] = '\0';
>> > + do_one_test (impl, s2, s1, i - s1_addr);
>> > + s2[0] = '\0';
>> > + do_one_test (impl, s2, s1, -s1_addr - i);
>> > + s2[0] = '\0';
>> > + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
>> > + s2[0] = '\0';
>> > + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
>> > + }
>> > +
>> > + len = 0;
>> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
>> > + {
>> > + len |= one << j;
>> > + FOR_EACH_IMPL (impl, 0)
>> > + {
>> > + s2[0] = '\0';
>> > + do_one_test (impl, s2, s1, len - i);
>> > + s2[0] = '\0';
>> > + do_one_test (impl, s2, s1, len + i);
>> > + s2[0] = '\0';
>> > + do_one_test (impl, s2, s1, len - s1_addr - i);
>> > + s2[0] = '\0';
>> > + do_one_test (impl, s2, s1, len - s1_addr + i);
>> > +
>> > + s2[0] = '\0';
>> > + do_one_test (impl, s2, s1, ~len - i);
>> > + s2[0] = '\0';
>> > + do_one_test (impl, s2, s1, ~len + i);
>> > + s2[0] = '\0';
>> > + do_one_test (impl, s2, s1, ~len - s1_addr - i);
>> > + s2[0] = '\0';
>> > + do_one_test (impl, s2, s1, ~len - s1_addr + i);
>> > + }
>> > + }
>> > + }
>> > +}
>> > +
>> > static void
>> > do_random_tests (void)
>> > {
>> > @@ -316,6 +376,7 @@ test_main (void)
>> > }
>> >
>> > do_random_tests ();
>> > + do_overflow_tests ();
>> > return ret;
>> > }
>> >
>> > diff --git a/string/test-strnlen.c b/string/test-strnlen.c
>> > index 920f58e97b..f53e09263f 100644
>> > --- a/string/test-strnlen.c
>> > +++ b/string/test-strnlen.c
>> > @@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen,
>> int max_char)
>> > do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len,
>> maxlen));
>> > }
>> >
>> > +static void
>> > +do_overflow_tests (void)
>> > +{
>> > + size_t i, j, len;
>> > + const size_t one = 1;
>> > + uintptr_t buf_addr = (uintptr_t) buf1;
>> > +
>> > + for (i = 0; i < 750; ++i)
>> > + {
>> > + do_test (0, i, SIZE_MAX - i, BIG_CHAR);
>> > + do_test (0, i, i - buf_addr, BIG_CHAR);
>> > + do_test (0, i, -buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
>> > +
>> > + len = 0;
>> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
>> > + {
>> > + len |= one << j;
>> > + do_test (0, i, len - i, BIG_CHAR);
>> > + do_test (0, i, len + i, BIG_CHAR);
>> > + do_test (0, i, len - buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, len - buf_addr + i, BIG_CHAR);
>> > +
>> > + do_test (0, i, ~len - i, BIG_CHAR);
>> > + do_test (0, i, ~len + i, BIG_CHAR);
>> > + do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
>> > + }
>> > + }
>> > +}
>> > +
>> > static void
>> > do_random_tests (void)
>> > {
>> > @@ -283,6 +315,7 @@ test_main (void)
>> > do_random_tests ();
>> > do_page_tests ();
>> > do_page_2_tests ();
>> > + do_overflow_tests ();
>> > return ret;
>> > }
>> >
>> > --
>> > 2.25.1
>> >
>>
>>
>> --
>> H.J.
>>
>
Ping if we want this in 2.34
* Re: [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat
2021-06-22 15:43 ` Noah Goldstein
@ 2021-06-22 16:18 ` H.J. Lu
2021-06-22 18:23 ` Noah Goldstein
0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2021-06-22 16:18 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jun 22, 2021 at 8:43 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
> On Wed, Jun 9, 2021 at 6:26 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>>
>>
>> On Wed, Jun 9, 2021 at 5:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>>
>>> On Wed, Jun 9, 2021 at 1:53 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>> >
>>> > This commit adds tests for a bug in the wide char variant of the
>>> > functions where the implementation may assume that maxlen for wcsnlen
>>> > or n for wmemchr/strncat will not overflow when multiplied by
>>> > sizeof(wchar_t).
>>> >
>>> > These tests show the following implementations failing on x86_64:
>>> >
>>> > wcsnlen-sse4_1
>>> > wcsnlen-avx2
>>> >
>>> > wmemchr-sse2
>>> > wmemchr-avx2
>>> >
>>> > strncat would fail as well if it were run on a system that preferred
>>> > either of the wcsnlen implementations that failed, as it relies on
>>> > wcsnlen.
>>>
>>> Please open a bug report for each standard C function. We need to
>>> track them for backporting to release branches.
>>
>>
>> Done: https://sourceware.org/bugzilla/show_bug.cgi?id=27974
>>
>>>
>>>
>>> Thanks.
>>>
>>> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
>>> > ---
>>> > string/test-memchr.c | 39 ++++++++++++++++++++++++---
>>> > string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
>>> > string/test-strnlen.c | 33 +++++++++++++++++++++++
>>> > 3 files changed, 130 insertions(+), 3 deletions(-)
>>> >
>>> > diff --git a/string/test-memchr.c b/string/test-memchr.c
>>> > index 665edc32af..ce964284aa 100644
>>> > --- a/string/test-memchr.c
>>> > +++ b/string/test-memchr.c
>>> > @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
>>> > CHAR *res = CALL (impl, s, c, n);
>>> > if (res != exp_res)
>>> > {
>>> > - error (0, 0, "Wrong result in function %s %p %p", impl->name,
>>> > - res, exp_res);
>>> > + error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
>>> > + impl->name, s, c, n, res, exp_res);
>>> > ret = 1;
>>> > return;
>>> > }
>>> > @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
>>> > }
>>> > buf[align + len] = 0;
>>> >
>>> > - if (pos < len)
>>> > + if (pos < MIN(n, len))
>>> > {
>>> > buf[align + pos] = seek_char;
>>> > buf[align + len] = -seek_char;
>>> > @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
>>> > do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
>>> > }
>>> >
>>> > +static void
>>> > +do_overflow_tests (void)
>>> > +{
>>> > + size_t i, j, len;
>>> > + const size_t one = 1;
>>> > + uintptr_t buf_addr = (uintptr_t) buf1;
>>> > +
>>> > + for (i = 0; i < 750; ++i)
>>> > + {
>>> > + do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
>>> > + do_test (0, i, 751, i - buf_addr, BIG_CHAR);
>>> > + do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
>>> > + do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
>>> > + do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
>>> > +
>>> > + len = 0;
>>> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
>>> > + {
>>> > + len |= one << j;
>>> > + do_test (0, i, 751, len - i, BIG_CHAR);
>>> > + do_test (0, i, 751, len + i, BIG_CHAR);
>>> > + do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
>>> > + do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
>>> > +
>>> > + do_test (0, i, 751, ~len - i, BIG_CHAR);
>>> > + do_test (0, i, 751, ~len + i, BIG_CHAR);
>>> > + do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
>>> > + do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
>>> > + }
>>> > + }
>>> > +}
>>> > +
>>> > static void
>>> > do_random_tests (void)
>>> > {
>>> > @@ -221,6 +253,7 @@ test_main (void)
>>> > do_test (page_size / 2 - i, i, i, 1, 0x9B);
>>> >
>>> > do_random_tests ();
>>> > + do_overflow_tests ();
>>> > return ret;
>>> > }
>>> >
>>> > diff --git a/string/test-strncat.c b/string/test-strncat.c
>>> > index 2ef917b820..0ab7541d4e 100644
>>> > --- a/string/test-strncat.c
>>> > +++ b/string/test-strncat.c
>>> > @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
>>> > }
>>> > }
>>> >
>>> > +static void
>>> > +do_overflow_tests (void)
>>> > +{
>>> > + size_t i, j, len;
>>> > + const size_t one = 1;
>>> > + CHAR *s1, *s2;
>>> > + uintptr_t s1_addr;
>>> > + s1 = (CHAR *) buf1;
>>> > + s2 = (CHAR *) buf2;
>>> > + s1_addr = (uintptr_t)s1;
>>> > + for (j = 0; j < 200; ++j)
>>> > + s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
>>> > + s2[200] = 0;
>>> > + for (i = 0; i < 750; ++i) {
>>> > + for (j = 0; j < i; ++j)
>>> > + s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
>>> > + s1[i] = '\0';
>>> > +
>>> > + FOR_EACH_IMPL (impl, 0)
>>> > + {
>>> > + s2[0] = '\0';
>>> > + do_one_test (impl, s2, s1, SIZE_MAX - i);
>>> > + s2[0] = '\0';
>>> > + do_one_test (impl, s2, s1, i - s1_addr);
>>> > + s2[0] = '\0';
>>> > + do_one_test (impl, s2, s1, -s1_addr - i);
>>> > + s2[0] = '\0';
>>> > + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
>>> > + s2[0] = '\0';
>>> > + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
>>> > + }
>>> > +
>>> > + len = 0;
>>> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
>>> > + {
>>> > + len |= one << j;
>>> > + FOR_EACH_IMPL (impl, 0)
>>> > + {
>>> > + s2[0] = '\0';
>>> > + do_one_test (impl, s2, s1, len - i);
>>> > + s2[0] = '\0';
>>> > + do_one_test (impl, s2, s1, len + i);
>>> > + s2[0] = '\0';
>>> > + do_one_test (impl, s2, s1, len - s1_addr - i);
>>> > + s2[0] = '\0';
>>> > + do_one_test (impl, s2, s1, len - s1_addr + i);
>>> > +
>>> > + s2[0] = '\0';
>>> > + do_one_test (impl, s2, s1, ~len - i);
>>> > + s2[0] = '\0';
>>> > + do_one_test (impl, s2, s1, ~len + i);
>>> > + s2[0] = '\0';
>>> > + do_one_test (impl, s2, s1, ~len - s1_addr - i);
>>> > + s2[0] = '\0';
>>> > + do_one_test (impl, s2, s1, ~len - s1_addr + i);
>>> > + }
>>> > + }
>>> > + }
>>> > +}
>>> > +
>>> > static void
>>> > do_random_tests (void)
>>> > {
>>> > @@ -316,6 +376,7 @@ test_main (void)
>>> > }
>>> >
>>> > do_random_tests ();
>>> > + do_overflow_tests ();
>>> > return ret;
>>> > }
>>> >
>>> > diff --git a/string/test-strnlen.c b/string/test-strnlen.c
>>> > index 920f58e97b..f53e09263f 100644
>>> > --- a/string/test-strnlen.c
>>> > +++ b/string/test-strnlen.c
>>> > @@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
>>> > do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
>>> > }
>>> >
>>> > +static void
>>> > +do_overflow_tests (void)
>>> > +{
>>> > + size_t i, j, len;
>>> > + const size_t one = 1;
>>> > + uintptr_t buf_addr = (uintptr_t) buf1;
>>> > +
>>> > + for (i = 0; i < 750; ++i)
>>> > + {
>>> > + do_test (0, i, SIZE_MAX - i, BIG_CHAR);
>>> > + do_test (0, i, i - buf_addr, BIG_CHAR);
>>> > + do_test (0, i, -buf_addr - i, BIG_CHAR);
>>> > + do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
>>> > + do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
>>> > +
>>> > + len = 0;
>>> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
>>> > + {
>>> > + len |= one << j;
>>> > + do_test (0, i, len - i, BIG_CHAR);
>>> > + do_test (0, i, len + i, BIG_CHAR);
>>> > + do_test (0, i, len - buf_addr - i, BIG_CHAR);
>>> > + do_test (0, i, len - buf_addr + i, BIG_CHAR);
>>> > +
>>> > + do_test (0, i, ~len - i, BIG_CHAR);
>>> > + do_test (0, i, ~len + i, BIG_CHAR);
>>> > + do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
>>> > + do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
>>> > + }
>>> > + }
>>> > +}
>>> > +
>>> > static void
>>> > do_random_tests (void)
>>> > {
>>> > @@ -283,6 +315,7 @@ test_main (void)
>>> > do_random_tests ();
>>> > do_page_tests ();
>>> > do_page_2_tests ();
>>> > + do_overflow_tests ();
>>> > return ret;
>>> > }
>>> >
>>> > --
>>> > 2.25.1
>>> >
>>>
>>>
>>> --
>>> H.J.
>
>
> Ping if we want this in 2.34
Can you repost the patches with BZ# in the commit log?
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974]
2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
` (2 preceding siblings ...)
2021-06-09 21:53 ` [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat H.J. Lu
@ 2021-06-22 18:11 ` Noah Goldstein
2021-06-22 21:24 ` H.J. Lu
2021-06-22 18:11 ` [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
` (4 subsequent siblings)
8 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-22 18:11 UTC (permalink / raw)
To: libc-alpha
This commit adds tests for a bug in the wide character variants of
the functions, where the implementation may assume that maxlen for
wcsnlen or n for wmemchr/strncat will not overflow when multiplied by
sizeof(wchar_t).
These tests show the following implementations failing on x86_64:
wcsnlen-sse4_1
wcsnlen-avx2
wmemchr-sse2
wmemchr-avx2
strncat would fail as well if it were run on a system that preferred
either of the failing wcsnlen implementations, as it relies on
wcsnlen.
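For reference, the overflow is easiest to see with a maxlen just above
2^62 on a 64-bit target, where scaling by sizeof(wchar_t) wraps the
byte count around. A minimal sketch of the failure mode (illustrative
only, not taken from the test suite):
#include <stddef.h>
#include <wchar.h>
int
main (void)
{
  wchar_t buf[4] = { L'a', L'b', L'c', L'\0' };
  /* Any maxlen is valid here since the terminator is reachable.  On
     LP64, ((1UL << 62) + 1) * sizeof (wchar_t) == 2^64 + 4, which
     wraps to 4, so an implementation that pre-scales maxlen behaves
     as if only one wchar_t may be examined and returns 1, not 3.  */
  size_t maxlen = ((size_t) 1 << 62) + 1;
  return wcsnlen (buf, maxlen) == 3 ? 0 : 1;
}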
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
Some notes:
I only tested this patch (and the subsequent fixes) on a machine that
prefers EVEX.
The fix for wcsnlen-sse2 is possibly invalid. What it does is check
whether the computation maxlen * sizeof(wchar_t) + s overflows, and if
so it just calls wcslen. The rationale is that either the end of the
string will be found in readable memory, or the user invoked undefined
behavior by calling wcsnlen on a string that is not contained in valid
memory and without a maxlen that bounds the access to valid memory.
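In C terms the check amounts to something like the following sketch
(hypothetical helper; the real fix is in assembly in
sysdeps/x86_64/strlen.S):
#include <stdint.h>
#include <stddef.h>
#include <wchar.h>
size_t
wcsnlen_with_fallback (const wchar_t *s, size_t maxlen)
{
  uintptr_t start = (uintptr_t) s;
  /* If the scaled length or the end-pointer computation would wrap,
     any well-defined call must have its terminator in readable
     memory, so an unbounded scan is safe and gives the same result.  */
  if (maxlen > SIZE_MAX / sizeof (wchar_t)
      || start + maxlen * sizeof (wchar_t) < start)
    return wcslen (s);
  return wcsnlen (s, maxlen);  /* normal bounded scan */
}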
string/test-memchr.c | 39 ++++++++++++++++++++++++---
string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
string/test-strnlen.c | 33 +++++++++++++++++++++++
3 files changed, 130 insertions(+), 3 deletions(-)
diff --git a/string/test-memchr.c b/string/test-memchr.c
index 665edc32af..ce964284aa 100644
--- a/string/test-memchr.c
+++ b/string/test-memchr.c
@@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
CHAR *res = CALL (impl, s, c, n);
if (res != exp_res)
{
- error (0, 0, "Wrong result in function %s %p %p", impl->name,
- res, exp_res);
+ error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
+ impl->name, s, c, n, res, exp_res);
ret = 1;
return;
}
@@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
}
buf[align + len] = 0;
- if (pos < len)
+ if (pos < MIN(n, len))
{
buf[align + pos] = seek_char;
buf[align + len] = -seek_char;
@@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
}
+static void
+do_overflow_tests (void)
+{
+ size_t i, j, len;
+ const size_t one = 1;
+ uintptr_t buf_addr = (uintptr_t) buf1;
+
+ for (i = 0; i < 750; ++i)
+ {
+ do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
+ do_test (0, i, 751, i - buf_addr, BIG_CHAR);
+ do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
+ do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
+ do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+ len = 0;
+ for (j = 8 * sizeof(size_t) - 1; j ; --j)
+ {
+ len |= one << j;
+ do_test (0, i, 751, len - i, BIG_CHAR);
+ do_test (0, i, 751, len + i, BIG_CHAR);
+ do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
+ do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
+
+ do_test (0, i, 751, ~len - i, BIG_CHAR);
+ do_test (0, i, 751, ~len + i, BIG_CHAR);
+ do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
+ do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
+ }
+ }
+}
+
static void
do_random_tests (void)
{
@@ -221,6 +253,7 @@ test_main (void)
do_test (page_size / 2 - i, i, i, 1, 0x9B);
do_random_tests ();
+ do_overflow_tests ();
return ret;
}
diff --git a/string/test-strncat.c b/string/test-strncat.c
index 2ef917b820..37ea26ea05 100644
--- a/string/test-strncat.c
+++ b/string/test-strncat.c
@@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
}
}
+static void
+do_overflow_tests (void)
+{
+ size_t i, j, len;
+ const size_t one = 1;
+ CHAR *s1, *s2;
+ uintptr_t s1_addr;
+ s1 = (CHAR *) buf1;
+ s2 = (CHAR *) buf2;
+ s1_addr = (uintptr_t)s1;
+ for (j = 0; j < 200; ++j)
+ s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
+ s2[200] = 0;
+ for (i = 0; i < 750; ++i) {
+ for (j = 0; j < i; ++j)
+ s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
+ s1[i] = '\0';
+
+ FOR_EACH_IMPL (impl, 0)
+ {
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, SIZE_MAX - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, i - s1_addr);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, -s1_addr - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
+ }
+
+ len = 0;
+ for (j = 8 * sizeof(size_t) - 1; j ; --j)
+ {
+ len |= one << j;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, len - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, len + i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, len - s1_addr - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, len - s1_addr + i);
+
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, ~len - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, ~len + i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, ~len - s1_addr - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, ~len - s1_addr + i);
+ }
+ }
+ }
+}
+
static void
do_random_tests (void)
{
@@ -316,6 +376,7 @@ test_main (void)
}
do_random_tests ();
+ do_overflow_tests ();
return ret;
}
diff --git a/string/test-strnlen.c b/string/test-strnlen.c
index 920f58e97b..f53e09263f 100644
--- a/string/test-strnlen.c
+++ b/string/test-strnlen.c
@@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
}
+static void
+do_overflow_tests (void)
+{
+ size_t i, j, len;
+ const size_t one = 1;
+ uintptr_t buf_addr = (uintptr_t) buf1;
+
+ for (i = 0; i < 750; ++i)
+ {
+ do_test (0, i, SIZE_MAX - i, BIG_CHAR);
+ do_test (0, i, i - buf_addr, BIG_CHAR);
+ do_test (0, i, -buf_addr - i, BIG_CHAR);
+ do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
+ do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+ len = 0;
+ for (j = 8 * sizeof(size_t) - 1; j ; --j)
+ {
+ len |= one << j;
+ do_test (0, i, len - i, BIG_CHAR);
+ do_test (0, i, len + i, BIG_CHAR);
+ do_test (0, i, len - buf_addr - i, BIG_CHAR);
+ do_test (0, i, len - buf_addr + i, BIG_CHAR);
+
+ do_test (0, i, ~len - i, BIG_CHAR);
+ do_test (0, i, ~len + i, BIG_CHAR);
+ do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
+ do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
+ }
+ }
+}
+
static void
do_random_tests (void)
{
@@ -283,6 +315,7 @@ test_main (void)
do_random_tests ();
do_page_tests ();
do_page_2_tests ();
+ do_overflow_tests ();
return ret;
}
--
2.25.1
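A note on the generator above: the inner loop grows a mask downward
from the top bit, so len takes the values 0x8000..., 0xc000...,
0xe000..., and so on, while ~len supplies the 2^k - 1 patterns; each
is then perturbed by +/- i and the buffer address to straddle the
wraparound boundaries. A tiny illustration of the first iterations on
a 64-bit target (my own sketch):
#include <stdio.h>
#include <stddef.h>
int
main (void)
{
  const size_t one = 1;
  size_t len = 0;
  /* Mirrors do_overflow_tests: j runs 63, 62, 61, ... on LP64.  */
  for (size_t j = 8 * sizeof (size_t) - 1; j > 60; --j)
    {
      len |= one << j;
      printf ("len=%#zx ~len=%#zx\n", len, (size_t) ~len);
    }
  return 0;
}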
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ #27974]
2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
` (3 preceding siblings ...)
2021-06-22 18:11 ` [PATCH v2 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974] Noah Goldstein
@ 2021-06-22 18:11 ` Noah Goldstein
2021-06-22 21:24 ` H.J. Lu
2021-06-22 18:11 ` [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
` (3 subsequent siblings)
8 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-22 18:11 UTC (permalink / raw)
To: libc-alpha
This commit fixes the bug mentioned in the previous commit.
The previous implementations of wmemchr in these files relied
on n * sizeof(wchar_t) not overflowing, which is not guaranteed
by the standard.
The new overflow tests added in the previous commit now
pass (as well as all the other tests).
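In outline, the fix removes the up-front shl $2, %RDX_LP and keeps n
in wchar_t units end to end; only small, in-range byte quantities are
converted between units. A rough C analogue of one such adjustment (a
sketch with a hypothetical helper, not the vectorized code):
#include <stdint.h>
#include <stddef.h>
#define VEC_SIZE 16  /* SSE2 vector width in bytes */
/* When the scan pointer is rounded down to a vector boundary, the
   bytes skipped over are converted to characters first (the new
   `shr $2, %ecx' in the asm) and added back to the character count,
   so n itself is never multiplied by sizeof (wchar_t).  */
static size_t
readjust_after_align (const wchar_t *s, size_t n)
{
  size_t misalign_bytes = (uintptr_t) s & (VEC_SIZE - 1);
  return n + misalign_bytes / sizeof (wchar_t);
}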
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++-------
sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
2 files changed, 98 insertions(+), 37 deletions(-)
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index beff2708de..3ddc4655cf 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
#ifdef USE_AS_WMEMCHR
# define MEMCHR wmemchr
# define PCMPEQ pcmpeqd
+# define CHAR_PER_VEC 4
#else
# define MEMCHR memchr
# define PCMPEQ pcmpeqb
+# define CHAR_PER_VEC 16
#endif
/* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
movd %esi, %xmm1
mov %edi, %ecx
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+#endif
#ifdef USE_AS_WMEMCHR
test %RDX_LP, %RDX_LP
jz L(return_null)
- shl $2, %RDX_LP
#else
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
-# endif
punpcklbw %xmm1, %xmm1
test %RDX_LP, %RDX_LP
jz L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
test %eax, %eax
jnz L(matches_1)
- sub $16, %rdx
+ sub $CHAR_PER_VEC, %rdx
jbe L(return_null)
add $16, %rdi
and $15, %ecx
and $-16, %rdi
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
jmp L(loop_prolog)
@@ -77,16 +81,21 @@ L(crosscache):
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
-/* Check if there is a match. */
+ /* Check if there is a match. */
pmovmskb %xmm0, %eax
-/* Remove the leading bytes. */
+ /* Remove the leading bytes. */
sar %cl, %eax
test %eax, %eax
je L(unaligned_no_match)
-/* Check which byte is a match. */
+ /* Check which byte is a match. */
bsf %eax, %eax
-
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
add %rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):
.p2align 4
L(unaligned_no_match):
- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
+ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
"rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
possible addition overflow. */
neg %rcx
add $16, %rcx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
sub %rcx, %rdx
jbe L(return_null)
add $16, %rdi
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
test $0x3f, %rdi
jz L(align64_loop)
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
mov %rdi, %rcx
and $-64, %rdi
and $63, %ecx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
.p2align 4
L(align64_loop):
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
movdqa 16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):
.p2align 4
L(exit_loop):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
jle L(exit_loop_32)
movdqa (%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jle L(return_null)
PCMPEQ 48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):
.p2align 4
L(exit_loop_32):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jbe L(return_null)
PCMPEQ 16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
.p2align 4
L(matches_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
ret
@@ -301,7 +322,13 @@ L(matches_1):
.p2align 4
L(matches16_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 16(%rdi, %rax), %rax
ret
@@ -309,7 +336,13 @@ L(matches16_1):
.p2align 4
L(matches32_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 32(%rdi, %rax), %rax
ret
@@ -317,7 +350,13 @@ L(matches32_1):
.p2align 4
L(matches48_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 48(%rdi, %rax), %rax
ret
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 0d8758e3e7..afdb956502 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -54,21 +54,19 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
- test %RDX_LP, %RDX_LP
- jz L(null)
-# endif
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
+ /* Clear upper bits. */
+ and %RDX_LP, %RDX_LP
+# else
+ test %RDX_LP, %RDX_LP
# endif
+ jz L(null)
# endif
/* Broadcast CHAR to YMMMATCH. */
vmovd %esi, %xmm0
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
/* If length < CHAR_PER_VEC handle special. */
- cmpq $VEC_SIZE, %rdx
+ cmpq $CHAR_PER_VEC, %rdx
jbe L(first_vec_x0)
# endif
testl %eax, %eax
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
L(first_vec_x0):
/* Check if first match was before length. */
tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
xorl %ecx, %ecx
cmpl %eax, %edx
leaq (%rdi, %rax), %rax
@@ -110,12 +112,12 @@ L(null):
# endif
.p2align 4
L(cross_page_boundary):
- /* Save pointer before aligning as its original value is necessary
- for computing the return address if byte is found or adjusting length
- if it is not and this is memchr. */
+ /* Save pointer before aligning as its original value is
+ necessary for computing the return address if byte is found or
+ adjusting length if it is not and this is memchr. */
movq %rdi, %rcx
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
- rdi for rawmemchr. */
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+ and rdi for rawmemchr. */
orq $(VEC_SIZE - 1), %ALGN_PTR_REG
VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
@@ -124,6 +126,10 @@ L(cross_page_boundary):
match). */
leaq 1(%ALGN_PTR_REG), %rsi
subq %RRAW_PTR_REG, %rsi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %esi
+# endif
# endif
/* Remove the leading bytes. */
sarxl %ERAW_PTR_REG, %eax, %eax
@@ -181,6 +187,10 @@ L(cross_page_continue):
orq $(VEC_SIZE - 1), %rdi
/* esi is for adjusting length to see if near the end. */
leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
# else
orq $(VEC_SIZE - 1), %rdi
L(cross_page_continue):
@@ -213,7 +223,7 @@ L(cross_page_continue):
# ifndef USE_AS_RAWMEMCHR
/* Check if at last VEC_SIZE * 4 length. */
- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
jbe L(last_4x_vec_or_less_cmpeq)
/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
length. */
@@ -221,6 +231,10 @@ L(cross_page_continue):
movl %edi, %ecx
orq $(VEC_SIZE * 4 - 1), %rdi
andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
addq %rcx, %rdx
# else
/* Align data to VEC_SIZE * 4 - 1 for loop. */
@@ -250,15 +264,19 @@ L(loop_4x_vec):
subq $-(VEC_SIZE * 4), %rdi
- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_4x_vec)
- /* Fall through into less than 4 remaining vectors of length case.
- */
+ /* Fall through into less than 4 remaining vectors of length
+ case. */
VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
.p2align 4
L(last_4x_vec_or_less):
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
/* Check if first VEC contained match. */
testl %eax, %eax
jnz L(first_vec_x1_check)
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
L(last_4x_vec_or_less_cmpeq):
VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
subq $-(VEC_SIZE * 4), %rdi
/* Check first VEC regardless. */
testl %eax, %eax
--
2.25.1
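The per-match bookkeeping above follows a single pattern: bsf yields a
byte offset within the vector, which is small, so it can safely be
divided down to a character offset before being checked against the
remaining character count. In C terms (a sketch with hypothetical
names):
#include <stddef.h>
static const wchar_t *
bounded_match (const wchar_t *p, unsigned int byte_idx, size_t remaining)
{
  /* byte_idx comes from bsf; remaining is the wchar_t count left.
     The `shr $2' in the asm is this division.  */
  size_t char_idx = byte_idx / sizeof (wchar_t);
  if (char_idx >= remaining)
    return NULL;  /* match lies beyond the n limit */
  return (const wchar_t *) ((const char *) p + byte_idx);
}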
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
` (4 preceding siblings ...)
2021-06-22 18:11 ` [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
@ 2021-06-22 18:11 ` Noah Goldstein
2021-06-22 21:33 ` H.J. Lu
2021-06-23 6:31 ` [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat " Noah Goldstein
` (2 subsequent siblings)
8 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-22 18:11 UTC (permalink / raw)
To: libc-alpha
This commit fixes the bug mentioned in the previous commit.
The previous implementations of wcsnlen in these files relied
on maxlen * sizeof(wchar_t) not overflowing, which is not guaranteed
by the standard.
The new overflow tests added in the previous commit now
pass (as well as all the other tests).
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
sysdeps/x86_64/strlen.S | 14 ++-
2 files changed, 106 insertions(+), 38 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index bd2e6ee44a..b282a75613 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -44,21 +44,21 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
/* Check zero length. */
+# ifdef __ILP32__
+ /* Clear upper bits. */
+ and %RSI_LP, %RSI_LP
+# else
test %RSI_LP, %RSI_LP
+# endif
jz L(zero)
/* Store max len in R8_LP before adjusting if using WCSLEN. */
mov %RSI_LP, %R8_LP
-# ifdef USE_AS_WCSLEN
- shl $2, %RSI_LP
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- movl %esi, %esi
-# endif
# endif
movl %edi, %eax
movq %rdi, %rdx
@@ -72,10 +72,10 @@ ENTRY (STRLEN)
/* Check the first VEC_SIZE bytes. */
VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
# ifdef USE_AS_STRNLEN
/* If length < VEC_SIZE handle special. */
- cmpq $VEC_SIZE, %rsi
+ cmpq $CHAR_PER_VEC, %rsi
jbe L(first_vec_x0)
# endif
/* If empty continue to aligned_more. Otherwise return bit
@@ -84,6 +84,7 @@ ENTRY (STRLEN)
jz L(aligned_more)
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -97,9 +98,14 @@ L(zero):
L(first_vec_x0):
/* Set bit for max len so that tzcnt will return min of max len
and position of first match. */
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
btsq %rsi, %rax
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -113,14 +119,19 @@ L(first_vec_x1):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 4 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
incl %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -133,14 +144,19 @@ L(first_vec_x2):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 3 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -153,14 +169,19 @@ L(first_vec_x3):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 2 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE * 2 + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -173,14 +194,19 @@ L(first_vec_x4):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE * 3 + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -195,10 +221,14 @@ L(cross_page_continue):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
# ifdef USE_AS_STRNLEN
- /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
- it simplifies the logic in last_4x_vec_or_less. */
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
+ because it simplifies the logic in last_4x_vec_or_less. */
leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
# endif
/* Load first VEC regardless. */
VPCMPEQ 1(%rdi), %ymm0, %ymm1
@@ -207,34 +237,38 @@ L(cross_page_continue):
subq %rcx, %rsi
jb L(last_4x_vec_or_less)
# endif
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x1)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x4)
/* Align data to VEC_SIZE * 4 - 1. */
# ifdef USE_AS_STRNLEN
/* Before adjusting length check if at last VEC_SIZE * 4. */
- cmpq $(VEC_SIZE * 4 - 1), %rsi
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
jbe L(last_4x_vec_or_less_load)
incq %rdi
movl %edi, %ecx
orq $(VEC_SIZE * 4 - 1), %rdi
andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
/* Readjust length. */
addq %rcx, %rsi
# else
@@ -246,13 +280,13 @@ L(cross_page_continue):
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
/* Break if at end of length. */
- subq $(VEC_SIZE * 4), %rsi
+ subq $(CHAR_PER_VEC * 4), %rsi
jb L(last_4x_vec_or_less_cmpeq)
# endif
- /* Save some code size by microfusing VPMINU with the load. Since
- the matches in ymm2/ymm4 can only be returned if there were no
- matches in ymm1/ymm3 respectively there is no issue with overlap.
- */
+ /* Save some code size by microfusing VPMINU with the load.
+ Since the matches in ymm2/ymm4 can only be returned if there
+ were no matches in ymm1/ymm3 respectively there is no issue
+ with overlap. */
vmovdqa 1(%rdi), %ymm1
VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
@@ -260,7 +294,7 @@ L(loop_4x_vec):
VPMINU %ymm2, %ymm4, %ymm5
VPCMPEQ %ymm5, %ymm0, %ymm5
- vpmovmskb %ymm5, %ecx
+ vpmovmskb %ymm5, %ecx
subq $-(VEC_SIZE * 4), %rdi
testl %ecx, %ecx
@@ -268,27 +302,28 @@ L(loop_4x_vec):
VPCMPEQ %ymm1, %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
subq %rdx, %rdi
testl %eax, %eax
jnz L(last_vec_return_x0)
VPCMPEQ %ymm2, %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(last_vec_return_x1)
/* Combine last 2 VEC. */
VPCMPEQ %ymm3, %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
- /* rcx has combined result from all 4 VEC. It will only be used if
- the first 3 other VEC all did not contain a match. */
+ vpmovmskb %ymm3, %eax
+ /* rcx has combined result from all 4 VEC. It will only be used
+ if the first 3 other VEC all did not contain a match. */
salq $32, %rcx
orq %rcx, %rax
tzcntq %rax, %rax
subq $(VEC_SIZE * 2 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -297,15 +332,19 @@ L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
.p2align 4
L(last_4x_vec_or_less_load):
- /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1.
+ */
subq $-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
VPCMPEQ 1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
-
- vpmovmskb %ymm1, %eax
- /* If remaining length > VEC_SIZE * 2. This works if esi is off by
- VEC_SIZE * 4. */
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
+ vpmovmskb %ymm1, %eax
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off
+ by VEC_SIZE * 4. */
testl $(VEC_SIZE * 2), %esi
jnz L(last_4x_vec)
@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
jb L(max)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
cmpl %eax, %esi
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
subq $(VEC_SIZE * 4 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
subq $(VEC_SIZE * 3 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
incl %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -381,14 +424,14 @@ L(last_4x_vec):
jnz L(last_vec_x1)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(last_vec_x2)
/* Normalize length. */
andl $(VEC_SIZE * 4 - 1), %esi
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(last_vec_x3)
@@ -396,7 +439,7 @@ L(last_4x_vec):
jb L(max)
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
cmpl %eax, %esi
@@ -405,6 +448,7 @@ L(last_4x_vec):
addl $(VEC_SIZE * 3 + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -419,6 +463,7 @@ L(last_vec_x1):
incl %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -432,6 +477,7 @@ L(last_vec_x2):
addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -447,6 +493,7 @@ L(last_vec_x3):
addl $(VEC_SIZE * 2 + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -455,13 +502,13 @@ L(max_end):
VZEROUPPER_RETURN
# endif
- /* Cold case for crossing page with first load. */
+ /* Cold case for crossing page with first load. */
.p2align 4
L(cross_page_boundary):
/* Align data to VEC_SIZE - 1. */
orq $(VEC_SIZE - 1), %rdi
VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
so no need to manually mod rdx. */
sarxl %edx, %eax, %eax
@@ -470,6 +517,10 @@ L(cross_page_boundary):
jnz L(cross_page_less_vec)
leaq 1(%rdi), %rcx
subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %ecx
+# endif
/* Check length. */
cmpq %rsi, %rcx
jb L(cross_page_continue)
@@ -479,6 +530,7 @@ L(cross_page_boundary):
jz L(cross_page_continue)
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide length by 4 to get wchar_t count. */
shrl $2, %eax
# endif
# endif
@@ -489,6 +541,10 @@ L(return_vzeroupper):
.p2align 4
L(cross_page_less_vec):
tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
cmpq %rax, %rsi
cmovb %esi, %eax
# ifdef USE_AS_WCSLEN
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index d223ea1700..3fc6734910 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -65,12 +65,24 @@ ENTRY(strlen)
ret
L(n_nonzero):
# ifdef AS_WCSLEN
- shl $2, %RSI_LP
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+ overflow the only way this program doesn't have undefined behavior
+ is if there is a null terminator in valid memory so strlen will
+ suffice. */
+ mov %RSI_LP, %R10_LP
+ sar $62, %R10_LP
+ test %R10_LP, %R10_LP
+ jnz __wcslen_sse2
+ sal $2, %RSI_LP
# endif
/* Initialize long lived registers. */
add %RDI_LP, %RSI_LP
+# ifdef AS_WCSLEN
+/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
+ jbe __wcslen_sse2
+# endif
mov %RSI_LP, %R10_LP
and $-64, %R10_LP
mov %RSI_LP, %R11_LP
--
2.25.1
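One detail of the strlen-avx2.S side worth noting: L(first_vec_x0)
computes min(first match, maxlen) without a branch by planting a
sentinel bit at the (now byte-scaled) length before the tzcnt. A C
sketch of the trick, assuming a GCC-style builtin and
maxlen_bytes < 64:
#include <stdint.h>
static unsigned int
min_match_or_limit (uint64_t mask, unsigned int maxlen_bytes)
{
  /* mask has one bit per byte position where a terminator was seen;
     the sentinel makes tzcnt return whichever comes first, the real
     match or the length limit.  */
  mask |= (uint64_t) 1 << maxlen_bytes;  /* btsq %rsi, %rax */
  return __builtin_ctzll (mask);         /* tzcntl %eax, %eax */
}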
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat
2021-06-22 16:18 ` H.J. Lu
@ 2021-06-22 18:23 ` Noah Goldstein
0 siblings, 0 replies; 27+ messages in thread
From: Noah Goldstein @ 2021-06-22 18:23 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jun 22, 2021 at 12:19 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, Jun 22, 2021 at 8:43 AM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> >
> > On Wed, Jun 9, 2021 at 6:26 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >>
> >>
> >>
> >> On Wed, Jun 9, 2021 at 5:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>>
> >>> On Wed, Jun 9, 2021 at 1:53 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >>> >
> >>> > This commit adds tests for a bug in the wide char variant of the
> >>> > functions where the implementation may assume that maxlen for wcsnlen
> >>> > or n for wmemchr/strncat will not overflow when multiplied by
> >>> > sizeof(wchar_t).
> >>> >
> >>> > These tests show the following implementations failing on x86_64:
> >>> >
> >>> > wcsnlen-sse4_1
> >>> > wcsnlen-avx2
> >>> >
> >>> > wmemchr-sse2
> >>> > wmemchr-avx2
> >>> >
> >>> > strncat would fail as well if it were run on a system that preferred
> >>> > either of the failing wcsnlen implementations, as it relies on
> >>> > wcsnlen.
> >>>
> >>> Please open a bug report for each standard C function. We need to
> >>> track them for backporting to release branches.
> >>
> >>
> >> Done: https://sourceware.org/bugzilla/show_bug.cgi?id=27974
> >>
> >>>
> >>>
> >>> Thanks.
> >>>
> >>> --
> >>> H.J.
> >
> >
> > Ping if we want this in 2.34
>
> Can you repost the patches with BZ# in the commit log?
>
Done. (Not sure why the patch didn't come in as a reply to this one,
but it has just been posted.)
>
> Thanks.
>
> --
> H.J.
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v2 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974]
2021-06-22 18:11 ` [PATCH v2 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974] Noah Goldstein
@ 2021-06-22 21:24 ` H.J. Lu
0 siblings, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2021-06-22 21:24 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit adds tests for a bug in the wide char variant of the
> functions where the implementation may assume that maxlen for wcsnlen
> or n for wmemchr/strncat will not overflow when multiplied by
> sizeof(wchar_t).
>
> These tests show the following implementations failing on x86_64:
>
> wcsnlen-sse4_1
> wcsnlen-avx2
>
> wmemchr-sse2
> wmemchr-avx2
>
> strncat would fail as well if it were run on a system that preferred
> either of the failing wcsnlen implementations, as it relies on
> wcsnlen.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ #27974]
2021-06-22 18:11 ` [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
@ 2021-06-22 21:24 ` H.J. Lu
0 siblings, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2021-06-22 21:24 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit fixes the bug mentioned in the previous commit.
>
> The previous implementations of wmemchr in these files relied
> on n * sizeof(wchar_t) not overflowing, which is not guaranteed
> by the standard.
>
> The new overflow tests added in the previous commit now
> pass (as well as all the other tests).
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 27+ messages in thread
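As a side note on the bug class this series tests and fixes, here is a minimal C sketch (illustrative only; the value chosen for n is hypothetical, and it assumes a 64-bit size_t and a 4-byte wchar_t as on x86_64). Scaling a wchar_t count into a byte count up front can wrap, turning an enormous maxlen/n into a tiny one:

    /* Sketch: why "n * sizeof (wchar_t)" cannot be trusted.  */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      size_t n = (SIZE_MAX >> 2) + 2;        /* huge wchar_t count */
      size_t bytes = n * sizeof (wchar_t);   /* wraps modulo 2^64 */
      printf ("n                   = %zu\n", n);
      printf ("n * sizeof(wchar_t) = %zu\n", bytes);  /* prints 4 */
      return 0;
    }

An implementation that does the scaling first would search only a single wchar_t here, which is why the fixed code keeps the count in wchar_t units (CHAR_PER_VEC) instead.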
* Re: [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
2021-06-22 18:11 ` [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
@ 2021-06-22 21:33 ` H.J. Lu
2021-06-22 23:16 ` Noah Goldstein
0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2021-06-22 21:33 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit fixes the bug mentioned in the previous commit.
>
> The previous implementations of wmemchr in these files relied
> on maxlen * sizeof(wchar_t) which was not guaranteed by the standard.
>
> The new overflow tests added in the previous commit now
> pass (as well as all the other tests).
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
> sysdeps/x86_64/strlen.S | 14 ++-
> 2 files changed, 106 insertions(+), 38 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> index bd2e6ee44a..b282a75613 100644
> --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> @@ -44,21 +44,21 @@
>
> # define VEC_SIZE 32
> # define PAGE_SIZE 4096
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
>
> .section SECTION(.text),"ax",@progbits
> ENTRY (STRLEN)
> # ifdef USE_AS_STRNLEN
> /* Check zero length. */
> +# ifdef __ILP32__
> + /* Clear upper bits. */
> + and %RSI_LP, %RSI_LP
> +# else
> test %RSI_LP, %RSI_LP
> +# endif
> jz L(zero)
> /* Store max len in R8_LP before adjusting if using WCSLEN. */
> mov %RSI_LP, %R8_LP
> -# ifdef USE_AS_WCSLEN
> - shl $2, %RSI_LP
> -# elif defined __ILP32__
> - /* Clear the upper 32 bits. */
> - movl %esi, %esi
> -# endif
> # endif
> movl %edi, %eax
> movq %rdi, %rdx
> @@ -72,10 +72,10 @@ ENTRY (STRLEN)
>
> /* Check the first VEC_SIZE bytes. */
> VPCMPEQ (%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm1, %eax
> # ifdef USE_AS_STRNLEN
> /* If length < VEC_SIZE handle special. */
> - cmpq $VEC_SIZE, %rsi
> + cmpq $CHAR_PER_VEC, %rsi
> jbe L(first_vec_x0)
> # endif
> /* If empty continue to aligned_more. Otherwise return bit
> @@ -84,6 +84,7 @@ ENTRY (STRLEN)
> jz L(aligned_more)
> tzcntl %eax, %eax
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrl $2, %eax
> # endif
> VZEROUPPER_RETURN
> @@ -97,9 +98,14 @@ L(zero):
> L(first_vec_x0):
> /* Set bit for max len so that tzcnt will return min of max len
> and position of first match. */
> +# ifdef USE_AS_WCSLEN
> + /* NB: Multiply length by 4 to get byte count. */
> + sall $2, %esi
> +# endif
> btsq %rsi, %rax
> tzcntl %eax, %eax
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrl $2, %eax
> # endif
> VZEROUPPER_RETURN
> @@ -113,14 +119,19 @@ L(first_vec_x1):
> # ifdef USE_AS_STRNLEN
> /* Use ecx which was computed earlier to compute correct value.
> */
> +# ifdef USE_AS_WCSLEN
> + leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
> +# else
> subl $(VEC_SIZE * 4 + 1), %ecx
> addl %ecx, %eax
> +# endif
> # else
> subl %edx, %edi
> incl %edi
> addl %edi, %eax
> # endif
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrl $2, %eax
> # endif
> VZEROUPPER_RETURN
> @@ -133,14 +144,19 @@ L(first_vec_x2):
> # ifdef USE_AS_STRNLEN
> /* Use ecx which was computed earlier to compute correct value.
> */
> +# ifdef USE_AS_WCSLEN
> + leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
> +# else
> subl $(VEC_SIZE * 3 + 1), %ecx
> addl %ecx, %eax
> +# endif
> # else
> subl %edx, %edi
> addl $(VEC_SIZE + 1), %edi
> addl %edi, %eax
> # endif
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrl $2, %eax
> # endif
> VZEROUPPER_RETURN
> @@ -153,14 +169,19 @@ L(first_vec_x3):
> # ifdef USE_AS_STRNLEN
> /* Use ecx which was computed earlier to compute correct value.
> */
> +# ifdef USE_AS_WCSLEN
> + leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
> +# else
> subl $(VEC_SIZE * 2 + 1), %ecx
> addl %ecx, %eax
> +# endif
> # else
> subl %edx, %edi
> addl $(VEC_SIZE * 2 + 1), %edi
> addl %edi, %eax
> # endif
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrl $2, %eax
> # endif
> VZEROUPPER_RETURN
> @@ -173,14 +194,19 @@ L(first_vec_x4):
> # ifdef USE_AS_STRNLEN
> /* Use ecx which was computed earlier to compute correct value.
> */
> +# ifdef USE_AS_WCSLEN
> + leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
> +# else
> subl $(VEC_SIZE + 1), %ecx
> addl %ecx, %eax
> +# endif
> # else
> subl %edx, %edi
> addl $(VEC_SIZE * 3 + 1), %edi
> addl %edi, %eax
> # endif
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrl $2, %eax
> # endif
> VZEROUPPER_RETURN
> @@ -195,10 +221,14 @@ L(cross_page_continue):
> /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> since data is only aligned to VEC_SIZE. */
> # ifdef USE_AS_STRNLEN
> - /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> - it simplies the logic in last_4x_vec_or_less. */
> + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> + because it simplies the logic in last_4x_vec_or_less. */
> leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> subq %rdx, %rcx
> +# ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get the wchar_t count. */
> + sarl $2, %ecx
> +# endif
> # endif
> /* Load first VEC regardless. */
> VPCMPEQ 1(%rdi), %ymm0, %ymm1
> @@ -207,34 +237,38 @@ L(cross_page_continue):
> subq %rcx, %rsi
> jb L(last_4x_vec_or_less)
> # endif
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x1)
>
> VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x2)
>
> VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x3)
>
> VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x4)
>
> /* Align data to VEC_SIZE * 4 - 1. */
> # ifdef USE_AS_STRNLEN
> /* Before adjusting length check if at last VEC_SIZE * 4. */
> - cmpq $(VEC_SIZE * 4 - 1), %rsi
> + cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
> jbe L(last_4x_vec_or_less_load)
> incq %rdi
> movl %edi, %ecx
> orq $(VEC_SIZE * 4 - 1), %rdi
> andl $(VEC_SIZE * 4 - 1), %ecx
> +# ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get the wchar_t count. */
> + sarl $2, %ecx
> +# endif
> /* Readjust length. */
> addq %rcx, %rsi
> # else
> @@ -246,13 +280,13 @@ L(cross_page_continue):
> L(loop_4x_vec):
> # ifdef USE_AS_STRNLEN
> /* Break if at end of length. */
> - subq $(VEC_SIZE * 4), %rsi
> + subq $(CHAR_PER_VEC * 4), %rsi
> jb L(last_4x_vec_or_less_cmpeq)
> # endif
> - /* Save some code size by microfusing VPMINU with the load. Since
> - the matches in ymm2/ymm4 can only be returned if there where no
> - matches in ymm1/ymm3 respectively there is no issue with overlap.
> - */
> + /* Save some code size by microfusing VPMINU with the load.
> + Since the matches in ymm2/ymm4 can only be returned if there
> + where no matches in ymm1/ymm3 respectively there is no issue
> + with overlap. */
> vmovdqa 1(%rdi), %ymm1
> VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> @@ -260,7 +294,7 @@ L(loop_4x_vec):
>
> VPMINU %ymm2, %ymm4, %ymm5
> VPCMPEQ %ymm5, %ymm0, %ymm5
> - vpmovmskb %ymm5, %ecx
> + vpmovmskb %ymm5, %ecx
>
> subq $-(VEC_SIZE * 4), %rdi
> testl %ecx, %ecx
> @@ -268,27 +302,28 @@ L(loop_4x_vec):
>
>
> VPCMPEQ %ymm1, %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm1, %eax
> subq %rdx, %rdi
> testl %eax, %eax
> jnz L(last_vec_return_x0)
>
> VPCMPEQ %ymm2, %ymm0, %ymm2
> - vpmovmskb %ymm2, %eax
> + vpmovmskb %ymm2, %eax
> testl %eax, %eax
> jnz L(last_vec_return_x1)
>
> /* Combine last 2 VEC. */
> VPCMPEQ %ymm3, %ymm0, %ymm3
> - vpmovmskb %ymm3, %eax
> - /* rcx has combined result from all 4 VEC. It will only be used if
> - the first 3 other VEC all did not contain a match. */
> + vpmovmskb %ymm3, %eax
> + /* rcx has combined result from all 4 VEC. It will only be used
> + if the first 3 other VEC all did not contain a match. */
> salq $32, %rcx
> orq %rcx, %rax
> tzcntq %rax, %rax
> subq $(VEC_SIZE * 2 - 1), %rdi
> addq %rdi, %rax
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrq $2, %rax
> # endif
> VZEROUPPER_RETURN
> @@ -297,15 +332,19 @@ L(loop_4x_vec):
> # ifdef USE_AS_STRNLEN
> .p2align 4
> L(last_4x_vec_or_less_load):
> - /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
> + /* Depending on entry adjust rdi / prepare first VEC in ymm1.
> + */
> subq $-(VEC_SIZE * 4), %rdi
> L(last_4x_vec_or_less_cmpeq):
> VPCMPEQ 1(%rdi), %ymm0, %ymm1
> L(last_4x_vec_or_less):
> -
> - vpmovmskb %ymm1, %eax
> - /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> - VEC_SIZE * 4. */
> +# ifdef USE_AS_WCSLEN
> + /* NB: Multiply length by 4 to get byte count. */
> + sall $2, %esi
> +# endif
> + vpmovmskb %ymm1, %eax
> + /* If remaining length > VEC_SIZE * 2. This works if esi is off
> + by VEC_SIZE * 4. */
> testl $(VEC_SIZE * 2), %esi
> jnz L(last_4x_vec)
>
> @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
> jb L(max)
>
> VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm1, %eax
> tzcntl %eax, %eax
> /* Check the end of data. */
> cmpl %eax, %esi
> @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
> addl $(VEC_SIZE + 1), %eax
> addq %rdi, %rax
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrq $2, %rax
> # endif
> VZEROUPPER_RETURN
> @@ -340,6 +380,7 @@ L(last_vec_return_x0):
> subq $(VEC_SIZE * 4 - 1), %rdi
> addq %rdi, %rax
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrq $2, %rax
> # endif
> VZEROUPPER_RETURN
> @@ -350,6 +391,7 @@ L(last_vec_return_x1):
> subq $(VEC_SIZE * 3 - 1), %rdi
> addq %rdi, %rax
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrq $2, %rax
> # endif
> VZEROUPPER_RETURN
> @@ -366,6 +408,7 @@ L(last_vec_x1_check):
> incl %eax
> addq %rdi, %rax
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrq $2, %rax
> # endif
> VZEROUPPER_RETURN
> @@ -381,14 +424,14 @@ L(last_4x_vec):
> jnz L(last_vec_x1)
>
> VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(last_vec_x2)
>
> /* Normalize length. */
> andl $(VEC_SIZE * 4 - 1), %esi
> VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(last_vec_x3)
>
> @@ -396,7 +439,7 @@ L(last_4x_vec):
> jb L(max)
>
> VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm1, %eax
> tzcntl %eax, %eax
> /* Check the end of data. */
> cmpl %eax, %esi
> @@ -405,6 +448,7 @@ L(last_4x_vec):
> addl $(VEC_SIZE * 3 + 1), %eax
> addq %rdi, %rax
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrq $2, %rax
> # endif
> VZEROUPPER_RETURN
> @@ -419,6 +463,7 @@ L(last_vec_x1):
> incl %eax
> addq %rdi, %rax
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrq $2, %rax
> # endif
> VZEROUPPER_RETURN
> @@ -432,6 +477,7 @@ L(last_vec_x2):
> addl $(VEC_SIZE + 1), %eax
> addq %rdi, %rax
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrq $2, %rax
> # endif
> VZEROUPPER_RETURN
> @@ -447,6 +493,7 @@ L(last_vec_x3):
> addl $(VEC_SIZE * 2 + 1), %eax
> addq %rdi, %rax
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> shrq $2, %rax
> # endif
> VZEROUPPER_RETURN
> @@ -455,13 +502,13 @@ L(max_end):
> VZEROUPPER_RETURN
> # endif
>
> - /* Cold case for crossing page with first load. */
> + /* Cold case for crossing page with first load. */
> .p2align 4
> L(cross_page_boundary):
> /* Align data to VEC_SIZE - 1. */
> orq $(VEC_SIZE - 1), %rdi
> VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm1, %eax
> /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> so no need to manually mod rdx. */
> sarxl %edx, %eax, %eax
> @@ -470,6 +517,10 @@ L(cross_page_boundary):
> jnz L(cross_page_less_vec)
> leaq 1(%rdi), %rcx
> subq %rdx, %rcx
> +# ifdef USE_AS_WCSLEN
> + /* NB: Divide bytes by 4 to get wchar_t count. */
> + shrl $2, %ecx
> +# endif
> /* Check length. */
> cmpq %rsi, %rcx
> jb L(cross_page_continue)
> @@ -479,6 +530,7 @@ L(cross_page_boundary):
> jz L(cross_page_continue)
> tzcntl %eax, %eax
> # ifdef USE_AS_WCSLEN
> + /* NB: Divide length by 4 to get wchar_t count. */
> shrl $2, %eax
> # endif
> # endif
> @@ -489,6 +541,10 @@ L(return_vzeroupper):
> .p2align 4
> L(cross_page_less_vec):
> tzcntl %eax, %eax
> +# ifdef USE_AS_WCSLEN
> + /* NB: Multiply length by 4 to get byte count. */
> + sall $2, %esi
> +# endif
> cmpq %rax, %rsi
> cmovb %esi, %eax
> # ifdef USE_AS_WCSLEN
> diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> index d223ea1700..3fc6734910 100644
> --- a/sysdeps/x86_64/strlen.S
> +++ b/sysdeps/x86_64/strlen.S
> @@ -65,12 +65,24 @@ ENTRY(strlen)
> ret
> L(n_nonzero):
> # ifdef AS_WCSLEN
> - shl $2, %RSI_LP
> +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> + overflow the only way this program doesn't have undefined behavior
> + is if there is a null terminator in valid memory so strlen will
> + suffice. */
> + mov %RSI_LP, %R10_LP
> + sar $62, %R10_LP
> + test %R10_LP, %R10_LP
> + jnz __wcslen_sse2
Branch to __wcslen_sse2 is wrong for 2 reasons:
1. __wcslen_sse2 is undefined with --disable-multi-arch.
2. You should skip ENDBR64 at function entry.
Please create a new label and branch to it.
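The guard discussed here reads, in C terms, roughly as follows (a sketch of my reading of the hunk, assuming a 64-bit size_t and a 4-byte wchar_t; the function name is made up):

    #include <stddef.h>

    /* Nonzero when maxlen * sizeof (wchar_t) would wrap a 64-bit
       size_t.  Mirrors the "mov; sar $62; test; jnz" sequence: if
       either of the top two bits of maxlen is set, the multiply by 4
       overflows.  */
    static int
    wcsnlen_scale_overflows (size_t maxlen)
    {
      return (maxlen >> 62) != 0;
    }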
> + sal $2, %RSI_LP
> # endif
>
> /* Initialize long lived registers. */
>
> add %RDI_LP, %RSI_LP
> +# ifdef AS_WCSLEN
> +/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
> + jbe __wcslen_sse2
> +# endif
> mov %RSI_LP, %R10_LP
> and $-64, %R10_LP
> mov %RSI_LP, %R11_LP
> --
> 2.25.1
>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 27+ messages in thread
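The second check in that strlen.S hunk (the "jbe __wcslen_sse2" after "add %RDI_LP, %RSI_LP") guards a different case: the scaled length fits in 64 bits, but s + maxlen * sizeof (wchar_t) still wraps around the address space. A C-level sketch of the condition (my own illustration with a made-up helper; uintptr_t stands in for the pointer, since pointer overflow is undefined behavior in C):

    #include <stddef.h>
    #include <stdint.h>

    /* Nonzero when s + nbytes wraps: "jbe" fires on carry (the sum
       passed 2^64) or on a zero result (the sum is exactly 2^64).  */
    static int
    end_pointer_wraps (const wchar_t *s, size_t nbytes)
    {
      uintptr_t end = (uintptr_t) s + nbytes;
      return end < (uintptr_t) s || end == 0;
    }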
* Re: [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
2021-06-22 21:33 ` H.J. Lu
@ 2021-06-22 23:16 ` Noah Goldstein
2021-06-22 23:28 ` H.J. Lu
0 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-22 23:16 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> > [... commit message and strlen-avx2.S hunks snipped; quoted in full in the message above ...]
> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> > index d223ea1700..3fc6734910 100644
> > --- a/sysdeps/x86_64/strlen.S
> > +++ b/sysdeps/x86_64/strlen.S
> > @@ -65,12 +65,24 @@ ENTRY(strlen)
> > ret
> > L(n_nonzero):
> > # ifdef AS_WCSLEN
> > - shl $2, %RSI_LP
> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> > + overflow the only way this program doesn't have undefined behavior
> > + is if there is a null terminator in valid memory so strlen will
> > + suffice. */
> > + mov %RSI_LP, %R10_LP
> > + sar $62, %R10_LP
> > + test %R10_LP, %R10_LP
> > + jnz __wcslen_sse2
>
> Branch to __wcslen_sse2 is wrong for 2 reasons:
>
> 1. __wcslen_sse2 is undefined with --disable-multi-arch.
>
Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?
> 2. You should skip ENDBR64 at function entry.
>
> Please create a new label and branch to it.
>
I am not quite sure how to do this. I am trying to use
strstr-sse2-unaligned.S as a template:
https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
which appears to make a direct call to the global label of __strchr_sse2
without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.
Is there an example in the codebase that you know of that I can follow?
> > [...]
>
> Thanks.
>
>
> --
> H.J.
>
^ permalink raw reply [flat|nested] 27+ messages in thread
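One detail worth spelling out about the strlen-avx2.S changes quoted above (my summary, with hypothetical helper names): the length now stays in wchar_t units through the main loop, which is why the VEC_SIZE comparisons become CHAR_PER_VEC, and it is only multiplied by 4 (the "sall $2, %esi" sites) on the last_4x_vec paths, where the remaining length is already known to be below VEC_SIZE * 4 so the multiply cannot overflow. In C terms:

    #include <stddef.h>

    /* Hypothetical helpers, not glibc code.  */
    static size_t
    wchars_to_bytes (size_t nchars)
    {
      /* Callers guarantee nchars < VEC_SIZE * 4 == 128, so the
         multiply cannot wrap; this is the "sall $2" step.  */
      return nchars * sizeof (wchar_t);
    }

    static size_t
    bytes_to_wchars (size_t nbytes)
    {
      /* The "shrl $2" / "shrq $2" before each return.  */
      return nbytes / sizeof (wchar_t);
    }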
* Re: [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
2021-06-22 23:16 ` Noah Goldstein
@ 2021-06-22 23:28 ` H.J. Lu
2021-06-23 3:11 ` Noah Goldstein
0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2021-06-22 23:28 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jun 22, 2021 at 4:16 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
> On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >
>> > [... patch snipped; quoted in full earlier in the thread ...]
>> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
>> > index d223ea1700..3fc6734910 100644
>> > --- a/sysdeps/x86_64/strlen.S
>> > +++ b/sysdeps/x86_64/strlen.S
>> > @@ -65,12 +65,24 @@ ENTRY(strlen)
>> > ret
>> > L(n_nonzero):
>> > # ifdef AS_WCSLEN
>> > - shl $2, %RSI_LP
>> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
>> > + overflow the only way this program doesn't have undefined behavior
>> > + is if there is a null terminator in valid memory so strlen will
>> > + suffice. */
>> > + mov %RSI_LP, %R10_LP
>> > + sar $62, %R10_LP
>> > + test %R10_LP, %R10_LP
>> > + jnz __wcslen_sse2
>>
>> Branch to __wcslen_sse2 is wrong for 2 reasons:
>>
>> 1. __wcslen_sse2 is undefined with --disable-multi-arch.
>
> Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?
>
>>
>> 2. You should skip ENDBR64 at function entry.
>>
>> Please create a new label and branch to it.
>>
> I am not quite sure how to do this. I am trying to use
> strstr-sse2-unaligned.S as a template:
> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
> which appears to make a direct call to the global label of __strchr_sse2
> without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.
This is different since all files are in sysdeps/x86_64/multiarch.
> Is there an example in the codebase that you know of that I can follow?
There is no code that does exactly the same thing.
memmove-vec-unaligned-erms.S has
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):	<<<<<<<<<<<<<< This is equivalent to __wcslen_sse2.
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
>> > [...]
>>
>> Thanks.
>>
>>
>> --
>> H.J.
--
H.J.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
2021-06-22 23:28 ` H.J. Lu
@ 2021-06-23 3:11 ` Noah Goldstein
2021-06-23 3:58 ` H.J. Lu
0 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-23 3:11 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jun 22, 2021 at 7:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, Jun 22, 2021 at 4:16 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> >
> >
> > On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >> >
> >> > This commit fixes the bug mentioned in the previous commit.
> >> >
> >> > The previous implementations of wmemchr in these files relied
> >> > on maxlen * sizeof(wchar_t) which was not guranteed by the standard.
> >> >
> >> > The new overflow tests added in the previous commit now
> >> > pass (As well as all the other tests).
> >> >
> >> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> >> > ---
> >> > sysdeps/x86_64/multiarch/strlen-avx2.S | 130
> ++++++++++++++++++-------
> >> > sysdeps/x86_64/strlen.S | 14 ++-
> >> > 2 files changed, 106 insertions(+), 38 deletions(-)
> >> >
> >> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S
> b/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> > index bd2e6ee44a..b282a75613 100644
> >> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> > @@ -44,21 +44,21 @@
> >> >
> >> > # define VEC_SIZE 32
> >> > # define PAGE_SIZE 4096
> >> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> >> >
> >> > .section SECTION(.text),"ax",@progbits
> >> > ENTRY (STRLEN)
> >> > # ifdef USE_AS_STRNLEN
> >> > /* Check zero length. */
> >> > +# ifdef __ILP32__
> >> > + /* Clear upper bits. */
> >> > + and %RSI_LP, %RSI_LP
> >> > +# else
> >> > test %RSI_LP, %RSI_LP
> >> > +# endif
> >> > jz L(zero)
> >> > /* Store max len in R8_LP before adjusting if using WCSLEN.
> */
> >> > mov %RSI_LP, %R8_LP
> >> > -# ifdef USE_AS_WCSLEN
> >> > - shl $2, %RSI_LP
> >> > -# elif defined __ILP32__
> >> > - /* Clear the upper 32 bits. */
> >> > - movl %esi, %esi
> >> > -# endif
> >> > # endif
> >> > movl %edi, %eax
> >> > movq %rdi, %rdx
> >> > @@ -72,10 +72,10 @@ ENTRY (STRLEN)
> >> >
> >> > /* Check the first VEC_SIZE bytes. */
> >> > VPCMPEQ (%rdi), %ymm0, %ymm1
> >> > - vpmovmskb %ymm1, %eax
> >> > + vpmovmskb %ymm1, %eax
> >> > # ifdef USE_AS_STRNLEN
> >> > /* If length < VEC_SIZE handle special. */
> >> > - cmpq $VEC_SIZE, %rsi
> >> > + cmpq $CHAR_PER_VEC, %rsi
> >> > jbe L(first_vec_x0)
> >> > # endif
> >> > /* If empty continue to aligned_more. Otherwise return bit
> >> > @@ -84,6 +84,7 @@ ENTRY (STRLEN)
> >> > jz L(aligned_more)
> >> > tzcntl %eax, %eax
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrl $2, %eax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -97,9 +98,14 @@ L(zero):
> >> > L(first_vec_x0):
> >> > /* Set bit for max len so that tzcnt will return min of max
> len
> >> > and position of first match. */
> >> > +# ifdef USE_AS_WCSLEN
> >> > + /* NB: Multiply length by 4 to get byte count. */
> >> > + sall $2, %esi
> >> > +# endif
> >> > btsq %rsi, %rax
> >> > tzcntl %eax, %eax
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrl $2, %eax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -113,14 +119,19 @@ L(first_vec_x1):
> >> > # ifdef USE_AS_STRNLEN
> >> > /* Use ecx which was computed earlier to compute correct
> value.
> >> > */
> >> > +# ifdef USE_AS_WCSLEN
> >> > + leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
> >> > +# else
> >> > subl $(VEC_SIZE * 4 + 1), %ecx
> >> > addl %ecx, %eax
> >> > +# endif
> >> > # else
> >> > subl %edx, %edi
> >> > incl %edi
> >> > addl %edi, %eax
> >> > # endif
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrl $2, %eax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -133,14 +144,19 @@ L(first_vec_x2):
> >> > # ifdef USE_AS_STRNLEN
> >> > /* Use ecx which was computed earlier to compute correct
> value.
> >> > */
> >> > +# ifdef USE_AS_WCSLEN
> >> > + leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
> >> > +# else
> >> > subl $(VEC_SIZE * 3 + 1), %ecx
> >> > addl %ecx, %eax
> >> > +# endif
> >> > # else
> >> > subl %edx, %edi
> >> > addl $(VEC_SIZE + 1), %edi
> >> > addl %edi, %eax
> >> > # endif
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrl $2, %eax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -153,14 +169,19 @@ L(first_vec_x3):
> >> > # ifdef USE_AS_STRNLEN
> >> > /* Use ecx which was computed earlier to compute correct
> value.
> >> > */
> >> > +# ifdef USE_AS_WCSLEN
> >> > + leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
> >> > +# else
> >> > subl $(VEC_SIZE * 2 + 1), %ecx
> >> > addl %ecx, %eax
> >> > +# endif
> >> > # else
> >> > subl %edx, %edi
> >> > addl $(VEC_SIZE * 2 + 1), %edi
> >> > addl %edi, %eax
> >> > # endif
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrl $2, %eax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -173,14 +194,19 @@ L(first_vec_x4):
> >> > # ifdef USE_AS_STRNLEN
> >> > /* Use ecx which was computed earlier to compute correct
> value.
> >> > */
> >> > +# ifdef USE_AS_WCSLEN
> >> > + leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
> >> > +# else
> >> > subl $(VEC_SIZE + 1), %ecx
> >> > addl %ecx, %eax
> >> > +# endif
> >> > # else
> >> > subl %edx, %edi
> >> > addl $(VEC_SIZE * 3 + 1), %edi
> >> > addl %edi, %eax
> >> > # endif
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrl $2, %eax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -195,10 +221,14 @@ L(cross_page_continue):
> >> > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> >> > since data is only aligned to VEC_SIZE. */
> >> > # ifdef USE_AS_STRNLEN
> >> > - /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> because
> >> > - it simplies the logic in last_4x_vec_or_less. */
> >> > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> >> > + because it simplies the logic in last_4x_vec_or_less. */
> >> > leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> >> > subq %rdx, %rcx
> >> > +# ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get the wchar_t count. */
> >> > + sarl $2, %ecx
> >> > +# endif
> >> > # endif
> >> > /* Load first VEC regardless. */
> >> > VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >> > @@ -207,34 +237,38 @@ L(cross_page_continue):
> >> > subq %rcx, %rsi
> >> > jb L(last_4x_vec_or_less)
> >> > # endif
> >> > - vpmovmskb %ymm1, %eax
> >> > + vpmovmskb %ymm1, %eax
> >> > testl %eax, %eax
> >> > jnz L(first_vec_x1)
> >> >
> >> > VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> > - vpmovmskb %ymm1, %eax
> >> > + vpmovmskb %ymm1, %eax
> >> > testl %eax, %eax
> >> > jnz L(first_vec_x2)
> >> >
> >> > VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >> > - vpmovmskb %ymm1, %eax
> >> > + vpmovmskb %ymm1, %eax
> >> > testl %eax, %eax
> >> > jnz L(first_vec_x3)
> >> >
> >> > VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> >> > - vpmovmskb %ymm1, %eax
> >> > + vpmovmskb %ymm1, %eax
> >> > testl %eax, %eax
> >> > jnz L(first_vec_x4)
> >> >
> >> > /* Align data to VEC_SIZE * 4 - 1. */
> >> > # ifdef USE_AS_STRNLEN
> >> > /* Before adjusting length check if at last VEC_SIZE * 4. */
> >> > - cmpq $(VEC_SIZE * 4 - 1), %rsi
> >> > + cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
> >> > jbe L(last_4x_vec_or_less_load)
> >> > incq %rdi
> >> > movl %edi, %ecx
> >> > orq $(VEC_SIZE * 4 - 1), %rdi
> >> > andl $(VEC_SIZE * 4 - 1), %ecx
> >> > +# ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get the wchar_t count. */
> >> > + sarl $2, %ecx
> >> > +# endif
> >> > /* Readjust length. */
> >> > addq %rcx, %rsi
> >> > # else
> >> > @@ -246,13 +280,13 @@ L(cross_page_continue):
> >> > L(loop_4x_vec):
> >> > # ifdef USE_AS_STRNLEN
> >> > /* Break if at end of length. */
> >> > - subq $(VEC_SIZE * 4), %rsi
> >> > + subq $(CHAR_PER_VEC * 4), %rsi
> >> > jb L(last_4x_vec_or_less_cmpeq)
> >> > # endif
> >> > - /* Save some code size by microfusing VPMINU with the load. Since
> >> > - the matches in ymm2/ymm4 can only be returned if there where no
> >> > - matches in ymm1/ymm3 respectively there is no issue with overlap.
> >> > - */
> >> > + /* Save some code size by microfusing VPMINU with the load.
> >> > + Since the matches in ymm2/ymm4 can only be returned if there
> >> > + where no matches in ymm1/ymm3 respectively there is no issue
> >> > + with overlap. */
> >> > vmovdqa 1(%rdi), %ymm1
> >> > VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> >> > vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> >> > @@ -260,7 +294,7 @@ L(loop_4x_vec):
> >> >
> >> > VPMINU %ymm2, %ymm4, %ymm5
> >> > VPCMPEQ %ymm5, %ymm0, %ymm5
> >> > - vpmovmskb %ymm5, %ecx
> >> > + vpmovmskb %ymm5, %ecx
> >> >
> >> > subq $-(VEC_SIZE * 4), %rdi
> >> > testl %ecx, %ecx
> >> > @@ -268,27 +302,28 @@ L(loop_4x_vec):
> >> >
> >> >
> >> > VPCMPEQ %ymm1, %ymm0, %ymm1
> >> > - vpmovmskb %ymm1, %eax
> >> > + vpmovmskb %ymm1, %eax
> >> > subq %rdx, %rdi
> >> > testl %eax, %eax
> >> > jnz L(last_vec_return_x0)
> >> >
> >> > VPCMPEQ %ymm2, %ymm0, %ymm2
> >> > - vpmovmskb %ymm2, %eax
> >> > + vpmovmskb %ymm2, %eax
> >> > testl %eax, %eax
> >> > jnz L(last_vec_return_x1)
> >> >
> >> > /* Combine last 2 VEC. */
> >> > VPCMPEQ %ymm3, %ymm0, %ymm3
> >> > - vpmovmskb %ymm3, %eax
> >> > - /* rcx has combined result from all 4 VEC. It will only be used if
> >> > - the first 3 other VEC all did not contain a match. */
> >> > + vpmovmskb %ymm3, %eax
> >> > + /* rcx has combined result from all 4 VEC. It will only be used
> >> > + if the first 3 other VEC all did not contain a match. */
> >> > salq $32, %rcx
> >> > orq %rcx, %rax
> >> > tzcntq %rax, %rax
> >> > subq $(VEC_SIZE * 2 - 1), %rdi
> >> > addq %rdi, %rax
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrq $2, %rax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -297,15 +332,19 @@ L(loop_4x_vec):
> >> > # ifdef USE_AS_STRNLEN
> >> > .p2align 4
> >> > L(last_4x_vec_or_less_load):
> >> > - /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
> >> > + /* Depending on entry adjust rdi / prepare first VEC in ymm1.
> >> > + */
> >> > subq $-(VEC_SIZE * 4), %rdi
> >> > L(last_4x_vec_or_less_cmpeq):
> >> > VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >> > L(last_4x_vec_or_less):
> >> > -
> >> > - vpmovmskb %ymm1, %eax
> >> > - /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> >> > - VEC_SIZE * 4. */
> >> > +# ifdef USE_AS_WCSLEN
> >> > + /* NB: Multiply length by 4 to get byte count. */
> >> > + sall $2, %esi
> >> > +# endif
> >> > + vpmovmskb %ymm1, %eax
> >> > + /* If remaining length > VEC_SIZE * 2. This works if esi is off
> >> > + by VEC_SIZE * 4. */
> >> > testl $(VEC_SIZE * 2), %esi
> >> > jnz L(last_4x_vec)
> >> >
> >> > @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
> >> > jb L(max)
> >> >
> >> > VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> > - vpmovmskb %ymm1, %eax
> >> > + vpmovmskb %ymm1, %eax
> >> > tzcntl %eax, %eax
> >> > /* Check the end of data. */
> >> > cmpl %eax, %esi
> >> > @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
> >> > addl $(VEC_SIZE + 1), %eax
> >> > addq %rdi, %rax
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrq $2, %rax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -340,6 +380,7 @@ L(last_vec_return_x0):
> >> > subq $(VEC_SIZE * 4 - 1), %rdi
> >> > addq %rdi, %rax
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrq $2, %rax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -350,6 +391,7 @@ L(last_vec_return_x1):
> >> > subq $(VEC_SIZE * 3 - 1), %rdi
> >> > addq %rdi, %rax
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrq $2, %rax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -366,6 +408,7 @@ L(last_vec_x1_check):
> >> > incl %eax
> >> > addq %rdi, %rax
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrq $2, %rax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -381,14 +424,14 @@ L(last_4x_vec):
> >> > jnz L(last_vec_x1)
> >> >
> >> > VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> > - vpmovmskb %ymm1, %eax
> >> > + vpmovmskb %ymm1, %eax
> >> > testl %eax, %eax
> >> > jnz L(last_vec_x2)
> >> >
> >> > /* Normalize length. */
> >> > andl $(VEC_SIZE * 4 - 1), %esi
> >> > VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >> > - vpmovmskb %ymm1, %eax
> >> > + vpmovmskb %ymm1, %eax
> >> > testl %eax, %eax
> >> > jnz L(last_vec_x3)
> >> >
> >> > @@ -396,7 +439,7 @@ L(last_4x_vec):
> >> > jb L(max)
> >> >
> >> > VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> >> > - vpmovmskb %ymm1, %eax
> >> > + vpmovmskb %ymm1, %eax
> >> > tzcntl %eax, %eax
> >> > /* Check the end of data. */
> >> > cmpl %eax, %esi
> >> > @@ -405,6 +448,7 @@ L(last_4x_vec):
> >> > addl $(VEC_SIZE * 3 + 1), %eax
> >> > addq %rdi, %rax
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrq $2, %rax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -419,6 +463,7 @@ L(last_vec_x1):
> >> > incl %eax
> >> > addq %rdi, %rax
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrq $2, %rax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -432,6 +477,7 @@ L(last_vec_x2):
> >> > addl $(VEC_SIZE + 1), %eax
> >> > addq %rdi, %rax
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrq $2, %rax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -447,6 +493,7 @@ L(last_vec_x3):
> >> > addl $(VEC_SIZE * 2 + 1), %eax
> >> > addq %rdi, %rax
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > shrq $2, %rax
> >> > # endif
> >> > VZEROUPPER_RETURN
> >> > @@ -455,13 +502,13 @@ L(max_end):
> >> > VZEROUPPER_RETURN
> >> > # endif
> >> >
> >> > - /* Cold case for crossing page with first load. */
> >> > + /* Cold case for crossing page with first load. */
> >> > .p2align 4
> >> > L(cross_page_boundary):
> >> > /* Align data to VEC_SIZE - 1. */
> >> > orq $(VEC_SIZE - 1), %rdi
> >> > VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> >> > - vpmovmskb %ymm1, %eax
> >> > + vpmovmskb %ymm1, %eax
> >> > /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> >> > so no need to manually mod rdx. */
> >> > sarxl %edx, %eax, %eax
> >> > @@ -470,6 +517,10 @@ L(cross_page_boundary):
> >> > jnz L(cross_page_less_vec)
> >> > leaq 1(%rdi), %rcx
> >> > subq %rdx, %rcx
> >> > +# ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> > + shrl $2, %ecx
> >> > +# endif
> >> > /* Check length. */
> >> > cmpq %rsi, %rcx
> >> > jb L(cross_page_continue)
> >> > @@ -479,6 +530,7 @@ L(cross_page_boundary):
> >> > jz L(cross_page_continue)
> >> > tzcntl %eax, %eax
> >> > # ifdef USE_AS_WCSLEN
> >> > + /* NB: Divide length by 4 to get wchar_t count. */
> >> > shrl $2, %eax
> >> > # endif
> >> > # endif
> >> > @@ -489,6 +541,10 @@ L(return_vzeroupper):
> >> > .p2align 4
> >> > L(cross_page_less_vec):
> >> > tzcntl %eax, %eax
> >> > +# ifdef USE_AS_WCSLEN
> >> > + /* NB: Multiply length by 4 to get byte count. */
> >> > + sall $2, %esi
> >> > +# endif
> >> > cmpq %rax, %rsi
> >> > cmovb %esi, %eax
> >> > # ifdef USE_AS_WCSLEN
> >> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> >> > index d223ea1700..3fc6734910 100644
> >> > --- a/sysdeps/x86_64/strlen.S
> >> > +++ b/sysdeps/x86_64/strlen.S
> >> > @@ -65,12 +65,24 @@ ENTRY(strlen)
> >> > ret
> >> > L(n_nonzero):
> >> > # ifdef AS_WCSLEN
> >> > - shl $2, %RSI_LP
> >> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> >> > + overflow the only way this program doesn't have undefined behavior
> >> > + is if there is a null terminator in valid memory so strlen will
> >> > + suffice. */
> >> > + mov %RSI_LP, %R10_LP
> >> > + sar $62, %R10_LP
> >> > + test %R10_LP, %R10_LP
> >> > + jnz __wcslen_sse2
> >>
> >> Branch to __wcslen_sse2 is wrong for 2 reasons:
> >>
> >> 1. __wcslen_sse2 is undefined with --disable-multi-arch.
> >
> > Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?
> >
> >>
> >> 2. You should skip ENDBR64 at function entry.
> >>
> >> Please create a new label and branch to it.
> >>
> > I am not quite sure how to do this. I am trying to use
> > strstr-sse2-unaligned.S as a template:
> >
> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
> > which appears to make a direct call to the global label of __strchr_sse2
> > without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.
> This is different since all files are in sysdeps/x86_64/multiarch.
>
I see. So it turns out we are missing wcslen_sse4_1, which strlen.S
can also implement (it passes all tests). Would jumping to that be
valid?
Otherwise I think the best bet is to add a target for wcslen_sse4_1
and define it and wcsnlen_sse4_1 in the same file so the label is visible.
The only issue is that the #defines in strlen.S all need to be protected,
which is a bit messy. If we don't want to define wcslen_sse4_1 for whatever
reason, I already have this approach working with wcsnlen_sse4_1 defined
in the same file as wcslen-sse2.S and entered from a local label. But
looking at the code, the strlen.S version seems a bit better optimized.
Thoughts?
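(Roughly, the #define "protection" mentioned above would look like the
following in C preprocessor terms -- a sketch only, all names illustrative:)

/* Guarded #defines, so a wrapper file can predefine the names before
   including strlen.S; nothing here is the actual glibc code.  */
#ifndef STRLEN
# define STRLEN strlen      /* a wrapper could set this to wcsnlen_sse4_1 */
#endif
#ifndef CHAR_SIZE
# define CHAR_SIZE 1        /* a wchar_t wrapper would set this to 4 */
#endif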
> > Is there an example in the code you know of I can follow?
>
> There is no code that does exactly the same thing.
>
> memmove-vec-unaligned-erms.S has
>
> ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
> movq %rdi, %rax
> L(start): <<<<<<<<<<<<<< This is equivalent to __wcslen_sse2.
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> movl %edx, %edx
> # endif
> >>
> >> > + sal $2, %RSI_LP
> >> > # endif
> >> >
> >> > /* Initialize long lived registers. */
> >> >
> >> > add %RDI_LP, %RSI_LP
> >> > +# ifdef AS_WCSLEN
> >> > +/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
> >> > + jbe __wcslen_sse2
> >> > +# endif
> >> > mov %RSI_LP, %R10_LP
> >> > and $-64, %R10_LP
> >> > mov %RSI_LP, %R11_LP
> >> > --
> >> > 2.25.1
> >> >
> >>
> >> Thanks.
> >>
> >>
> >> --
> >> H.J.
>
>
>
> --
> H.J.
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
2021-06-23 3:11 ` Noah Goldstein
@ 2021-06-23 3:58 ` H.J. Lu
2021-06-23 4:55 ` Noah Goldstein
0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2021-06-23 3:58 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jun 22, 2021 at 8:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
>
> On Tue, Jun 22, 2021 at 7:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Tue, Jun 22, 2021 at 4:16 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >
>> >
>> >
>> > On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>> >>
>> >> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >> >
>> >> > This commit fixes the bug mentioned in the previous commit.
>> >> >
>> >> > The previous implementations of wmemchr in these files relied
>> >> > on maxlen * sizeof(wchar_t) which was not guranteed by the standard.
>> >> >
>> >> > The new overflow tests added in the previous commit now
>> >> > pass (As well as all the other tests).
>> >> >
>> >> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
>> >> > ---
>> >> > sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
>> >> > sysdeps/x86_64/strlen.S | 14 ++-
>> >> > 2 files changed, 106 insertions(+), 38 deletions(-)
>> >> >
>> >> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
>> >> > index bd2e6ee44a..b282a75613 100644
>> >> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
>> >> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
>> >> > @@ -44,21 +44,21 @@
>> >> >
>> >> > # define VEC_SIZE 32
>> >> > # define PAGE_SIZE 4096
>> >> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
>> >> >
>> >> > .section SECTION(.text),"ax",@progbits
>> >> > ENTRY (STRLEN)
>> >> > # ifdef USE_AS_STRNLEN
>> >> > /* Check zero length. */
>> >> > +# ifdef __ILP32__
>> >> > + /* Clear upper bits. */
>> >> > + and %RSI_LP, %RSI_LP
>> >> > +# else
>> >> > test %RSI_LP, %RSI_LP
>> >> > +# endif
>> >> > jz L(zero)
>> >> > /* Store max len in R8_LP before adjusting if using WCSLEN. */
>> >> > mov %RSI_LP, %R8_LP
>> >> > -# ifdef USE_AS_WCSLEN
>> >> > - shl $2, %RSI_LP
>> >> > -# elif defined __ILP32__
>> >> > - /* Clear the upper 32 bits. */
>> >> > - movl %esi, %esi
>> >> > -# endif
>> >> > # endif
>> >> > movl %edi, %eax
>> >> > movq %rdi, %rdx
>> >> > @@ -72,10 +72,10 @@ ENTRY (STRLEN)
>> >> >
>> >> > /* Check the first VEC_SIZE bytes. */
>> >> > VPCMPEQ (%rdi), %ymm0, %ymm1
>> >> > - vpmovmskb %ymm1, %eax
>> >> > + vpmovmskb %ymm1, %eax
>> >> > # ifdef USE_AS_STRNLEN
>> >> > /* If length < VEC_SIZE handle special. */
>> >> > - cmpq $VEC_SIZE, %rsi
>> >> > + cmpq $CHAR_PER_VEC, %rsi
>> >> > jbe L(first_vec_x0)
>> >> > # endif
>> >> > /* If empty continue to aligned_more. Otherwise return bit
>> >> > @@ -84,6 +84,7 @@ ENTRY (STRLEN)
>> >> > jz L(aligned_more)
>> >> > tzcntl %eax, %eax
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrl $2, %eax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -97,9 +98,14 @@ L(zero):
>> >> > L(first_vec_x0):
>> >> > /* Set bit for max len so that tzcnt will return min of max len
>> >> > and position of first match. */
>> >> > +# ifdef USE_AS_WCSLEN
>> >> > + /* NB: Multiply length by 4 to get byte count. */
>> >> > + sall $2, %esi
>> >> > +# endif
>> >> > btsq %rsi, %rax
>> >> > tzcntl %eax, %eax
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrl $2, %eax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -113,14 +119,19 @@ L(first_vec_x1):
>> >> > # ifdef USE_AS_STRNLEN
>> >> > /* Use ecx which was computed earlier to compute correct value.
>> >> > */
>> >> > +# ifdef USE_AS_WCSLEN
>> >> > + leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
>> >> > +# else
>> >> > subl $(VEC_SIZE * 4 + 1), %ecx
>> >> > addl %ecx, %eax
>> >> > +# endif
>> >> > # else
>> >> > subl %edx, %edi
>> >> > incl %edi
>> >> > addl %edi, %eax
>> >> > # endif
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrl $2, %eax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -133,14 +144,19 @@ L(first_vec_x2):
>> >> > # ifdef USE_AS_STRNLEN
>> >> > /* Use ecx which was computed earlier to compute correct value.
>> >> > */
>> >> > +# ifdef USE_AS_WCSLEN
>> >> > + leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
>> >> > +# else
>> >> > subl $(VEC_SIZE * 3 + 1), %ecx
>> >> > addl %ecx, %eax
>> >> > +# endif
>> >> > # else
>> >> > subl %edx, %edi
>> >> > addl $(VEC_SIZE + 1), %edi
>> >> > addl %edi, %eax
>> >> > # endif
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrl $2, %eax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -153,14 +169,19 @@ L(first_vec_x3):
>> >> > # ifdef USE_AS_STRNLEN
>> >> > /* Use ecx which was computed earlier to compute correct value.
>> >> > */
>> >> > +# ifdef USE_AS_WCSLEN
>> >> > + leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
>> >> > +# else
>> >> > subl $(VEC_SIZE * 2 + 1), %ecx
>> >> > addl %ecx, %eax
>> >> > +# endif
>> >> > # else
>> >> > subl %edx, %edi
>> >> > addl $(VEC_SIZE * 2 + 1), %edi
>> >> > addl %edi, %eax
>> >> > # endif
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrl $2, %eax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -173,14 +194,19 @@ L(first_vec_x4):
>> >> > # ifdef USE_AS_STRNLEN
>> >> > /* Use ecx which was computed earlier to compute correct value.
>> >> > */
>> >> > +# ifdef USE_AS_WCSLEN
>> >> > + leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
>> >> > +# else
>> >> > subl $(VEC_SIZE + 1), %ecx
>> >> > addl %ecx, %eax
>> >> > +# endif
>> >> > # else
>> >> > subl %edx, %edi
>> >> > addl $(VEC_SIZE * 3 + 1), %edi
>> >> > addl %edi, %eax
>> >> > # endif
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrl $2, %eax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -195,10 +221,14 @@ L(cross_page_continue):
>> >> > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
>> >> > since data is only aligned to VEC_SIZE. */
>> >> > # ifdef USE_AS_STRNLEN
>> >> > - /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
>> >> > - it simplies the logic in last_4x_vec_or_less. */
>> >> > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
>> >> > + because it simplies the logic in last_4x_vec_or_less. */
>> >> > leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
>> >> > subq %rdx, %rcx
>> >> > +# ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get the wchar_t count. */
>> >> > + sarl $2, %ecx
>> >> > +# endif
>> >> > # endif
>> >> > /* Load first VEC regardless. */
>> >> > VPCMPEQ 1(%rdi), %ymm0, %ymm1
>> >> > @@ -207,34 +237,38 @@ L(cross_page_continue):
>> >> > subq %rcx, %rsi
>> >> > jb L(last_4x_vec_or_less)
>> >> > # endif
>> >> > - vpmovmskb %ymm1, %eax
>> >> > + vpmovmskb %ymm1, %eax
>> >> > testl %eax, %eax
>> >> > jnz L(first_vec_x1)
>> >> >
>> >> > VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> >> > - vpmovmskb %ymm1, %eax
>> >> > + vpmovmskb %ymm1, %eax
>> >> > testl %eax, %eax
>> >> > jnz L(first_vec_x2)
>> >> >
>> >> > VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
>> >> > - vpmovmskb %ymm1, %eax
>> >> > + vpmovmskb %ymm1, %eax
>> >> > testl %eax, %eax
>> >> > jnz L(first_vec_x3)
>> >> >
>> >> > VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
>> >> > - vpmovmskb %ymm1, %eax
>> >> > + vpmovmskb %ymm1, %eax
>> >> > testl %eax, %eax
>> >> > jnz L(first_vec_x4)
>> >> >
>> >> > /* Align data to VEC_SIZE * 4 - 1. */
>> >> > # ifdef USE_AS_STRNLEN
>> >> > /* Before adjusting length check if at last VEC_SIZE * 4. */
>> >> > - cmpq $(VEC_SIZE * 4 - 1), %rsi
>> >> > + cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
>> >> > jbe L(last_4x_vec_or_less_load)
>> >> > incq %rdi
>> >> > movl %edi, %ecx
>> >> > orq $(VEC_SIZE * 4 - 1), %rdi
>> >> > andl $(VEC_SIZE * 4 - 1), %ecx
>> >> > +# ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get the wchar_t count. */
>> >> > + sarl $2, %ecx
>> >> > +# endif
>> >> > /* Readjust length. */
>> >> > addq %rcx, %rsi
>> >> > # else
>> >> > @@ -246,13 +280,13 @@ L(cross_page_continue):
>> >> > L(loop_4x_vec):
>> >> > # ifdef USE_AS_STRNLEN
>> >> > /* Break if at end of length. */
>> >> > - subq $(VEC_SIZE * 4), %rsi
>> >> > + subq $(CHAR_PER_VEC * 4), %rsi
>> >> > jb L(last_4x_vec_or_less_cmpeq)
>> >> > # endif
>> >> > - /* Save some code size by microfusing VPMINU with the load. Since
>> >> > - the matches in ymm2/ymm4 can only be returned if there where no
>> >> > - matches in ymm1/ymm3 respectively there is no issue with overlap.
>> >> > - */
>> >> > + /* Save some code size by microfusing VPMINU with the load.
>> >> > + Since the matches in ymm2/ymm4 can only be returned if there
>> >> > + where no matches in ymm1/ymm3 respectively there is no issue
>> >> > + with overlap. */
>> >> > vmovdqa 1(%rdi), %ymm1
>> >> > VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
>> >> > vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
>> >> > @@ -260,7 +294,7 @@ L(loop_4x_vec):
>> >> >
>> >> > VPMINU %ymm2, %ymm4, %ymm5
>> >> > VPCMPEQ %ymm5, %ymm0, %ymm5
>> >> > - vpmovmskb %ymm5, %ecx
>> >> > + vpmovmskb %ymm5, %ecx
>> >> >
>> >> > subq $-(VEC_SIZE * 4), %rdi
>> >> > testl %ecx, %ecx
>> >> > @@ -268,27 +302,28 @@ L(loop_4x_vec):
>> >> >
>> >> >
>> >> > VPCMPEQ %ymm1, %ymm0, %ymm1
>> >> > - vpmovmskb %ymm1, %eax
>> >> > + vpmovmskb %ymm1, %eax
>> >> > subq %rdx, %rdi
>> >> > testl %eax, %eax
>> >> > jnz L(last_vec_return_x0)
>> >> >
>> >> > VPCMPEQ %ymm2, %ymm0, %ymm2
>> >> > - vpmovmskb %ymm2, %eax
>> >> > + vpmovmskb %ymm2, %eax
>> >> > testl %eax, %eax
>> >> > jnz L(last_vec_return_x1)
>> >> >
>> >> > /* Combine last 2 VEC. */
>> >> > VPCMPEQ %ymm3, %ymm0, %ymm3
>> >> > - vpmovmskb %ymm3, %eax
>> >> > - /* rcx has combined result from all 4 VEC. It will only be used if
>> >> > - the first 3 other VEC all did not contain a match. */
>> >> > + vpmovmskb %ymm3, %eax
>> >> > + /* rcx has combined result from all 4 VEC. It will only be used
>> >> > + if the first 3 other VEC all did not contain a match. */
>> >> > salq $32, %rcx
>> >> > orq %rcx, %rax
>> >> > tzcntq %rax, %rax
>> >> > subq $(VEC_SIZE * 2 - 1), %rdi
>> >> > addq %rdi, %rax
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrq $2, %rax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -297,15 +332,19 @@ L(loop_4x_vec):
>> >> > # ifdef USE_AS_STRNLEN
>> >> > .p2align 4
>> >> > L(last_4x_vec_or_less_load):
>> >> > - /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
>> >> > + /* Depending on entry adjust rdi / prepare first VEC in ymm1.
>> >> > + */
>> >> > subq $-(VEC_SIZE * 4), %rdi
>> >> > L(last_4x_vec_or_less_cmpeq):
>> >> > VPCMPEQ 1(%rdi), %ymm0, %ymm1
>> >> > L(last_4x_vec_or_less):
>> >> > -
>> >> > - vpmovmskb %ymm1, %eax
>> >> > - /* If remaining length > VEC_SIZE * 2. This works if esi is off by
>> >> > - VEC_SIZE * 4. */
>> >> > +# ifdef USE_AS_WCSLEN
>> >> > + /* NB: Multiply length by 4 to get byte count. */
>> >> > + sall $2, %esi
>> >> > +# endif
>> >> > + vpmovmskb %ymm1, %eax
>> >> > + /* If remaining length > VEC_SIZE * 2. This works if esi is off
>> >> > + by VEC_SIZE * 4. */
>> >> > testl $(VEC_SIZE * 2), %esi
>> >> > jnz L(last_4x_vec)
>> >> >
>> >> > @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
>> >> > jb L(max)
>> >> >
>> >> > VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> >> > - vpmovmskb %ymm1, %eax
>> >> > + vpmovmskb %ymm1, %eax
>> >> > tzcntl %eax, %eax
>> >> > /* Check the end of data. */
>> >> > cmpl %eax, %esi
>> >> > @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
>> >> > addl $(VEC_SIZE + 1), %eax
>> >> > addq %rdi, %rax
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrq $2, %rax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -340,6 +380,7 @@ L(last_vec_return_x0):
>> >> > subq $(VEC_SIZE * 4 - 1), %rdi
>> >> > addq %rdi, %rax
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrq $2, %rax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -350,6 +391,7 @@ L(last_vec_return_x1):
>> >> > subq $(VEC_SIZE * 3 - 1), %rdi
>> >> > addq %rdi, %rax
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrq $2, %rax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -366,6 +408,7 @@ L(last_vec_x1_check):
>> >> > incl %eax
>> >> > addq %rdi, %rax
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrq $2, %rax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -381,14 +424,14 @@ L(last_4x_vec):
>> >> > jnz L(last_vec_x1)
>> >> >
>> >> > VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> >> > - vpmovmskb %ymm1, %eax
>> >> > + vpmovmskb %ymm1, %eax
>> >> > testl %eax, %eax
>> >> > jnz L(last_vec_x2)
>> >> >
>> >> > /* Normalize length. */
>> >> > andl $(VEC_SIZE * 4 - 1), %esi
>> >> > VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
>> >> > - vpmovmskb %ymm1, %eax
>> >> > + vpmovmskb %ymm1, %eax
>> >> > testl %eax, %eax
>> >> > jnz L(last_vec_x3)
>> >> >
>> >> > @@ -396,7 +439,7 @@ L(last_4x_vec):
>> >> > jb L(max)
>> >> >
>> >> > VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
>> >> > - vpmovmskb %ymm1, %eax
>> >> > + vpmovmskb %ymm1, %eax
>> >> > tzcntl %eax, %eax
>> >> > /* Check the end of data. */
>> >> > cmpl %eax, %esi
>> >> > @@ -405,6 +448,7 @@ L(last_4x_vec):
>> >> > addl $(VEC_SIZE * 3 + 1), %eax
>> >> > addq %rdi, %rax
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrq $2, %rax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -419,6 +463,7 @@ L(last_vec_x1):
>> >> > incl %eax
>> >> > addq %rdi, %rax
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrq $2, %rax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -432,6 +477,7 @@ L(last_vec_x2):
>> >> > addl $(VEC_SIZE + 1), %eax
>> >> > addq %rdi, %rax
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrq $2, %rax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -447,6 +493,7 @@ L(last_vec_x3):
>> >> > addl $(VEC_SIZE * 2 + 1), %eax
>> >> > addq %rdi, %rax
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > shrq $2, %rax
>> >> > # endif
>> >> > VZEROUPPER_RETURN
>> >> > @@ -455,13 +502,13 @@ L(max_end):
>> >> > VZEROUPPER_RETURN
>> >> > # endif
>> >> >
>> >> > - /* Cold case for crossing page with first load. */
>> >> > + /* Cold case for crossing page with first load. */
>> >> > .p2align 4
>> >> > L(cross_page_boundary):
>> >> > /* Align data to VEC_SIZE - 1. */
>> >> > orq $(VEC_SIZE - 1), %rdi
>> >> > VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
>> >> > - vpmovmskb %ymm1, %eax
>> >> > + vpmovmskb %ymm1, %eax
>> >> > /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
>> >> > so no need to manually mod rdx. */
>> >> > sarxl %edx, %eax, %eax
>> >> > @@ -470,6 +517,10 @@ L(cross_page_boundary):
>> >> > jnz L(cross_page_less_vec)
>> >> > leaq 1(%rdi), %rcx
>> >> > subq %rdx, %rcx
>> >> > +# ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
>> >> > + shrl $2, %ecx
>> >> > +# endif
>> >> > /* Check length. */
>> >> > cmpq %rsi, %rcx
>> >> > jb L(cross_page_continue)
>> >> > @@ -479,6 +530,7 @@ L(cross_page_boundary):
>> >> > jz L(cross_page_continue)
>> >> > tzcntl %eax, %eax
>> >> > # ifdef USE_AS_WCSLEN
>> >> > + /* NB: Divide length by 4 to get wchar_t count. */
>> >> > shrl $2, %eax
>> >> > # endif
>> >> > # endif
>> >> > @@ -489,6 +541,10 @@ L(return_vzeroupper):
>> >> > .p2align 4
>> >> > L(cross_page_less_vec):
>> >> > tzcntl %eax, %eax
>> >> > +# ifdef USE_AS_WCSLEN
>> >> > + /* NB: Multiply length by 4 to get byte count. */
>> >> > + sall $2, %esi
>> >> > +# endif
>> >> > cmpq %rax, %rsi
>> >> > cmovb %esi, %eax
>> >> > # ifdef USE_AS_WCSLEN
>> >> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
>> >> > index d223ea1700..3fc6734910 100644
>> >> > --- a/sysdeps/x86_64/strlen.S
>> >> > +++ b/sysdeps/x86_64/strlen.S
>> >> > @@ -65,12 +65,24 @@ ENTRY(strlen)
>> >> > ret
>> >> > L(n_nonzero):
>> >> > # ifdef AS_WCSLEN
>> >> > - shl $2, %RSI_LP
>> >> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
>> >> > + overflow the only way this program doesn't have undefined behavior
>> >> > + is if there is a null terminator in valid memory so strlen will
>> >> > + suffice. */
>> >> > + mov %RSI_LP, %R10_LP
>> >> > + sar $62, %R10_LP
>> >> > + test %R10_LP, %R10_LP
>> >> > + jnz __wcslen_sse2
>> >>
>> >> Branch to __wcslen_sse2 is wrong for 2 reasons:
>> >>
>> >> 1. __wcslen_sse2 is undefined with --disable-multi-arch.
>> >
>> > Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?
>> >
>> >>
>> >> 2. You should skip ENDBR64 at function entry.
>> >>
>> >> Please create a new label and branch to it.
>> >>
>> > I am not quite sure how to do this. I am trying to use
>> > strstr-sse2-unaligned.S as a template:
>> > https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
>> > which appears to make a direct call to the global label of __strchr_sse2
>> > without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.
>>
>>
>> This is different since all files are in sysdeps/x86_64/multiarch.
>
>
> I see. So it turns out we are missing wcslen_sse4_1 which strlen.S
> can also implement (it passes all tests). Would jumping to that be
> valid?
>
> Otherwise I think the best bet is to add a target for wcslen_sse4_1
> and define it and wcsnlen_sse4_1 in the same file so the label is visible.
> The only issue is the #defines in strlen.S need to all be protected which
> is a bit messy. If we don't want to define wcslen_sse4_1 for whatever
> reason, I already have this approach working with defining
> wcsnlen_sse4_1 in the same file as wcslen-sse2.S and entering from
> a local label. But looking at the code it seems the strlen.S file is a bit
> better optimized. Thoughts?
>
I see what is going on. I was confused by the SSE4 code in strlen.S.
I submitted a patch to move it to multiarch/strlen-vec.S.
Yes, we should add wcslen_sse4_1. My question is: why do we need
to branch from __wcsnlen_sse4_1 to __strlen_sse2 on overflow?
Can you make __wcsnlen_sse4_1 handle it properly on its own?
--
H.J.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
2021-06-23 3:58 ` H.J. Lu
@ 2021-06-23 4:55 ` Noah Goldstein
0 siblings, 0 replies; 27+ messages in thread
From: Noah Goldstein @ 2021-06-23 4:55 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jun 22, 2021 at 11:59 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, Jun 22, 2021 at 8:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> >
> >
> >
> > On Tue, Jun 22, 2021 at 7:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
>> On Tue, Jun 22, 2021 at 4:16 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >> >
> >> >
> >> >
> >> > On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >> >>
>> >> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >> >> >
> >> >> > This commit fixes the bug mentioned in the previous commit.
> >> >> >
> >> >> > The previous implementations of wmemchr in these files relied
> >> >> > on maxlen * sizeof(wchar_t) which was not guranteed by the standard.
> >> >> >
> >> >> > The new overflow tests added in the previous commit now
> >> >> > pass (As well as all the other tests).
> >> >> >
> >> >> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> >> >> > ---
> >> >> > sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++-------
> >> >> > sysdeps/x86_64/strlen.S | 14 ++-
> >> >> > 2 files changed, 106 insertions(+), 38 deletions(-)
> >> >> >
> >> >> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> >> > index bd2e6ee44a..b282a75613 100644
> >> >> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> >> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> >> > @@ -44,21 +44,21 @@
> >> >> >
> >> >> > # define VEC_SIZE 32
> >> >> > # define PAGE_SIZE 4096
> >> >> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> >> >> >
> >> >> > .section SECTION(.text),"ax",@progbits
> >> >> > ENTRY (STRLEN)
> >> >> > # ifdef USE_AS_STRNLEN
> >> >> > /* Check zero length. */
> >> >> > +# ifdef __ILP32__
> >> >> > + /* Clear upper bits. */
> >> >> > + and %RSI_LP, %RSI_LP
> >> >> > +# else
> >> >> > test %RSI_LP, %RSI_LP
> >> >> > +# endif
> >> >> > jz L(zero)
> >> >> > /* Store max len in R8_LP before adjusting if using WCSLEN. */
> >> >> > mov %RSI_LP, %R8_LP
> >> >> > -# ifdef USE_AS_WCSLEN
> >> >> > - shl $2, %RSI_LP
> >> >> > -# elif defined __ILP32__
> >> >> > - /* Clear the upper 32 bits. */
> >> >> > - movl %esi, %esi
> >> >> > -# endif
> >> >> > # endif
> >> >> > movl %edi, %eax
> >> >> > movq %rdi, %rdx
> >> >> > @@ -72,10 +72,10 @@ ENTRY (STRLEN)
> >> >> >
> >> >> > /* Check the first VEC_SIZE bytes. */
> >> >> > VPCMPEQ (%rdi), %ymm0, %ymm1
> >> >> > - vpmovmskb %ymm1, %eax
> >> >> > + vpmovmskb %ymm1, %eax
> >> >> > # ifdef USE_AS_STRNLEN
> >> >> > /* If length < VEC_SIZE handle special. */
> >> >> > - cmpq $VEC_SIZE, %rsi
> >> >> > + cmpq $CHAR_PER_VEC, %rsi
> >> >> > jbe L(first_vec_x0)
> >> >> > # endif
> >> >> > /* If empty continue to aligned_more. Otherwise return bit
> >> >> > @@ -84,6 +84,7 @@ ENTRY (STRLEN)
> >> >> > jz L(aligned_more)
> >> >> > tzcntl %eax, %eax
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrl $2, %eax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -97,9 +98,14 @@ L(zero):
> >> >> > L(first_vec_x0):
> >> >> > /* Set bit for max len so that tzcnt will return min of max len
> >> >> > and position of first match. */
> >> >> > +# ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Multiply length by 4 to get byte count. */
> >> >> > + sall $2, %esi
> >> >> > +# endif
> >> >> > btsq %rsi, %rax
> >> >> > tzcntl %eax, %eax
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrl $2, %eax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -113,14 +119,19 @@ L(first_vec_x1):
> >> >> > # ifdef USE_AS_STRNLEN
> >> >> > /* Use ecx which was computed earlier to compute correct value.
> >> >> > */
> >> >> > +# ifdef USE_AS_WCSLEN
> >> >> > + leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
> >> >> > +# else
> >> >> > subl $(VEC_SIZE * 4 + 1), %ecx
> >> >> > addl %ecx, %eax
> >> >> > +# endif
> >> >> > # else
> >> >> > subl %edx, %edi
> >> >> > incl %edi
> >> >> > addl %edi, %eax
> >> >> > # endif
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrl $2, %eax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -133,14 +144,19 @@ L(first_vec_x2):
> >> >> > # ifdef USE_AS_STRNLEN
> >> >> > /* Use ecx which was computed earlier to compute correct value.
> >> >> > */
> >> >> > +# ifdef USE_AS_WCSLEN
> >> >> > + leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
> >> >> > +# else
> >> >> > subl $(VEC_SIZE * 3 + 1), %ecx
> >> >> > addl %ecx, %eax
> >> >> > +# endif
> >> >> > # else
> >> >> > subl %edx, %edi
> >> >> > addl $(VEC_SIZE + 1), %edi
> >> >> > addl %edi, %eax
> >> >> > # endif
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrl $2, %eax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -153,14 +169,19 @@ L(first_vec_x3):
> >> >> > # ifdef USE_AS_STRNLEN
> >> >> > /* Use ecx which was computed earlier to compute correct value.
> >> >> > */
> >> >> > +# ifdef USE_AS_WCSLEN
> >> >> > + leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
> >> >> > +# else
> >> >> > subl $(VEC_SIZE * 2 + 1), %ecx
> >> >> > addl %ecx, %eax
> >> >> > +# endif
> >> >> > # else
> >> >> > subl %edx, %edi
> >> >> > addl $(VEC_SIZE * 2 + 1), %edi
> >> >> > addl %edi, %eax
> >> >> > # endif
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrl $2, %eax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -173,14 +194,19 @@ L(first_vec_x4):
> >> >> > # ifdef USE_AS_STRNLEN
> >> >> > /* Use ecx which was computed earlier to compute correct value.
> >> >> > */
> >> >> > +# ifdef USE_AS_WCSLEN
> >> >> > + leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
> >> >> > +# else
> >> >> > subl $(VEC_SIZE + 1), %ecx
> >> >> > addl %ecx, %eax
> >> >> > +# endif
> >> >> > # else
> >> >> > subl %edx, %edi
> >> >> > addl $(VEC_SIZE * 3 + 1), %edi
> >> >> > addl %edi, %eax
> >> >> > # endif
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrl $2, %eax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -195,10 +221,14 @@ L(cross_page_continue):
> >> >> > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> >> >> > since data is only aligned to VEC_SIZE. */
> >> >> > # ifdef USE_AS_STRNLEN
> >> >> > - /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> >> >> > - it simplies the logic in last_4x_vec_or_less. */
> >> >> > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> >> >> > + because it simplies the logic in last_4x_vec_or_less. */
> >> >> > leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> >> >> > subq %rdx, %rcx
> >> >> > +# ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get the wchar_t count. */
> >> >> > + sarl $2, %ecx
> >> >> > +# endif
> >> >> > # endif
> >> >> > /* Load first VEC regardless. */
> >> >> > VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >> >> > @@ -207,34 +237,38 @@ L(cross_page_continue):
> >> >> > subq %rcx, %rsi
> >> >> > jb L(last_4x_vec_or_less)
> >> >> > # endif
> >> >> > - vpmovmskb %ymm1, %eax
> >> >> > + vpmovmskb %ymm1, %eax
> >> >> > testl %eax, %eax
> >> >> > jnz L(first_vec_x1)
> >> >> >
> >> >> > VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> >> > - vpmovmskb %ymm1, %eax
> >> >> > + vpmovmskb %ymm1, %eax
> >> >> > testl %eax, %eax
> >> >> > jnz L(first_vec_x2)
> >> >> >
> >> >> > VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >> >> > - vpmovmskb %ymm1, %eax
> >> >> > + vpmovmskb %ymm1, %eax
> >> >> > testl %eax, %eax
> >> >> > jnz L(first_vec_x3)
> >> >> >
> >> >> > VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> >> >> > - vpmovmskb %ymm1, %eax
> >> >> > + vpmovmskb %ymm1, %eax
> >> >> > testl %eax, %eax
> >> >> > jnz L(first_vec_x4)
> >> >> >
> >> >> > /* Align data to VEC_SIZE * 4 - 1. */
> >> >> > # ifdef USE_AS_STRNLEN
> >> >> > /* Before adjusting length check if at last VEC_SIZE * 4. */
> >> >> > - cmpq $(VEC_SIZE * 4 - 1), %rsi
> >> >> > + cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
> >> >> > jbe L(last_4x_vec_or_less_load)
> >> >> > incq %rdi
> >> >> > movl %edi, %ecx
> >> >> > orq $(VEC_SIZE * 4 - 1), %rdi
> >> >> > andl $(VEC_SIZE * 4 - 1), %ecx
> >> >> > +# ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get the wchar_t count. */
> >> >> > + sarl $2, %ecx
> >> >> > +# endif
> >> >> > /* Readjust length. */
> >> >> > addq %rcx, %rsi
> >> >> > # else
> >> >> > @@ -246,13 +280,13 @@ L(cross_page_continue):
> >> >> > L(loop_4x_vec):
> >> >> > # ifdef USE_AS_STRNLEN
> >> >> > /* Break if at end of length. */
> >> >> > - subq $(VEC_SIZE * 4), %rsi
> >> >> > + subq $(CHAR_PER_VEC * 4), %rsi
> >> >> > jb L(last_4x_vec_or_less_cmpeq)
> >> >> > # endif
> >> >> > - /* Save some code size by microfusing VPMINU with the load. Since
> >> >> > - the matches in ymm2/ymm4 can only be returned if there where no
> >> >> > - matches in ymm1/ymm3 respectively there is no issue with overlap.
> >> >> > - */
> >> >> > + /* Save some code size by microfusing VPMINU with the load.
> >> >> > + Since the matches in ymm2/ymm4 can only be returned if there
> >> >> > + where no matches in ymm1/ymm3 respectively there is no issue
> >> >> > + with overlap. */
> >> >> > vmovdqa 1(%rdi), %ymm1
> >> >> > VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> >> >> > vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> >> >> > @@ -260,7 +294,7 @@ L(loop_4x_vec):
> >> >> >
> >> >> > VPMINU %ymm2, %ymm4, %ymm5
> >> >> > VPCMPEQ %ymm5, %ymm0, %ymm5
> >> >> > - vpmovmskb %ymm5, %ecx
> >> >> > + vpmovmskb %ymm5, %ecx
> >> >> >
> >> >> > subq $-(VEC_SIZE * 4), %rdi
> >> >> > testl %ecx, %ecx
> >> >> > @@ -268,27 +302,28 @@ L(loop_4x_vec):
> >> >> >
> >> >> >
> >> >> > VPCMPEQ %ymm1, %ymm0, %ymm1
> >> >> > - vpmovmskb %ymm1, %eax
> >> >> > + vpmovmskb %ymm1, %eax
> >> >> > subq %rdx, %rdi
> >> >> > testl %eax, %eax
> >> >> > jnz L(last_vec_return_x0)
> >> >> >
> >> >> > VPCMPEQ %ymm2, %ymm0, %ymm2
> >> >> > - vpmovmskb %ymm2, %eax
> >> >> > + vpmovmskb %ymm2, %eax
> >> >> > testl %eax, %eax
> >> >> > jnz L(last_vec_return_x1)
> >> >> >
> >> >> > /* Combine last 2 VEC. */
> >> >> > VPCMPEQ %ymm3, %ymm0, %ymm3
> >> >> > - vpmovmskb %ymm3, %eax
> >> >> > - /* rcx has combined result from all 4 VEC. It will only be used if
> >> >> > - the first 3 other VEC all did not contain a match. */
> >> >> > + vpmovmskb %ymm3, %eax
> >> >> > + /* rcx has combined result from all 4 VEC. It will only be used
> >> >> > + if the first 3 other VEC all did not contain a match. */
> >> >> > salq $32, %rcx
> >> >> > orq %rcx, %rax
> >> >> > tzcntq %rax, %rax
> >> >> > subq $(VEC_SIZE * 2 - 1), %rdi
> >> >> > addq %rdi, %rax
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrq $2, %rax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -297,15 +332,19 @@ L(loop_4x_vec):
> >> >> > # ifdef USE_AS_STRNLEN
> >> >> > .p2align 4
> >> >> > L(last_4x_vec_or_less_load):
> >> >> > - /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
> >> >> > + /* Depending on entry adjust rdi / prepare first VEC in ymm1.
> >> >> > + */
> >> >> > subq $-(VEC_SIZE * 4), %rdi
> >> >> > L(last_4x_vec_or_less_cmpeq):
> >> >> > VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >> >> > L(last_4x_vec_or_less):
> >> >> > -
> >> >> > - vpmovmskb %ymm1, %eax
> >> >> > - /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> >> >> > - VEC_SIZE * 4. */
> >> >> > +# ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Multiply length by 4 to get byte count. */
> >> >> > + sall $2, %esi
> >> >> > +# endif
> >> >> > + vpmovmskb %ymm1, %eax
> >> >> > + /* If remaining length > VEC_SIZE * 2. This works if esi is off
> >> >> > + by VEC_SIZE * 4. */
> >> >> > testl $(VEC_SIZE * 2), %esi
> >> >> > jnz L(last_4x_vec)
> >> >> >
> >> >> > @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
> >> >> > jb L(max)
> >> >> >
> >> >> > VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> >> > - vpmovmskb %ymm1, %eax
> >> >> > + vpmovmskb %ymm1, %eax
> >> >> > tzcntl %eax, %eax
> >> >> > /* Check the end of data. */
> >> >> > cmpl %eax, %esi
> >> >> > @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
> >> >> > addl $(VEC_SIZE + 1), %eax
> >> >> > addq %rdi, %rax
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrq $2, %rax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -340,6 +380,7 @@ L(last_vec_return_x0):
> >> >> > subq $(VEC_SIZE * 4 - 1), %rdi
> >> >> > addq %rdi, %rax
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrq $2, %rax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -350,6 +391,7 @@ L(last_vec_return_x1):
> >> >> > subq $(VEC_SIZE * 3 - 1), %rdi
> >> >> > addq %rdi, %rax
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrq $2, %rax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -366,6 +408,7 @@ L(last_vec_x1_check):
> >> >> > incl %eax
> >> >> > addq %rdi, %rax
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrq $2, %rax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -381,14 +424,14 @@ L(last_4x_vec):
> >> >> > jnz L(last_vec_x1)
> >> >> >
> >> >> > VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> >> > - vpmovmskb %ymm1, %eax
> >> >> > + vpmovmskb %ymm1, %eax
> >> >> > testl %eax, %eax
> >> >> > jnz L(last_vec_x2)
> >> >> >
> >> >> > /* Normalize length. */
> >> >> > andl $(VEC_SIZE * 4 - 1), %esi
> >> >> > VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >> >> > - vpmovmskb %ymm1, %eax
> >> >> > + vpmovmskb %ymm1, %eax
> >> >> > testl %eax, %eax
> >> >> > jnz L(last_vec_x3)
> >> >> >
> >> >> > @@ -396,7 +439,7 @@ L(last_4x_vec):
> >> >> > jb L(max)
> >> >> >
> >> >> > VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> >> >> > - vpmovmskb %ymm1, %eax
> >> >> > + vpmovmskb %ymm1, %eax
> >> >> > tzcntl %eax, %eax
> >> >> > /* Check the end of data. */
> >> >> > cmpl %eax, %esi
> >> >> > @@ -405,6 +448,7 @@ L(last_4x_vec):
> >> >> > addl $(VEC_SIZE * 3 + 1), %eax
> >> >> > addq %rdi, %rax
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrq $2, %rax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -419,6 +463,7 @@ L(last_vec_x1):
> >> >> > incl %eax
> >> >> > addq %rdi, %rax
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrq $2, %rax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -432,6 +477,7 @@ L(last_vec_x2):
> >> >> > addl $(VEC_SIZE + 1), %eax
> >> >> > addq %rdi, %rax
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrq $2, %rax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -447,6 +493,7 @@ L(last_vec_x3):
> >> >> > addl $(VEC_SIZE * 2 + 1), %eax
> >> >> > addq %rdi, %rax
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > shrq $2, %rax
> >> >> > # endif
> >> >> > VZEROUPPER_RETURN
> >> >> > @@ -455,13 +502,13 @@ L(max_end):
> >> >> > VZEROUPPER_RETURN
> >> >> > # endif
> >> >> >
> >> >> > - /* Cold case for crossing page with first load. */
> >> >> > + /* Cold case for crossing page with first load. */
> >> >> > .p2align 4
> >> >> > L(cross_page_boundary):
> >> >> > /* Align data to VEC_SIZE - 1. */
> >> >> > orq $(VEC_SIZE - 1), %rdi
> >> >> > VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> >> >> > - vpmovmskb %ymm1, %eax
> >> >> > + vpmovmskb %ymm1, %eax
> >> >> > /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> >> >> > so no need to manually mod rdx. */
> >> >> > sarxl %edx, %eax, %eax
> >> >> > @@ -470,6 +517,10 @@ L(cross_page_boundary):
> >> >> > jnz L(cross_page_less_vec)
> >> >> > leaq 1(%rdi), %rcx
> >> >> > subq %rdx, %rcx
> >> >> > +# ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide bytes by 4 to get wchar_t count. */
> >> >> > + shrl $2, %ecx
> >> >> > +# endif
> >> >> > /* Check length. */
> >> >> > cmpq %rsi, %rcx
> >> >> > jb L(cross_page_continue)
> >> >> > @@ -479,6 +530,7 @@ L(cross_page_boundary):
> >> >> > jz L(cross_page_continue)
> >> >> > tzcntl %eax, %eax
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Divide length by 4 to get wchar_t count. */
> >> >> > shrl $2, %eax
> >> >> > # endif
> >> >> > # endif
> >> >> > @@ -489,6 +541,10 @@ L(return_vzeroupper):
> >> >> > .p2align 4
> >> >> > L(cross_page_less_vec):
> >> >> > tzcntl %eax, %eax
> >> >> > +# ifdef USE_AS_WCSLEN
> >> >> > + /* NB: Multiply length by 4 to get byte count. */
> >> >> > + sall $2, %esi
> >> >> > +# endif
> >> >> > cmpq %rax, %rsi
> >> >> > cmovb %esi, %eax
> >> >> > # ifdef USE_AS_WCSLEN
> >> >> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> >> >> > index d223ea1700..3fc6734910 100644
> >> >> > --- a/sysdeps/x86_64/strlen.S
> >> >> > +++ b/sysdeps/x86_64/strlen.S
> >> >> > @@ -65,12 +65,24 @@ ENTRY(strlen)
> >> >> > ret
> >> >> > L(n_nonzero):
> >> >> > # ifdef AS_WCSLEN
> >> >> > - shl $2, %RSI_LP
> >> >> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> >> >> > + overflow the only way this program doesn't have undefined behavior
> >> >> > + is if there is a null terminator in valid memory so strlen will
> >> >> > + suffice. */
> >> >> > + mov %RSI_LP, %R10_LP
> >> >> > + sar $62, %R10_LP
> >> >> > + test %R10_LP, %R10_LP
> >> >> > + jnz __wcslen_sse2
> >> >>
> >> >> Branch to __wcslen_sse2 is wrong for 2 reasons:
> >> >>
> >> >> 1. __wcslen_sse2 is undefined with --disable-multi-arch.
> >> >
> >> > Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?
> >> >
> >> >>
> >> >> 2. You should skip ENDBR64 at function entry.
> >> >>
> >> >> Please create a new label and branch to it.
> >> >>
> >> > I am not quite sure how to do this. I am trying to use
> >> > strstr-sse2-unaligned.S as a template:
> >> >
> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
> >> > which appears to make a direct call to the global label of __strchr_sse2
> >> > without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.
> >>
> >>
> >> This is different since all files are in sysdeps/x86_64/multiarch.
> >
> >
> > I see. So it turns out we are missing wcslen_sse4_1 which strlen.S
> > can also implement (it passes all tests). Would jumping to that be
> > valid?
> >
> > Otherwise I think the best bet is to add a target for wcslen_sse4_1
> > and define it and wcsnlen_sse4_1 in the same file so the label is visible.
> > The only issue is the #defines in strlen.S need to all be protected which
> > is a bit messy. If we don't want to define wcslen_sse4_1 for whatever
> > reason, I already have this approach working with defining
> > wcsnlen_sse4_1 in the same file as wcslen-sse2.S and entering from
> > a local label. But looking at the code it seems the strlen.S file is a bit
> > better optimized. Thoughts?
> >
>
> I see what is going on. I was confused by SSE4 codes in strlen.S.
> I submitted a patch to move it to multiarch/strlen-vec.S.
> Yes, we should add wcslen_sse4_1. My question is why we need
> to branch from __wcsnlen_sse4_1 to __strlen_sse2 with overflow?
> Can you make __wcsnlen_sse4_1 to properly handle it directly?
>
The current approach makes it non-trivial:
# define STRNLEN_PROLOG \
mov %r11, %rsi; \
subq %rax, %rsi; \
andq $-64, %rax; \
testq $-64, %rsi; \
je L(strnlen_ret)
AFAICT this forces the length to be in bytes, and rewriting it
affects the entire file's logic.
I considered porting the avx2 solution, but I don't think it really fits:
it depends on the results from all 4x VEC fitting in a 64 bit register,
and on the much better branch predictors of the machines that run avx2.
I also think the overflow case is likely faster going through
wcslen, given that all of the length bookkeeping / branches
can be dropped, although it definitely does pessimize the common
no-overflow case.
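(In C terms, the fallback being described is roughly the following -- a
sketch only, resting on the assumption quoted above that an overflowing
maxlen implies a null terminator in valid memory; the real dispatch
happens in assembly:)

#include <stdint.h>
#include <wchar.h>

size_t
wcsnlen_with_fallback (const wchar_t *s, size_t maxlen)
{
  /* If maxlen * sizeof (wchar_t) would overflow, a well-defined call
     must terminate before maxlen characters, so the unbounded wcslen
     returns the same result and all length bookkeeping is dropped.  */
  if (maxlen > SIZE_MAX / sizeof (wchar_t))
    return wcslen (s);
  return wcsnlen (s, maxlen);   /* stand-in for the bounded SIMD path */
}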
> --
> H.J.
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974]
2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
` (5 preceding siblings ...)
2021-06-22 18:11 ` [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
@ 2021-06-23 6:31 ` Noah Goldstein
2021-06-23 17:30 ` H.J. Lu
2021-06-23 6:31 ` [PATCH v3 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
2021-06-23 6:31 ` [PATCH v3 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
8 siblings, 1 reply; 27+ messages in thread
From: Noah Goldstein @ 2021-06-23 6:31 UTC (permalink / raw)
To: libc-alpha
This commit adds tests for a bug in the wide char variants of the
functions, where the implementation may assume that maxlen for wcsnlen
or n for wmemchr/strncat will not overflow when multiplied by
sizeof(wchar_t).
These tests show the following implementations failing on x86_64:
wcsnlen-sse4_1
wcsnlen-avx2
wmemchr-sse2
wmemchr-avx2
strncat would fail as well if it were run on a system that preferred
either of the failing wcsnlen implementations, as it relies on
wcsnlen.
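(A minimal illustration of the overflow these tests probe, assuming a
hypothetical implementation that scales the element count to bytes up
front; sizeof (wchar_t) == 4 on x86_64:)

#include <stdint.h>
#include <wchar.h>

int
main (void)
{
  size_t maxlen = SIZE_MAX;                  /* "unbounded" wcsnlen call */
  size_t nbytes = maxlen * sizeof (wchar_t); /* wraps to SIZE_MAX - 3 */
  /* A byte-bounded scan using nbytes stops early (or wraps its end
     pointer) instead of scanning up to the null terminator.  */
  return nbytes < maxlen;                    /* 1: the multiply wrapped */
}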
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
Rebased on: [PATCH v1 1/4] x86-64: Add wcslen optimize for sse4.1
string/test-memchr.c | 39 ++++++++++++++++++++++++---
string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
string/test-strnlen.c | 33 +++++++++++++++++++++++
3 files changed, 130 insertions(+), 3 deletions(-)
diff --git a/string/test-memchr.c b/string/test-memchr.c
index 665edc32af..ce964284aa 100644
--- a/string/test-memchr.c
+++ b/string/test-memchr.c
@@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
CHAR *res = CALL (impl, s, c, n);
if (res != exp_res)
{
- error (0, 0, "Wrong result in function %s %p %p", impl->name,
- res, exp_res);
+ error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
+ impl->name, s, c, n, res, exp_res);
ret = 1;
return;
}
@@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
}
buf[align + len] = 0;
- if (pos < len)
+ if (pos < MIN(n, len))
{
buf[align + pos] = seek_char;
buf[align + len] = -seek_char;
@@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
}
+static void
+do_overflow_tests (void)
+{
+ size_t i, j, len;
+ const size_t one = 1;
+ uintptr_t buf_addr = (uintptr_t) buf1;
+
+ for (i = 0; i < 750; ++i)
+ {
+ do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
+ do_test (0, i, 751, i - buf_addr, BIG_CHAR);
+ do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
+ do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
+ do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+ len = 0;
+ for (j = 8 * sizeof(size_t) - 1; j ; --j)
+ {
+ len |= one << j;
+ do_test (0, i, 751, len - i, BIG_CHAR);
+ do_test (0, i, 751, len + i, BIG_CHAR);
+ do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
+ do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
+
+ do_test (0, i, 751, ~len - i, BIG_CHAR);
+ do_test (0, i, 751, ~len + i, BIG_CHAR);
+ do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
+ do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
+ }
+ }
+}
+
static void
do_random_tests (void)
{
@@ -221,6 +253,7 @@ test_main (void)
do_test (page_size / 2 - i, i, i, 1, 0x9B);
do_random_tests ();
+ do_overflow_tests ();
return ret;
}
diff --git a/string/test-strncat.c b/string/test-strncat.c
index 2ef917b820..37ea26ea05 100644
--- a/string/test-strncat.c
+++ b/string/test-strncat.c
@@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
}
}
+static void
+do_overflow_tests (void)
+{
+ size_t i, j, len;
+ const size_t one = 1;
+ CHAR *s1, *s2;
+ uintptr_t s1_addr;
+ s1 = (CHAR *) buf1;
+ s2 = (CHAR *) buf2;
+ s1_addr = (uintptr_t)s1;
+ for (j = 0; j < 200; ++j)
+ s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
+ s2[200] = 0;
+ for (i = 0; i < 750; ++i) {
+ for (j = 0; j < i; ++j)
+ s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
+ s1[i] = '\0';
+
+ FOR_EACH_IMPL (impl, 0)
+ {
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, SIZE_MAX - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, i - s1_addr);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, -s1_addr - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
+ }
+
+ len = 0;
+ for (j = 8 * sizeof(size_t) - 1; j ; --j)
+ {
+ len |= one << j;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, len - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, len + i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, len - s1_addr - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, len - s1_addr + i);
+
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, ~len - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, ~len + i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, ~len - s1_addr - i);
+ s2[200] = '\0';
+ do_one_test (impl, s2, s1, ~len - s1_addr + i);
+ }
+ }
+ }
+}
+
static void
do_random_tests (void)
{
@@ -316,6 +376,7 @@ test_main (void)
}
do_random_tests ();
+ do_overflow_tests ();
return ret;
}
diff --git a/string/test-strnlen.c b/string/test-strnlen.c
index 920f58e97b..f53e09263f 100644
--- a/string/test-strnlen.c
+++ b/string/test-strnlen.c
@@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
}
+static void
+do_overflow_tests (void)
+{
+ size_t i, j, len;
+ const size_t one = 1;
+ uintptr_t buf_addr = (uintptr_t) buf1;
+
+ for (i = 0; i < 750; ++i)
+ {
+ do_test (0, i, SIZE_MAX - i, BIG_CHAR);
+ do_test (0, i, i - buf_addr, BIG_CHAR);
+ do_test (0, i, -buf_addr - i, BIG_CHAR);
+ do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
+ do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+ len = 0;
+ for (j = 8 * sizeof(size_t) - 1; j ; --j)
+ {
+ len |= one << j;
+ do_test (0, i, len - i, BIG_CHAR);
+ do_test (0, i, len + i, BIG_CHAR);
+ do_test (0, i, len - buf_addr - i, BIG_CHAR);
+ do_test (0, i, len - buf_addr + i, BIG_CHAR);
+
+ do_test (0, i, ~len - i, BIG_CHAR);
+ do_test (0, i, ~len + i, BIG_CHAR);
+ do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
+ do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
+ }
+ }
+}
+
static void
do_random_tests (void)
{
@@ -283,6 +315,7 @@ test_main (void)
do_random_tests ();
do_page_tests ();
do_page_2_tests ();
+ do_overflow_tests ();
return ret;
}
--
2.25.1
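(Why the tests pick maxlen/n values relative to the buffer address: the
nominal end pointer then wraps around the address space. A sketch of the
arithmetic only, with illustrative names:)

#include <stddef.h>
#include <stdint.h>

int
main (void)
{
  char buf[1];
  uintptr_t buf_addr = (uintptr_t) buf;
  size_t i = 5;               /* any small offset, as in the loops above */
  size_t n = -buf_addr - i;   /* one of the tested lengths */
  /* buf_addr + n wraps to (uintptr_t) -i, just below the top of the
     address space, so "s + n" style end-of-buffer checks misbehave.  */
  return buf_addr + n == (uintptr_t) -i ? 0 : 1;
}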
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v3 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ #27974]
From: Noah Goldstein @ 2021-06-23 6:31 UTC
To: libc-alpha
This commit fixes the bug mentioned in the previous commit.
The previous implementations of wmemchr in these files relied on
n * sizeof(wchar_t) not overflowing, which is not guaranteed by the
standard.
The new overflow tests added in the previous commit now
pass (as well as all the other tests).
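For a concrete picture of the failure mode the tests exercise
(illustrative only; assumes a 64-bit size_t, 4-byte wchar_t, and a
match in valid memory, as the new tests arrange):

  #include <assert.h>
  #include <wchar.h>

  int
  main (void)
  {
    wchar_t buf[8] = { 0 };
    buf[2] = L'x';
    /* n == 2^62: the old code scaled n to a byte count with
       "shl $2", and 2^62 * sizeof (wchar_t) wraps to 0, so the
       remaining-length bookkeeping could drive the search to return
       NULL instead of &buf[2].  */
    wchar_t *p = wmemchr (buf, L'x', (size_t) 1 << 62);
    assert (p == &buf[2]);
    return 0;
  }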
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++-------
sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
2 files changed, 98 insertions(+), 37 deletions(-)
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index beff2708de..3ddc4655cf 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
#ifdef USE_AS_WMEMCHR
# define MEMCHR wmemchr
# define PCMPEQ pcmpeqd
+# define CHAR_PER_VEC 4
#else
# define MEMCHR memchr
# define PCMPEQ pcmpeqb
+# define CHAR_PER_VEC 16
#endif
/* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
movd %esi, %xmm1
mov %edi, %ecx
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+#endif
#ifdef USE_AS_WMEMCHR
test %RDX_LP, %RDX_LP
jz L(return_null)
- shl $2, %RDX_LP
#else
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
-# endif
punpcklbw %xmm1, %xmm1
test %RDX_LP, %RDX_LP
jz L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
test %eax, %eax
jnz L(matches_1)
- sub $16, %rdx
+ sub $CHAR_PER_VEC, %rdx
jbe L(return_null)
add $16, %rdi
and $15, %ecx
and $-16, %rdi
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
jmp L(loop_prolog)
@@ -77,16 +81,21 @@ L(crosscache):
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
-/* Check if there is a match. */
+ /* Check if there is a match. */
pmovmskb %xmm0, %eax
-/* Remove the leading bytes. */
+ /* Remove the leading bytes. */
sar %cl, %eax
test %eax, %eax
je L(unaligned_no_match)
-/* Check which byte is a match. */
+ /* Check which byte is a match. */
bsf %eax, %eax
-
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
add %rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):
.p2align 4
L(unaligned_no_match):
- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
+ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
"rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
possible addition overflow. */
neg %rcx
add $16, %rcx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
sub %rcx, %rdx
jbe L(return_null)
add $16, %rdi
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
test $0x3f, %rdi
jz L(align64_loop)
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
mov %rdi, %rcx
and $-64, %rdi
and $63, %ecx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
.p2align 4
L(align64_loop):
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
movdqa 16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):
.p2align 4
L(exit_loop):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
jle L(exit_loop_32)
movdqa (%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jle L(return_null)
PCMPEQ 48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):
.p2align 4
L(exit_loop_32):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jbe L(return_null)
PCMPEQ 16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
.p2align 4
L(matches_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
ret
@@ -301,7 +322,13 @@ L(matches_1):
.p2align 4
L(matches16_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 16(%rdi, %rax), %rax
ret
@@ -309,7 +336,13 @@ L(matches16_1):
.p2align 4
L(matches32_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 32(%rdi, %rax), %rax
ret
@@ -317,7 +350,13 @@ L(matches32_1):
.p2align 4
L(matches48_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 48(%rdi, %rax), %rax
ret
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 0d8758e3e7..afdb956502 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -54,21 +54,19 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
- test %RDX_LP, %RDX_LP
- jz L(null)
-# endif
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
+ /* Clear upper bits. */
+ and %RDX_LP, %RDX_LP
+# else
+ test %RDX_LP, %RDX_LP
# endif
+ jz L(null)
# endif
/* Broadcast CHAR to YMMMATCH. */
vmovd %esi, %xmm0
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
/* If length < CHAR_PER_VEC handle special. */
- cmpq $VEC_SIZE, %rdx
+ cmpq $CHAR_PER_VEC, %rdx
jbe L(first_vec_x0)
# endif
testl %eax, %eax
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
L(first_vec_x0):
/* Check if first match was before length. */
tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
xorl %ecx, %ecx
cmpl %eax, %edx
leaq (%rdi, %rax), %rax
@@ -110,12 +112,12 @@ L(null):
# endif
.p2align 4
L(cross_page_boundary):
- /* Save pointer before aligning as its original value is necessary
- for computer return address if byte is found or adjusting length
- if it is not and this is memchr. */
+ /* Save pointer before aligning as its original value is
+ necessary for computer return address if byte is found or
+ adjusting length if it is not and this is memchr. */
movq %rdi, %rcx
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
- rdi for rawmemchr. */
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+ and rdi for rawmemchr. */
orq $(VEC_SIZE - 1), %ALGN_PTR_REG
VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
@@ -124,6 +126,10 @@ L(cross_page_boundary):
match). */
leaq 1(%ALGN_PTR_REG), %rsi
subq %RRAW_PTR_REG, %rsi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %esi
+# endif
# endif
/* Remove the leading bytes. */
sarxl %ERAW_PTR_REG, %eax, %eax
@@ -181,6 +187,10 @@ L(cross_page_continue):
orq $(VEC_SIZE - 1), %rdi
/* esi is for adjusting length to see if near the end. */
leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
# else
orq $(VEC_SIZE - 1), %rdi
L(cross_page_continue):
@@ -213,7 +223,7 @@ L(cross_page_continue):
# ifndef USE_AS_RAWMEMCHR
/* Check if at last VEC_SIZE * 4 length. */
- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
jbe L(last_4x_vec_or_less_cmpeq)
/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
length. */
@@ -221,6 +231,10 @@ L(cross_page_continue):
movl %edi, %ecx
orq $(VEC_SIZE * 4 - 1), %rdi
andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
addq %rcx, %rdx
# else
/* Align data to VEC_SIZE * 4 - 1 for loop. */
@@ -250,15 +264,19 @@ L(loop_4x_vec):
subq $-(VEC_SIZE * 4), %rdi
- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_4x_vec)
- /* Fall through into less than 4 remaining vectors of length case.
- */
+ /* Fall through into less than 4 remaining vectors of length
+ case. */
VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
.p2align 4
L(last_4x_vec_or_less):
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
/* Check if first VEC contained match. */
testl %eax, %eax
jnz L(first_vec_x1_check)
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
L(last_4x_vec_or_less_cmpeq):
VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
subq $-(VEC_SIZE * 4), %rdi
/* Check first VEC regardless. */
testl %eax, %eax
--
2.25.1
* [PATCH v3 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
From: Noah Goldstein @ 2021-06-23 6:31 UTC
To: libc-alpha
This commit fixes the bug mentioned in the previous commit.
The previous implementations of wcsnlen in these files relied on
maxlen * sizeof(wchar_t) not overflowing, which is not guaranteed by
the standard.
The new overflow tests added in the previous commit now
pass (as well as all the other tests).
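The strlen-vec.S part of the fix below can be paraphrased in C
roughly as follows (a sketch only; wcsnlen_sketch is a hypothetical
name, and the sketch assumes a 64-bit size_t and 4-byte wchar_t):

  #include <stdint.h>
  #include <wchar.h>

  size_t
  wcsnlen_sketch (const wchar_t *s, size_t maxlen)
  {
    /* If maxlen * sizeof (wchar_t) would overflow, the only way the
       call avoids undefined behavior is if a null terminator exists
       in valid memory, so plain wcslen suffices; its result is then
       necessarily below maxlen.  */
    if (maxlen >> 62)
      return wcslen (s);
    size_t nbytes = maxlen * sizeof (wchar_t);
    /* Likewise if s + maxlen * sizeof (wchar_t) would wrap.  */
    if ((uintptr_t) s + nbytes < (uintptr_t) s)
      return wcslen (s);
    /* Otherwise run the bounded vectorized scan; wcslen stands in
       for it here.  */
    size_t len = wcslen (s);
    return len < maxlen ? len : maxlen;
  }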
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
sysdeps/x86_64/multiarch/strlen-vec.S | 15 ++-
2 files changed, 107 insertions(+), 38 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index bd2e6ee44a..b282a75613 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -44,21 +44,21 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
/* Check zero length. */
+# ifdef __ILP32__
+ /* Clear upper bits. */
+ and %RSI_LP, %RSI_LP
+# else
test %RSI_LP, %RSI_LP
+# endif
jz L(zero)
/* Store max len in R8_LP before adjusting if using WCSLEN. */
mov %RSI_LP, %R8_LP
-# ifdef USE_AS_WCSLEN
- shl $2, %RSI_LP
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- movl %esi, %esi
-# endif
# endif
movl %edi, %eax
movq %rdi, %rdx
@@ -72,10 +72,10 @@ ENTRY (STRLEN)
/* Check the first VEC_SIZE bytes. */
VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
# ifdef USE_AS_STRNLEN
/* If length < VEC_SIZE handle special. */
- cmpq $VEC_SIZE, %rsi
+ cmpq $CHAR_PER_VEC, %rsi
jbe L(first_vec_x0)
# endif
/* If empty continue to aligned_more. Otherwise return bit
@@ -84,6 +84,7 @@ ENTRY (STRLEN)
jz L(aligned_more)
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -97,9 +98,14 @@ L(zero):
L(first_vec_x0):
/* Set bit for max len so that tzcnt will return min of max len
and position of first match. */
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
btsq %rsi, %rax
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -113,14 +119,19 @@ L(first_vec_x1):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 4 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
incl %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -133,14 +144,19 @@ L(first_vec_x2):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 3 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -153,14 +169,19 @@ L(first_vec_x3):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 2 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE * 2 + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -173,14 +194,19 @@ L(first_vec_x4):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE * 3 + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -195,10 +221,14 @@ L(cross_page_continue):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
# ifdef USE_AS_STRNLEN
- /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
- it simplies the logic in last_4x_vec_or_less. */
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
+ because it simplies the logic in last_4x_vec_or_less. */
leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
# endif
/* Load first VEC regardless. */
VPCMPEQ 1(%rdi), %ymm0, %ymm1
@@ -207,34 +237,38 @@ L(cross_page_continue):
subq %rcx, %rsi
jb L(last_4x_vec_or_less)
# endif
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x1)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x4)
/* Align data to VEC_SIZE * 4 - 1. */
# ifdef USE_AS_STRNLEN
/* Before adjusting length check if at last VEC_SIZE * 4. */
- cmpq $(VEC_SIZE * 4 - 1), %rsi
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
jbe L(last_4x_vec_or_less_load)
incq %rdi
movl %edi, %ecx
orq $(VEC_SIZE * 4 - 1), %rdi
andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
/* Readjust length. */
addq %rcx, %rsi
# else
@@ -246,13 +280,13 @@ L(cross_page_continue):
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
/* Break if at end of length. */
- subq $(VEC_SIZE * 4), %rsi
+ subq $(CHAR_PER_VEC * 4), %rsi
jb L(last_4x_vec_or_less_cmpeq)
# endif
- /* Save some code size by microfusing VPMINU with the load. Since
- the matches in ymm2/ymm4 can only be returned if there where no
- matches in ymm1/ymm3 respectively there is no issue with overlap.
- */
+ /* Save some code size by microfusing VPMINU with the load.
+ Since the matches in ymm2/ymm4 can only be returned if there
+ where no matches in ymm1/ymm3 respectively there is no issue
+ with overlap. */
vmovdqa 1(%rdi), %ymm1
VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
@@ -260,7 +294,7 @@ L(loop_4x_vec):
VPMINU %ymm2, %ymm4, %ymm5
VPCMPEQ %ymm5, %ymm0, %ymm5
- vpmovmskb %ymm5, %ecx
+ vpmovmskb %ymm5, %ecx
subq $-(VEC_SIZE * 4), %rdi
testl %ecx, %ecx
@@ -268,27 +302,28 @@ L(loop_4x_vec):
VPCMPEQ %ymm1, %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
subq %rdx, %rdi
testl %eax, %eax
jnz L(last_vec_return_x0)
VPCMPEQ %ymm2, %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(last_vec_return_x1)
/* Combine last 2 VEC. */
VPCMPEQ %ymm3, %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
- /* rcx has combined result from all 4 VEC. It will only be used if
- the first 3 other VEC all did not contain a match. */
+ vpmovmskb %ymm3, %eax
+ /* rcx has combined result from all 4 VEC. It will only be used
+ if the first 3 other VEC all did not contain a match. */
salq $32, %rcx
orq %rcx, %rax
tzcntq %rax, %rax
subq $(VEC_SIZE * 2 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -297,15 +332,19 @@ L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
.p2align 4
L(last_4x_vec_or_less_load):
- /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1.
+ */
subq $-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
VPCMPEQ 1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
-
- vpmovmskb %ymm1, %eax
- /* If remaining length > VEC_SIZE * 2. This works if esi is off by
- VEC_SIZE * 4. */
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
+ vpmovmskb %ymm1, %eax
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off
+ by VEC_SIZE * 4. */
testl $(VEC_SIZE * 2), %esi
jnz L(last_4x_vec)
@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
jb L(max)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
cmpl %eax, %esi
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
subq $(VEC_SIZE * 4 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
subq $(VEC_SIZE * 3 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
incl %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -381,14 +424,14 @@ L(last_4x_vec):
jnz L(last_vec_x1)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(last_vec_x2)
/* Normalize length. */
andl $(VEC_SIZE * 4 - 1), %esi
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(last_vec_x3)
@@ -396,7 +439,7 @@ L(last_4x_vec):
jb L(max)
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
cmpl %eax, %esi
@@ -405,6 +448,7 @@ L(last_4x_vec):
addl $(VEC_SIZE * 3 + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -419,6 +463,7 @@ L(last_vec_x1):
incl %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -432,6 +477,7 @@ L(last_vec_x2):
addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -447,6 +493,7 @@ L(last_vec_x3):
addl $(VEC_SIZE * 2 + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -455,13 +502,13 @@ L(max_end):
VZEROUPPER_RETURN
# endif
- /* Cold case for crossing page with first load. */
+ /* Cold case for crossing page with first load. */
.p2align 4
L(cross_page_boundary):
/* Align data to VEC_SIZE - 1. */
orq $(VEC_SIZE - 1), %rdi
VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
so no need to manually mod rdx. */
sarxl %edx, %eax, %eax
@@ -470,6 +517,10 @@ L(cross_page_boundary):
jnz L(cross_page_less_vec)
leaq 1(%rdi), %rcx
subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %ecx
+# endif
/* Check length. */
cmpq %rsi, %rcx
jb L(cross_page_continue)
@@ -479,6 +530,7 @@ L(cross_page_boundary):
jz L(cross_page_continue)
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide length by 4 to get wchar_t count. */
shrl $2, %eax
# endif
# endif
@@ -489,6 +541,10 @@ L(return_vzeroupper):
.p2align 4
L(cross_page_less_vec):
tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
cmpq %rax, %rsi
cmovb %esi, %eax
# ifdef USE_AS_WCSLEN
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
index 8f660bb9c7..439e486a43 100644
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -65,12 +65,25 @@ ENTRY(strlen)
ret
L(n_nonzero):
# ifdef AS_WCSLEN
- shl $2, %RSI_LP
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+ overflow the only way this program doesn't have undefined behavior
+ is if there is a null terminator in valid memory so wcslen will
+ suffice. */
+ mov %RSI_LP, %R10_LP
+ sar $62, %R10_LP
+ test %R10_LP, %R10_LP
+ jnz __wcslen_sse4_1
+ sal $2, %RSI_LP
# endif
+
/* Initialize long lived registers. */
add %RDI_LP, %RSI_LP
+# ifdef AS_WCSLEN
+/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
+ jbe __wcslen_sse4_1
+# endif
mov %RSI_LP, %R10_LP
and $-64, %R10_LP
mov %RSI_LP, %R11_LP
--
2.25.1
* Re: [PATCH v3 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
From: H.J. Lu @ 2021-06-23 17:27 UTC
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jun 22, 2021 at 11:32 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit fixes the bug mentioned in the previous commit.
>
> The previous implementations of wcsnlen in these files relied on
> maxlen * sizeof(wchar_t) not overflowing, which is not guaranteed
> by the standard.
>
> The new overflow tests added in the previous commit now
> pass (as well as all the other tests).
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
* Re: [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974]
From: H.J. Lu @ 2021-06-23 17:30 UTC
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jun 22, 2021 at 11:32 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit adds tests for a bug in the wide char variant of the
> functions where the implementation may assume that maxlen for wcsnlen
> or n for wmemchr/strncat will not overflow when multiplied by
> sizeof(wchar_t).
>
> These tests show the following implementations failing on x86_64:
>
> wcsnlen-sse4_1
> wcsnlen-avx2
>
> wmemchr-sse2
> wmemchr-avx2
>
> strncat would fail as well if it were on a system that preferred
> either of the wcsnlen implementations that failed, as it relies on
> wcsnlen.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> Rebased on: [PATCH v1 1/4] x86-64: Add wcslen optimize for sse4.1
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
* Re: [PATCH v3 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ #27974]
From: H.J. Lu @ 2021-06-23 17:30 UTC
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jun 22, 2021 at 11:32 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit fixes the bug mentioned in the previous commit.
>
> The previous implementations of wmemchr in these files relied on
> n * sizeof(wchar_t) not overflowing, which is not guaranteed by the
> standard.
>
> The new overflow tests added in the previous commit now
> pass (as well as all the other tests).
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
* Re: [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974]
From: Noah Goldstein @ 2021-06-23 18:30 UTC
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Wed, Jun 23, 2021 at 1:30 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, Jun 22, 2021 at 11:32 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> > This commit adds tests for a bug in the wide char variant of the
> > functions where the implementation may assume that maxlen for wcsnlen
> > or n for wmemchr/strncat will not overflow when multiplied by
> > sizeof(wchar_t).
> >
> > These tests show the following implementations failing on x86_64:
> >
> > wcsnlen-sse4_1
> > wcsnlen-avx2
> >
> > wmemchr-sse2
> > wmemchr-avx2
> >
> > strncat would fail as well if it were on a system that preferred
> > either of the wcsnlen implementations that failed, as it relies on
> > wcsnlen.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> > Rebased on: [PATCH v1 1/4] x86-64: Add wcslen optimize for sse4.1
> > string/test-memchr.c | 39 ++++++++++++++++++++++++---
> > string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
> > string/test-strnlen.c | 33 +++++++++++++++++++++++
> > 3 files changed, 130 insertions(+), 3 deletions(-)
> >
> > diff --git a/string/test-memchr.c b/string/test-memchr.c
> > index 665edc32af..ce964284aa 100644
> > --- a/string/test-memchr.c
> > +++ b/string/test-memchr.c
> > @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c,
> size_t n, CHAR *exp_res)
> > CHAR *res = CALL (impl, s, c, n);
> > if (res != exp_res)
> > {
> > - error (0, 0, "Wrong result in function %s %p %p", impl->name,
> > - res, exp_res);
> > + error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p !=
> %p",
> > + impl->name, s, c, n, res, exp_res);
> > ret = 1;
> > return;
> > }
> > @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t
> n, int seek_char)
> > }
> > buf[align + len] = 0;
> >
> > - if (pos < len)
> > + if (pos < MIN(n, len))
> > {
> > buf[align + pos] = seek_char;
> > buf[align + len] = -seek_char;
> > @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len,
> size_t n, int seek_char)
> > do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
> > }
> >
> > +static void
> > +do_overflow_tests (void)
> > +{
> > + size_t i, j, len;
> > + const size_t one = 1;
> > + uintptr_t buf_addr = (uintptr_t) buf1;
> > +
> > + for (i = 0; i < 750; ++i)
> > + {
> > + do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
> > + do_test (0, i, 751, i - buf_addr, BIG_CHAR);
> > + do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
> > + do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
> > + do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
> > +
> > + len = 0;
> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
> > + {
> > + len |= one << j;
> > + do_test (0, i, 751, len - i, BIG_CHAR);
> > + do_test (0, i, 751, len + i, BIG_CHAR);
> > + do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
> > + do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
> > +
> > + do_test (0, i, 751, ~len - i, BIG_CHAR);
> > + do_test (0, i, 751, ~len + i, BIG_CHAR);
> > + do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
> > + do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
> > + }
> > + }
> > +}
> > +
> > static void
> > do_random_tests (void)
> > {
> > @@ -221,6 +253,7 @@ test_main (void)
> > do_test (page_size / 2 - i, i, i, 1, 0x9B);
> >
> > do_random_tests ();
> > + do_overflow_tests ();
> > return ret;
> > }
> >
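
To make the inner loop's coverage concrete: it builds descending bit
masks, so the n values handed to do_test cluster around every
power-of-two boundary of size_t. A tiny illustrative printout of the
first few masks (assuming a 64-bit size_t):

  #include <stddef.h>
  #include <stdio.h>

  int
  main (void)
  {
    size_t len = 0;
    /* Same construction as the test loop: set bits from the top down.  */
    for (size_t j = 8 * sizeof (size_t) - 1; j > 60; --j)
      {
        len |= (size_t) 1 << j;
        printf ("%#zx\n", len);  /* 0x8000..., 0xc000..., 0xe000...  */
      }
    return 0;
  }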
> > diff --git a/string/test-strncat.c b/string/test-strncat.c
> > index 2ef917b820..37ea26ea05 100644
> > --- a/string/test-strncat.c
> > +++ b/string/test-strncat.c
> > @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
> > }
> > }
> >
> > +static void
> > +do_overflow_tests (void)
> > +{
> > + size_t i, j, len;
> > + const size_t one = 1;
> > + CHAR *s1, *s2;
> > + uintptr_t s1_addr;
> > + s1 = (CHAR *) buf1;
> > + s2 = (CHAR *) buf2;
> > + s1_addr = (uintptr_t)s1;
> > + for (j = 0; j < 200; ++j)
> > + s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
> > + s2[200] = 0;
> > + for (i = 0; i < 750; ++i) {
> > + for (j = 0; j < i; ++j)
> > + s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
> > + s1[i] = '\0';
> > +
> > + FOR_EACH_IMPL (impl, 0)
> > + {
> > + s2[200] = '\0';
> > + do_one_test (impl, s2, s1, SIZE_MAX - i);
> > + s2[200] = '\0';
> > + do_one_test (impl, s2, s1, i - s1_addr);
> > + s2[200] = '\0';
> > + do_one_test (impl, s2, s1, -s1_addr - i);
> > + s2[200] = '\0';
> > + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
> > + s2[200] = '\0';
> > + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
> > + }
> > +
> > + len = 0;
> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
> > + {
> > + len |= one << j;
> > + FOR_EACH_IMPL (impl, 0)
> > + {
> > + s2[200] = '\0';
> > + do_one_test (impl, s2, s1, len - i);
> > + s2[200] = '\0';
> > + do_one_test (impl, s2, s1, len + i);
> > + s2[200] = '\0';
> > + do_one_test (impl, s2, s1, len - s1_addr - i);
> > + s2[200] = '\0';
> > + do_one_test (impl, s2, s1, len - s1_addr + i);
> > +
> > + s2[200] = '\0';
> > + do_one_test (impl, s2, s1, ~len - i);
> > + s2[200] = '\0';
> > + do_one_test (impl, s2, s1, ~len + i);
> > + s2[200] = '\0';
> > + do_one_test (impl, s2, s1, ~len - s1_addr - i);
> > + s2[200] = '\0';
> > + do_one_test (impl, s2, s1, ~len - s1_addr + i);
> > + }
> > + }
> > + }
> > +}
> > +
> > static void
> > do_random_tests (void)
> > {
> > @@ -316,6 +376,7 @@ test_main (void)
> > }
> >
> > do_random_tests ();
> > + do_overflow_tests ();
> > return ret;
> > }
> >
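
One detail worth flagging in the strncat variant: the destination is
re-terminated before every call because strncat appends at the current
terminator, so each (correct) call would otherwise move where the next
one writes. A minimal illustration (not the harness itself):

  #include <string.h>

  static void
  demo (void)
  {
    char dst[16] = "ab";
    strncat (dst, "cd", 4);  /* dst is now "abcd" */
    dst[2] = '\0';           /* reset to "ab" before the next call */
  }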
> > diff --git a/string/test-strnlen.c b/string/test-strnlen.c
> > index 920f58e97b..f53e09263f 100644
> > --- a/string/test-strnlen.c
> > +++ b/string/test-strnlen.c
> > @@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
> > do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
> > }
> >
> > +static void
> > +do_overflow_tests (void)
> > +{
> > + size_t i, j, len;
> > + const size_t one = 1;
> > + uintptr_t buf_addr = (uintptr_t) buf1;
> > +
> > + for (i = 0; i < 750; ++i)
> > + {
> > + do_test (0, i, SIZE_MAX - i, BIG_CHAR);
> > + do_test (0, i, i - buf_addr, BIG_CHAR);
> > + do_test (0, i, -buf_addr - i, BIG_CHAR);
> > + do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
> > + do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
> > +
> > + len = 0;
> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
> > + {
> > + len |= one << j;
> > + do_test (0, i, len - i, BIG_CHAR);
> > + do_test (0, i, len + i, BIG_CHAR);
> > + do_test (0, i, len - buf_addr - i, BIG_CHAR);
> > + do_test (0, i, len - buf_addr + i, BIG_CHAR);
> > +
> > + do_test (0, i, ~len - i, BIG_CHAR);
> > + do_test (0, i, ~len + i, BIG_CHAR);
> > + do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
> > + do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
> > + }
> > + }
> > +}
> > +
> > static void
> > do_random_tests (void)
> > {
> > @@ -283,6 +315,7 @@ test_main (void)
> > do_random_tests ();
> > do_page_tests ();
> > do_page_2_tests ();
> > + do_overflow_tests ();
> > return ret;
> > }
> >
> > --
> > 2.25.1
> >
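
For the strnlen tests, do_test derives the expected result as
MIN (len, maxlen), so every overflow-sized maxlen above must still
yield the true string length. A minimal sketch of that expectation
(illustrative only; check is a made-up helper, assuming a 4-byte
wchar_t):

  #include <stdint.h>
  #include <wchar.h>

  /* A correct wcsnlen must not let maxlen * sizeof (wchar_t) wrap:
     with a near-SIZE_MAX maxlen the answer is just the real length.  */
  static int
  check (const wchar_t *s, size_t true_len)
  {
    return wcsnlen (s, SIZE_MAX - 5) == true_len;
  }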
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
>
Pushed and closed the bug report (left a comment in the bug report with
the commits).
* Re: [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974]
2021-06-23 18:30 ` Noah Goldstein
@ 2022-01-27 21:06 ` H.J. Lu
0 siblings, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2022-01-27 21:06 UTC (permalink / raw)
To: Noah Goldstein, Libc-stable Mailing List
Cc: GNU C Library, Carlos O'Donell
On Wed, Jun 23, 2021 at 11:30 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
> On Wed, Jun 23, 2021 at 1:30 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Tue, Jun 22, 2021 at 11:32 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >
>> > This commit adds tests for a bug in the wide char variant of the
>> > functions where the implementation may assume that maxlen for wcsnlen
>> > or n for wmemchr/strncat will not overflow when multiplied by
>> > sizeof(wchar_t).
>> >
>> > These tests show the following implementations failing on x86_64:
>> >
>> > wcsnlen-sse4_1
>> > wcsnlen-avx2
>> >
>> > wmemchr-sse2
>> > wmemchr-avx2
>> >
>> > strncat would fail as well if it were on a system that preferred
>> > either of the failing wcsnlen implementations, as it relies on
>> > wcsnlen.
>> >
>> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
>> > ---
>> > Rebased on: [PATCH v1 1/4] x86-64: Add wcslen optimize for sse4.1
>> > string/test-memchr.c | 39 ++++++++++++++++++++++++---
>> > string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
>> > string/test-strnlen.c | 33 +++++++++++++++++++++++
>> > 3 files changed, 130 insertions(+), 3 deletions(-)
>> >
>> > diff --git a/string/test-memchr.c b/string/test-memchr.c
>> > index 665edc32af..ce964284aa 100644
>> > --- a/string/test-memchr.c
>> > +++ b/string/test-memchr.c
>> > @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
>> > CHAR *res = CALL (impl, s, c, n);
>> > if (res != exp_res)
>> > {
>> > - error (0, 0, "Wrong result in function %s %p %p", impl->name,
>> > - res, exp_res);
>> > + error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
>> > + impl->name, s, c, n, res, exp_res);
>> > ret = 1;
>> > return;
>> > }
>> > @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
>> > }
>> > buf[align + len] = 0;
>> >
>> > - if (pos < len)
>> > + if (pos < MIN(n, len))
>> > {
>> > buf[align + pos] = seek_char;
>> > buf[align + len] = -seek_char;
>> > @@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
>> > do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
>> > }
>> >
>> > +static void
>> > +do_overflow_tests (void)
>> > +{
>> > + size_t i, j, len;
>> > + const size_t one = 1;
>> > + uintptr_t buf_addr = (uintptr_t) buf1;
>> > +
>> > + for (i = 0; i < 750; ++i)
>> > + {
>> > + do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
>> > + do_test (0, i, 751, i - buf_addr, BIG_CHAR);
>> > + do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
>> > +
>> > + len = 0;
>> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
>> > + {
>> > + len |= one << j;
>> > + do_test (0, i, 751, len - i, BIG_CHAR);
>> > + do_test (0, i, 751, len + i, BIG_CHAR);
>> > + do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
>> > +
>> > + do_test (0, i, 751, ~len - i, BIG_CHAR);
>> > + do_test (0, i, 751, ~len + i, BIG_CHAR);
>> > + do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
>> > + }
>> > + }
>> > +}
>> > +
>> > static void
>> > do_random_tests (void)
>> > {
>> > @@ -221,6 +253,7 @@ test_main (void)
>> > do_test (page_size / 2 - i, i, i, 1, 0x9B);
>> >
>> > do_random_tests ();
>> > + do_overflow_tests ();
>> > return ret;
>> > }
>> >
>> > diff --git a/string/test-strncat.c b/string/test-strncat.c
>> > index 2ef917b820..37ea26ea05 100644
>> > --- a/string/test-strncat.c
>> > +++ b/string/test-strncat.c
>> > @@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
>> > }
>> > }
>> >
>> > +static void
>> > +do_overflow_tests (void)
>> > +{
>> > + size_t i, j, len;
>> > + const size_t one = 1;
>> > + CHAR *s1, *s2;
>> > + uintptr_t s1_addr;
>> > + s1 = (CHAR *) buf1;
>> > + s2 = (CHAR *) buf2;
>> > + s1_addr = (uintptr_t)s1;
>> > + for (j = 0; j < 200; ++j)
>> > + s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
>> > + s2[200] = 0;
>> > + for (i = 0; i < 750; ++i) {
>> > + for (j = 0; j < i; ++j)
>> > + s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
>> > + s1[i] = '\0';
>> > +
>> > + FOR_EACH_IMPL (impl, 0)
>> > + {
>> > + s2[200] = '\0';
>> > + do_one_test (impl, s2, s1, SIZE_MAX - i);
>> > + s2[200] = '\0';
>> > + do_one_test (impl, s2, s1, i - s1_addr);
>> > + s2[200] = '\0';
>> > + do_one_test (impl, s2, s1, -s1_addr - i);
>> > + s2[200] = '\0';
>> > + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
>> > + s2[200] = '\0';
>> > + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
>> > + }
>> > +
>> > + len = 0;
>> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
>> > + {
>> > + len |= one << j;
>> > + FOR_EACH_IMPL (impl, 0)
>> > + {
>> > + s2[200] = '\0';
>> > + do_one_test (impl, s2, s1, len - i);
>> > + s2[200] = '\0';
>> > + do_one_test (impl, s2, s1, len + i);
>> > + s2[200] = '\0';
>> > + do_one_test (impl, s2, s1, len - s1_addr - i);
>> > + s2[200] = '\0';
>> > + do_one_test (impl, s2, s1, len - s1_addr + i);
>> > +
>> > + s2[200] = '\0';
>> > + do_one_test (impl, s2, s1, ~len - i);
>> > + s2[200] = '\0';
>> > + do_one_test (impl, s2, s1, ~len + i);
>> > + s2[200] = '\0';
>> > + do_one_test (impl, s2, s1, ~len - s1_addr - i);
>> > + s2[200] = '\0';
>> > + do_one_test (impl, s2, s1, ~len - s1_addr + i);
>> > + }
>> > + }
>> > + }
>> > +}
>> > +
>> > static void
>> > do_random_tests (void)
>> > {
>> > @@ -316,6 +376,7 @@ test_main (void)
>> > }
>> >
>> > do_random_tests ();
>> > + do_overflow_tests ();
>> > return ret;
>> > }
>> >
>> > diff --git a/string/test-strnlen.c b/string/test-strnlen.c
>> > index 920f58e97b..f53e09263f 100644
>> > --- a/string/test-strnlen.c
>> > +++ b/string/test-strnlen.c
>> > @@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
>> > do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
>> > }
>> >
>> > +static void
>> > +do_overflow_tests (void)
>> > +{
>> > + size_t i, j, len;
>> > + const size_t one = 1;
>> > + uintptr_t buf_addr = (uintptr_t) buf1;
>> > +
>> > + for (i = 0; i < 750; ++i)
>> > + {
>> > + do_test (0, i, SIZE_MAX - i, BIG_CHAR);
>> > + do_test (0, i, i - buf_addr, BIG_CHAR);
>> > + do_test (0, i, -buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
>> > +
>> > + len = 0;
>> > + for (j = 8 * sizeof(size_t) - 1; j ; --j)
>> > + {
>> > + len |= one << j;
>> > + do_test (0, i, len - i, BIG_CHAR);
>> > + do_test (0, i, len + i, BIG_CHAR);
>> > + do_test (0, i, len - buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, len - buf_addr + i, BIG_CHAR);
>> > +
>> > + do_test (0, i, ~len - i, BIG_CHAR);
>> > + do_test (0, i, ~len + i, BIG_CHAR);
>> > + do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
>> > + do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
>> > + }
>> > + }
>> > +}
>> > +
>> > static void
>> > do_random_tests (void)
>> > {
>> > @@ -283,6 +315,7 @@ test_main (void)
>> > do_random_tests ();
>> > do_page_tests ();
>> > do_page_2_tests ();
>> > + do_overflow_tests ();
>> > return ret;
>> > }
>> >
>> > --
>> > 2.25.1
>> >
>>
>> LGTM.
>>
>> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>>
>> Thanks.
>>
>> --
>> H.J.
>
>
> Pushed and closed the bug report (left a comment in the bug report with the commits).
I am backporting this patch set to release branches, including their dependency
patches.
--
H.J.
Thread overview: 27+ messages
2021-06-09 20:52 [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat Noah Goldstein
2021-06-09 20:52 ` [PATCH v1 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 Noah Goldstein
2021-06-09 20:52 ` [PATCH v1 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 Noah Goldstein
2021-06-09 21:53 ` [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat H.J. Lu
2021-06-09 22:26 ` Noah Goldstein
2021-06-22 15:43 ` Noah Goldstein
2021-06-22 16:18 ` H.J. Lu
2021-06-22 18:23 ` Noah Goldstein
2021-06-22 18:11 ` [PATCH v2 1/3] String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974] Noah Goldstein
2021-06-22 21:24 ` H.J. Lu
2021-06-22 18:11 ` [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
2021-06-22 21:24 ` H.J. Lu
2021-06-22 18:11 ` [PATCH v2 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
2021-06-22 21:33 ` H.J. Lu
2021-06-22 23:16 ` Noah Goldstein
2021-06-22 23:28 ` H.J. Lu
2021-06-23 3:11 ` Noah Goldstein
2021-06-23 3:58 ` H.J. Lu
2021-06-23 4:55 ` Noah Goldstein
2021-06-23 6:31 ` [PATCH v3 1/3] String: Add overflow tests for strnlen, memchr, and strncat " Noah Goldstein
2021-06-23 17:30 ` H.J. Lu
2021-06-23 18:30 ` Noah Goldstein
2022-01-27 21:06 ` H.J. Lu
2021-06-23 6:31 ` [PATCH v3 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 " Noah Goldstein
2021-06-23 17:30 ` H.J. Lu
2021-06-23 6:31 ` [PATCH v3 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 " Noah Goldstein
2021-06-23 17:27 ` H.J. Lu