* [PATCH v7 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
@ 2021-04-02 21:35 Noah Goldstein
2021-04-02 21:35 ` [PATCH v7 2/2] x86: Expanding test-memmove.c, test-memcpy.c, bench-memcpy-large.c Noah Goldstein
0 siblings, 1 reply; 2+ messages in thread
From: Noah Goldstein @ 2021-04-02 21:35 UTC (permalink / raw)
To: libc-alpha
From: noah <goldstein.w.n@gmail.com>
No Bug. This commit updates the large memcpy case (no overlap). The
update is to perform memcpy on either 2 or 4 contiguous pages at
once. This 1) helps to alleviate the affects of false memory aliasing
when destination and source have a close 4k alignment and 2) In most
cases and for most DRAM units is a modestly more efficient access
pattern. These changes are a clear performance improvement for
VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
pass.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
.../multiarch/memmove-vec-unaligned-erms.S | 307 ++++++++++++++----
1 file changed, 245 insertions(+), 62 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 897a3d9762..9c7f95c653 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -35,7 +35,16 @@
__x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
7. If size >= __x86_shared_non_temporal_threshold and there is no
overlap between destination and source, use non-temporal store
- instead of aligned store. */
+ instead of aligned store copying from either 2 or 4 pages at
+ once.
+ 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
+ and source and destination do not page alias, copy from 2 pages
+ at once using non-temporal stores. Page aliasing in this case is
+ considered true if destination's page alignment - sources' page
+ alignment is less than 8 * VEC_SIZE.
+ 9. If size >= 16 * __x86_shared_non_temporal_threshold or source
+ and destination do page alias copy from 4 pages at once using
+ non-temporal stores. */
#include <sysdep.h>
@@ -67,6 +76,34 @@
# endif
#endif
+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+
+#if PAGE_SIZE != 4096
+# error Unsupported PAGE_SIZE
+#endif
+
+#ifndef LOG_PAGE_SIZE
+# define LOG_PAGE_SIZE 12
+#endif
+
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
+# error Invalid LOG_PAGE_SIZE
+#endif
+
+/* Byte per page for large_memcpy inner loop. */
+#if VEC_SIZE == 64
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
+#else
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
+#endif
+
+/* Amount to shift rdx by to compare for memcpy_large_4x. */
+#ifndef LOG_4X_MEMCPY_THRESH
+# define LOG_4X_MEMCPY_THRESH 4
+#endif
+
/* Avoid short distance rep movsb only with non-SSE vector. */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
@@ -106,6 +143,28 @@
# error Unsupported PREFETCH_SIZE!
#endif
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
+ VMOVU (offset)base, vec0; \
+ VMOVU ((offset) + VEC_SIZE)base, vec1;
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
+ VMOVNT vec0, (offset)base; \
+ VMOVNT vec1, ((offset) + VEC_SIZE)base;
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+ VMOVU (offset)base, vec0; \
+ VMOVU ((offset) + VEC_SIZE)base, vec1; \
+ VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
+ VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+ VMOVNT vec0, (offset)base; \
+ VMOVNT vec1, ((offset) + VEC_SIZE)base; \
+ VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
+ VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
+#else
+# error Invalid LARGE_LOAD_SIZE
+#endif
+
#ifndef SECTION
# error SECTION is not defined!
#endif
@@ -393,6 +452,15 @@ L(last_4x_vec):
VZEROUPPER_RETURN
L(more_8x_vec):
+ /* Check if non-temporal move candidate. */
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ /* Check non-temporal store threshold. */
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ ja L(large_memcpy_2x)
+#endif
+ /* Entry if rdx is greater than non-temporal threshold but there
+ is overlap. */
+L(more_8x_vec_check):
cmpq %rsi, %rdi
ja L(more_8x_vec_backward)
/* Source == destination is less common. */
@@ -419,11 +487,6 @@ L(more_8x_vec):
subq %r8, %rdi
/* Adjust length. */
addq %r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
- /* Check non-temporal store threshold. */
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
- ja L(large_forward)
-#endif
L(loop_4x_vec_forward):
/* Copy 4 * VEC a time forward. */
VMOVU (%rsi), %VEC(0)
@@ -470,11 +533,6 @@ L(more_8x_vec_backward):
subq %r8, %r9
/* Adjust length. */
subq %r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
- /* Check non-temporal store threshold. */
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
- ja L(large_backward)
-#endif
L(loop_4x_vec_backward):
/* Copy 4 * VEC a time backward. */
VMOVU (%rcx), %VEC(0)
@@ -500,72 +558,197 @@ L(loop_4x_vec_backward):
VZEROUPPER_RETURN
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
+L(large_memcpy_2x):
+ /* Compute absolute value of difference between source and
+ destination. */
+ movq %rdi, %r9
+ subq %rsi, %r9
+ movq %r9, %r8
+ leaq -1(%r9), %rcx
+ sarq $63, %r8
+ xorq %r8, %r9
+ subq %r8, %r9
/* Don't use non-temporal store if there is overlap between
- destination and source since destination may be in cache
- when source is loaded. */
- leaq (%rdi, %rdx), %r10
- cmpq %r10, %rsi
- jb L(loop_4x_vec_forward)
-L(loop_large_forward):
+ destination and source since destination may be in cache when
+ source is loaded. */
+ cmpq %r9, %rdx
+ ja L(more_8x_vec_check)
+
+ /* Cache align destination. First store the first 64 bytes then
+ adjust alignments. */
+ VMOVU (%rsi), %VEC(8)
+#if VEC_SIZE < 64
+ VMOVU VEC_SIZE(%rsi), %VEC(9)
+#if VEC_SIZE < 32
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
+#endif
+#endif
+ VMOVU %VEC(8), (%rdi)
+#if VEC_SIZE < 64
+ VMOVU %VEC(9), VEC_SIZE(%rdi)
+#if VEC_SIZE < 32
+ VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
+#endif
+#endif
+ /* Adjust source, destination, and size. */
+ movq %rdi, %r8
+ andq $63, %r8
+ /* Get the negative of offset for alignment. */
+ subq $64, %r8
+ /* Adjust source. */
+ subq %r8, %rsi
+ /* Adjust destination which should be aligned now. */
+ subq %r8, %rdi
+ /* Adjust length. */
+ addq %r8, %rdx
+
+ /* Test if source and destination addresses will alias. If they do
+ the larger pipeline in large_memcpy_4x alleviated the
+ performance drop. */
+ testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
+ jz L(large_memcpy_4x)
+
+ movq %rdx, %r10
+ shrq $LOG_4X_MEMCPY_THRESH, %r10
+ cmp __x86_shared_non_temporal_threshold(%rip), %r10
+ jae L(large_memcpy_4x)
+
+ /* edx will store remainder size for copying tail. */
+ andl $(PAGE_SIZE * 2 - 1), %edx
+ /* r10 stores outer loop counter. */
+ shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+ /* Copy 4x VEC at a time from 2 pages. */
+L(loop_large_memcpy_2x_outer):
+ /* ecx stores inner loop counter. */
+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_2x_inner):
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
+ /* Load vectors from rsi. */
+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+ addq $LARGE_LOAD_SIZE, %rsi
+ /* Non-temporal store vectors to rdi. */
+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+ addq $LARGE_LOAD_SIZE, %rdi
+ decl %ecx
+ jnz L(loop_large_memcpy_2x_inner)
+ addq $PAGE_SIZE, %rdi
+ addq $PAGE_SIZE, %rsi
+ decq %r10
+ jne L(loop_large_memcpy_2x_outer)
+
+ /* Check if only last 4 loads are needed. */
+ cmpl $(VEC_SIZE * 4), %edx
+ jbe L(large_memcpy_2x_end)
+
+ /* Handle the last 2 * PAGE_SIZE bytes. */
+L(loop_large_memcpy_2x_tail):
/* Copy 4 * VEC a time forward with non-temporal stores. */
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- addq $PREFETCHED_LOAD_SIZE, %rsi
- subq $PREFETCHED_LOAD_SIZE, %rdx
+ addq $(VEC_SIZE * 4), %rsi
+ subl $(VEC_SIZE * 4), %edx
VMOVNT %VEC(0), (%rdi)
VMOVNT %VEC(1), VEC_SIZE(%rdi)
VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
- addq $PREFETCHED_LOAD_SIZE, %rdi
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
- ja L(loop_large_forward)
+ addq $(VEC_SIZE * 4), %rdi
+ cmpl $(VEC_SIZE * 4), %edx
+ ja L(loop_large_memcpy_2x_tail)
+
+L(large_memcpy_2x_end):
sfence
/* Store the last 4 * VEC. */
- VMOVU %VEC(5), (%rcx)
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
- /* Store the first VEC. */
- VMOVU %VEC(4), (%r11)
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
VZEROUPPER_RETURN
-L(large_backward):
- /* Don't use non-temporal store if there is overlap between
- destination and source since destination may be in cache
- when source is loaded. */
- leaq (%rcx, %rdx), %r10
- cmpq %r10, %r9
- jb L(loop_4x_vec_backward)
-L(loop_large_backward):
- /* Copy 4 * VEC a time backward with non-temporal stores. */
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
- VMOVU (%rcx), %VEC(0)
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- subq $PREFETCHED_LOAD_SIZE, %rcx
- subq $PREFETCHED_LOAD_SIZE, %rdx
- VMOVNT %VEC(0), (%r9)
- VMOVNT %VEC(1), -VEC_SIZE(%r9)
- VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
- VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
- subq $PREFETCHED_LOAD_SIZE, %r9
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
- ja L(loop_large_backward)
+L(large_memcpy_4x):
+ movq %rdx, %r10
+ /* edx will store remainder size for copying tail. */
+ andl $(PAGE_SIZE * 4 - 1), %edx
+ /* r10 stores outer loop counter. */
+ shrq $(LOG_PAGE_SIZE + 2), %r10
+ /* Copy 4x VEC at a time from 4 pages. */
+L(loop_large_memcpy_4x_outer):
+ /* ecx stores inner loop counter. */
+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_4x_inner):
+ /* Only one prefetch set per page as doing 4 pages give more time
+ for prefetcher to keep up. */
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
+ /* Load vectors from rsi. */
+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+ addq $LARGE_LOAD_SIZE, %rsi
+ /* Non-temporal store vectors to rdi. */
+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+ STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+ STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+ addq $LARGE_LOAD_SIZE, %rdi
+ decl %ecx
+ jnz L(loop_large_memcpy_4x_inner)
+ addq $(PAGE_SIZE * 3), %rdi
+ addq $(PAGE_SIZE * 3), %rsi
+ decq %r10
+ jne L(loop_large_memcpy_4x_outer)
+
+ /* Check if only last 4 loads are needed. */
+ cmpl $(VEC_SIZE * 4), %edx
+ jbe L(large_memcpy_4x_end)
+
+ /* Handle the last 4 * PAGE_SIZE bytes. */
+L(loop_large_memcpy_4x_tail):
+ /* Copy 4 * VEC a time forward with non-temporal stores. */
+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ addq $(VEC_SIZE * 4), %rsi
+ subl $(VEC_SIZE * 4), %edx
+ VMOVNT %VEC(0), (%rdi)
+ VMOVNT %VEC(1), VEC_SIZE(%rdi)
+ VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
+ addq $(VEC_SIZE * 4), %rdi
+ cmpl $(VEC_SIZE * 4), %edx
+ ja L(loop_large_memcpy_4x_tail)
+
+L(large_memcpy_4x_end):
sfence
- /* Store the first 4 * VEC. */
- VMOVU %VEC(4), (%rdi)
- VMOVU %VEC(5), VEC_SIZE(%rdi)
- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
- /* Store the last VEC. */
- VMOVU %VEC(8), (%r11)
+ /* Store the last 4 * VEC. */
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
--
2.29.2
^ permalink raw reply [flat|nested] 2+ messages in thread
* [PATCH v7 2/2] x86: Expanding test-memmove.c, test-memcpy.c, bench-memcpy-large.c
2021-04-02 21:35 [PATCH v7 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S Noah Goldstein
@ 2021-04-02 21:35 ` Noah Goldstein
0 siblings, 0 replies; 2+ messages in thread
From: Noah Goldstein @ 2021-04-02 21:35 UTC (permalink / raw)
To: libc-alpha
From: noah <goldstein.w.n@gmail.com>
No Bug. This commit expanding the range of tests / benchmarks for
memmove and memcpy. The test expansion is mostly in the vein of
increasing the maximum size, increasing the number of unique
alignments tested, and testing both source < destination and vice
versa. The benchmark expansaion is just to increase the number of
unique alignments. test-memcpy, test-memccpy, test-mempcpy,
test-memmove, and tst-memmove-overflow all pass.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
benchtests/bench-memcpy-large.c | 8 +++-
string/test-memcpy.c | 61 ++++++++++++++++-------------
string/test-memmove.c | 68 ++++++++++++++++++++-------------
3 files changed, 82 insertions(+), 55 deletions(-)
diff --git a/benchtests/bench-memcpy-large.c b/benchtests/bench-memcpy-large.c
index 3df1575514..efb9627b1e 100644
--- a/benchtests/bench-memcpy-large.c
+++ b/benchtests/bench-memcpy-large.c
@@ -57,11 +57,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
size_t i, j;
char *s1, *s2;
- align1 &= 63;
+ align1 &= 4095;
if (align1 + len >= page_size)
return;
- align2 &= 63;
+ align2 &= 4095;
if (align2 + len >= page_size)
return;
@@ -113,6 +113,10 @@ test_main (void)
do_test (&json_ctx, 0, 3, i + 15);
do_test (&json_ctx, 3, 0, i + 31);
do_test (&json_ctx, 3, 5, i + 63);
+ do_test (&json_ctx, 0, 127, i);
+ do_test (&json_ctx, 0, 255, i);
+ do_test (&json_ctx, 0, 256, i);
+ do_test (&json_ctx, 0, 4064, i);
}
json_array_end (&json_ctx);
diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index 2e9c6bd099..c9dfc88fed 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -82,11 +82,11 @@ do_test (size_t align1, size_t align2, size_t len)
size_t i, j;
char *s1, *s2;
- align1 &= 63;
+ align1 &= 4095;
if (align1 + len >= page_size)
return;
- align2 &= 63;
+ align2 &= 4095;
if (align2 + len >= page_size)
return;
@@ -213,11 +213,9 @@ do_random_tests (void)
}
static void
-do_test1 (void)
+do_test1 (size_t size)
{
- size_t size = 0x100000;
void *large_buf;
-
large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANON, -1, 0);
if (large_buf == MAP_FAILED)
@@ -233,27 +231,32 @@ do_test1 (void)
uint32_t *dest = large_buf;
uint32_t *src = large_buf + size + page_size;
size_t i;
-
- for (i = 0; i < arrary_size; i++)
- src[i] = (uint32_t) i;
-
- FOR_EACH_IMPL (impl, 0)
+ size_t repeats;
+ for(repeats = 0; repeats < 2; repeats++)
{
- memset (dest, -1, size);
- CALL (impl, (char *) dest, (char *) src, size);
for (i = 0; i < arrary_size; i++)
- if (dest[i] != src[i])
- {
- error (0, 0,
- "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
- impl->name, dest, src, i);
- ret = 1;
- break;
- }
+ src[i] = (uint32_t) i;
+
+ FOR_EACH_IMPL (impl, 0)
+ {
+ printf ("\t\tRunning: %s\n", impl->name);
+ memset (dest, -1, size);
+ CALL (impl, (char *) dest, (char *) src, size);
+ for (i = 0; i < arrary_size; i++)
+ if (dest[i] != src[i])
+ {
+ error (0, 0,
+ "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
+ impl->name, dest, src, i);
+ ret = 1;
+ munmap ((void *) large_buf, size * 2 + page_size);
+ return;
+ }
+ }
+ dest = src;
+ src = large_buf;
}
-
- munmap ((void *) dest, size);
- munmap ((void *) src, size);
+ munmap ((void *) large_buf, size * 2 + page_size);
}
int
@@ -275,7 +278,6 @@ test_main (void)
do_test (0, i, 1 << i);
do_test (i, i, 1 << i);
}
-
for (i = 0; i < 32; ++i)
{
do_test (0, 0, i);
@@ -294,12 +296,19 @@ test_main (void)
do_test (i, i, 16 * i);
}
+ for (i = 19; i <= 25; ++i)
+ {
+ do_test (255, 0, 1 << i);
+ do_test (0, 255, i);
+ do_test (0, 4000, i);
+ }
+
do_test (0, 0, getpagesize ());
do_random_tests ();
- do_test1 ();
-
+ do_test1 (0x100000);
+ do_test1 (0x2000000);
return ret;
}
diff --git a/string/test-memmove.c b/string/test-memmove.c
index 2e3ce75b9b..2e665fa3a5 100644
--- a/string/test-memmove.c
+++ b/string/test-memmove.c
@@ -247,7 +247,7 @@ do_random_tests (void)
}
static void
-do_test2 (void)
+do_test2 (size_t offset)
{
size_t size = 0x20000000;
uint32_t * large_buf;
@@ -268,33 +268,45 @@ do_test2 (void)
}
size_t bytes_move = 0x80000000 - (uintptr_t) large_buf;
+ if (bytes_move + offset * sizeof (uint32_t) > size)
+ {
+ munmap ((void *) large_buf, size);
+ return;
+ }
size_t arr_size = bytes_move / sizeof (uint32_t);
size_t i;
-
- FOR_EACH_IMPL (impl, 0)
+ size_t repeats;
+ uint32_t * src = large_buf;
+ uint32_t * dst = &large_buf[offset];
+ for (repeats = 0; repeats < 2; ++repeats)
{
- for (i = 0; i < arr_size; i++)
- large_buf[i] = (uint32_t) i;
-
- uint32_t * dst = &large_buf[33];
-
-#ifdef TEST_BCOPY
- CALL (impl, (char *) large_buf, (char *) dst, bytes_move);
-#else
- CALL (impl, (char *) dst, (char *) large_buf, bytes_move);
-#endif
-
- for (i = 0; i < arr_size; i++)
- {
- if (dst[i] != (uint32_t) i)
- {
- error (0, 0,
- "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
- impl->name, dst, large_buf, i);
- ret = 1;
- break;
- }
- }
+ FOR_EACH_IMPL (impl, 0)
+ {
+ for (i = 0; i < arr_size; i++)
+ src[i] = (uint32_t) i;
+
+
+ #ifdef TEST_BCOPY
+ CALL (impl, (char *) src, (char *) dst, bytes_move);
+ #else
+ CALL (impl, (char *) dst, (char *) src, bytes_move);
+ #endif
+
+ for (i = 0; i < arr_size; i++)
+ {
+ if (dst[i] != (uint32_t) i)
+ {
+ error (0, 0,
+ "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
+ impl->name, dst, large_buf, i);
+ ret = 1;
+ munmap ((void *) large_buf, size);
+ return;
+ }
+ }
+ }
+ src = dst;
+ dst = large_buf;
}
munmap ((void *) large_buf, size);
@@ -340,8 +352,10 @@ test_main (void)
do_random_tests ();
- do_test2 ();
-
+ do_test2 (33);
+ do_test2 (0x200000);
+ do_test2 (0x4000000 - 1);
+ do_test2 (0x4000000);
return ret;
}
--
2.29.2
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2021-04-02 21:35 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-02 21:35 [PATCH v7 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S Noah Goldstein
2021-04-02 21:35 ` [PATCH v7 2/2] x86: Expanding test-memmove.c, test-memcpy.c, bench-memcpy-large.c Noah Goldstein
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).