public inbox for libc-alpha@sourceware.org
* [PATCH v6 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
@ 2021-04-02 20:26 Noah Goldstein
  2021-04-02 20:26 ` [PATCH v6 2/2] x86: Expanding test-memmove.c, test-memcpy.c, bench-memcpy-large.c Noah Goldstein
  2021-04-02 22:08 ` [PATCH v6 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S H.J. Lu
  0 siblings, 2 replies; 7+ messages in thread
From: Noah Goldstein @ 2021-04-02 20:26 UTC (permalink / raw)
  To: libc-alpha

From: noah <goldstein.w.n@gmail.com>

No Bug. This commit updates the large memcpy case (no overlap). The
update is to perform memcpy on either 2 or 4 contiguous pages at
once. This 1) helps to alleviate the effects of false memory aliasing
when destination and source have a close 4k alignment and 2) in most
cases and for most DRAM units is a modestly more efficient access
pattern. These changes are a clear performance improvement for
VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
pass.
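
[Editorial note, not part of the submitted patch: the dispatch described
above and in points 7-9 of the updated header comment can be sketched in
C roughly as below.  The function and enum names and the fixed
PAGE_SIZE/VEC_SIZE values are assumptions made only for this
illustration; the authoritative logic is the assembly in the diff.]

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32	/* AVX case; 16 for SSE2, 64 for AVX-512.  */

enum large_copy_path
{
  PATH_OVERLAP_FALLBACK,	/* overlap: keep the cached 4x VEC loop.  */
  PATH_NT_2_PAGES,		/* non-temporal copy from 2 pages at once.  */
  PATH_NT_4_PAGES		/* non-temporal copy from 4 pages at once.  */
};

static enum large_copy_path
choose_large_copy_path (const void *dst, const void *src, size_t size,
			size_t non_temporal_threshold)
{
  uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;
  uintptr_t absdiff = (uintptr_t) dst > (uintptr_t) src ? diff : -diff;

  /* Point 7: non-temporal stores are only used when the two regions do
     not overlap.  */
  if (size > absdiff)
    return PATH_OVERLAP_FALLBACK;

  /* Points 8/9: dst and src "page alias" when the destination's page
     offset minus the source's page offset is less than 8 * VEC_SIZE;
     the test below mirrors the assembly's check on (dst - src - 1).
     Aliasing, or a size of at least 16 * the non-temporal threshold,
     selects the 4-page loop; everything else uses the 2-page loop.  */
  if (((diff - 1) & (PAGE_SIZE - VEC_SIZE * 8)) == 0
      || (size >> 4) >= non_temporal_threshold)
    return PATH_NT_4_PAGES;

  return PATH_NT_2_PAGES;
}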

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 .../multiarch/memmove-vec-unaligned-erms.S    | 307 ++++++++++++++----
 1 file changed, 245 insertions(+), 62 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 897a3d9762..9c7f95c653 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -35,7 +35,16 @@
       __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
    7. If size >= __x86_shared_non_temporal_threshold and there is no
       overlap between destination and source, use non-temporal store
-      instead of aligned store.  */
+      instead of aligned store copying from either 2 or 4 pages at
+      once.
+   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
+      and source and destination do not page alias, copy from 2 pages
+      at once using non-temporal stores. Page aliasing in this case is
+      considered true if destination's page alignment - source's page
+      alignment is less than 8 * VEC_SIZE.
+   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
+      and destination do page alias copy from 4 pages at once using
+      non-temporal stores.  */
 
 #include <sysdep.h>
 
@@ -67,6 +76,34 @@
 # endif
 #endif
 
+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+
+#if PAGE_SIZE != 4096
+# error Unsupported PAGE_SIZE
+#endif
+
+#ifndef LOG_PAGE_SIZE
+# define LOG_PAGE_SIZE 12
+#endif
+
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
+# error Invalid LOG_PAGE_SIZE
+#endif
+
+/* Byte per page for large_memcpy inner loop.  */
+#if VEC_SIZE == 64
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
+#else
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
+#endif
+
+/* Amount to shift rdx by to compare for memcpy_large_4x.  */
+#ifndef LOG_4X_MEMCPY_THRESH
+# define LOG_4X_MEMCPY_THRESH 4
+#endif
+
 /* Avoid short distance rep movsb only with non-SSE vector.  */
 #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
 # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
@@ -106,6 +143,28 @@
 # error Unsupported PREFETCH_SIZE!
 #endif
 
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1;
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVNT  vec0, (offset)base; \
+	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1; \
+	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
+	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVNT	vec0, (offset)base; \
+	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
+	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
+	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
+#else
+# error Invalid LARGE_LOAD_SIZE
+#endif
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -393,6 +452,15 @@ L(last_4x_vec):
 	VZEROUPPER_RETURN
 
 L(more_8x_vec):
+	/* Check if non-temporal move candidate.  */
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+	/* Check non-temporal store threshold.  */
+	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	ja	L(large_memcpy_2x)
+#endif
+	/* Entry if rdx is greater than non-temporal threshold but there
+       is overlap.  */
+L(more_8x_vec_check):
 	cmpq	%rsi, %rdi
 	ja	L(more_8x_vec_backward)
 	/* Source == destination is less common.  */
@@ -419,11 +487,6 @@ L(more_8x_vec):
 	subq	%r8, %rdi
 	/* Adjust length.  */
 	addq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_forward)
-#endif
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
 	VMOVU	(%rsi), %VEC(0)
@@ -470,11 +533,6 @@ L(more_8x_vec_backward):
 	subq	%r8, %r9
 	/* Adjust length.  */
 	subq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_backward)
-#endif
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
 	VMOVU	(%rcx), %VEC(0)
@@ -500,72 +558,197 @@ L(loop_4x_vec_backward):
 	VZEROUPPER_RETURN
 
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
+L(large_memcpy_2x):
+	/* Compute absolute value of difference between source and
+	   destination.  */
+	movq	%rdi, %r9
+	subq	%rsi, %r9
+	movq	%r9, %r8
+	leaq	-1(%r9), %rcx
+	sarq	$63, %r8
+	xorq	%r8, %r9
+	subq	%r8, %r9
 	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rdi, %rdx), %r10
-	cmpq    %r10, %rsi
-	jb	L(loop_4x_vec_forward)
-L(loop_large_forward):
+	   destination and source since destination may be in cache when
+	   source is loaded.  */
+	cmpq	%r9, %rdx
+	ja	L(more_8x_vec_check)
+
+	/* Cache align destination. First store the first 64 bytes then
+	   adjust alignments.  */
+	VMOVU	(%rsi), %VEC(8)
+#if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(9)
+#if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
+#endif
+#endif
+	VMOVU	%VEC(8), (%rdi)
+#if VEC_SIZE < 64
+	VMOVU	%VEC(9), VEC_SIZE(%rdi)
+#if VEC_SIZE < 32
+	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
+#endif
+#endif
+	/* Adjust source, destination, and size.  */
+	movq	%rdi, %r8
+	andq	$63, %r8
+	/* Get the negative of offset for alignment.  */
+	subq	$64, %r8
+	/* Adjust source.  */
+	subq	%r8, %rsi
+	/* Adjust destination which should be aligned now.  */
+	subq	%r8, %rdi
+	/* Adjust length.  */
+	addq	%r8, %rdx
+
+	/* Test if source and destination addresses will alias. If they do,
+	   the larger pipeline in large_memcpy_4x alleviates the
+	   performance drop.  */
+	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
+	jz	L(large_memcpy_4x)
+
+	movq	%rdx, %r10
+	shrq	$LOG_4X_MEMCPY_THRESH, %r10
+	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
+	jae	L(large_memcpy_4x)
+
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 2 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+	/* Copy 4x VEC at a time from 2 pages.  */
+L(loop_large_memcpy_2x_outer):
+	/* ecx stores inner loop counter.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_2x_inner):
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	addq	$LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	addq	$LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz	L(loop_large_memcpy_2x_inner)
+	addq	$PAGE_SIZE, %rdi
+	addq	$PAGE_SIZE, %rsi
+	decq	%r10
+	jne	L(loop_large_memcpy_2x_outer)
+
+	/* Check if only last 4 loads are needed.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_2x_end)
+
+	/* Handle the last 2 * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_2x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$PREFETCHED_LOAD_SIZE, %rsi
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
+	addq	$(VEC_SIZE * 4), %rsi
+	subl	$(VEC_SIZE * 4), %edx
 	VMOVNT	%VEC(0), (%rdi)
 	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
 	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$PREFETCHED_LOAD_SIZE, %rdi
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_forward)
+	addq	$(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_2x_tail)
+
+L(large_memcpy_2x_end):
 	sfence
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
-	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
-L(large_backward):
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rcx, %rdx), %r10
-	cmpq    %r10, %r9
-	jb	L(loop_4x_vec_backward)
-L(loop_large_backward):
-	/* Copy 4 * VEC a time backward with non-temporal stores.  */
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$PREFETCHED_LOAD_SIZE, %rcx
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%r9)
-	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
-	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$PREFETCHED_LOAD_SIZE, %r9
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_backward)
+L(large_memcpy_4x):
+	movq	%rdx, %r10
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 4 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$(LOG_PAGE_SIZE + 2), %r10
+	/* Copy 4x VEC at a time from 4 pages.  */
+L(loop_large_memcpy_4x_outer):
+	/* ecx stores inner loop counter.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_4x_inner):
+	/* Only one prefetch set per page as doing 4 pages gives more time
+	   for the prefetcher to keep up.  */
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	addq	$LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	addq	$LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz	L(loop_large_memcpy_4x_inner)
+	addq	$(PAGE_SIZE * 3), %rdi
+	addq	$(PAGE_SIZE * 3), %rsi
+	decq	%r10
+	jne	L(loop_large_memcpy_4x_outer)
+
+	/* Check if only last 4 loads are needed.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_4x_end)
+
+	/* Handle the last 4 * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_4x_tail):
+	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	addq	$(VEC_SIZE * 4), %rsi
+	subl	$(VEC_SIZE * 4), %edx
+	VMOVNT	%VEC(0), (%rdi)
+	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
+	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	addq	$(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_4x_tail)
+
+L(large_memcpy_4x_end):
 	sfence
-	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
-	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+	/* Store the last 4 * VEC.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-- 
2.29.2
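
[Editorial note, not part of the submitted patch: a C rendering of the
shape of the new interleaved loop (L(loop_large_memcpy_2x_inner) above).
The vec_t type, the nt_store helper, and the simplification to one
vector per page per step are assumptions for illustration only; the real
loop also prefetches ahead, moves LARGE_LOAD_SIZE bytes per page per
step, and handles the tail that is not a multiple of 2 * PAGE_SIZE.]

#include <stddef.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Stand-in for one VEC_SIZE-wide register (VEC_SIZE == 32 assumed).  */
typedef struct { unsigned char b[32]; } vec_t;

/* Placeholder for a non-temporal store (VMOVNT, e.g.
   _mm256_stream_si256); a plain store keeps the sketch portable.  */
static inline void
nt_store (void *dst, vec_t v)
{
  memcpy (dst, &v, sizeof v);
}

/* Copy npairs * 2 * PAGE_SIZE bytes, reading one vector from each of
   two pages per step so the two streams advance together.  */
static void
copy_2_pages_interleaved (unsigned char *dst, const unsigned char *src,
			  size_t npairs)
{
  for (size_t p = 0; p < npairs; p++)
    {
      for (size_t off = 0; off < PAGE_SIZE; off += sizeof (vec_t))
	{
	  vec_t v0, v1;
	  memcpy (&v0, src + off, sizeof v0);
	  memcpy (&v1, src + PAGE_SIZE + off, sizeof v1);
	  nt_store (dst + off, v0);
	  nt_store (dst + PAGE_SIZE + off, v1);
	}
      src += 2 * PAGE_SIZE;
      dst += 2 * PAGE_SIZE;
    }
}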


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v6 2/2] x86: Expanding test-memmove.c, test-memcpy.c, bench-memcpy-large.c
  2021-04-02 20:26 [PATCH v6 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S Noah Goldstein
@ 2021-04-02 20:26 ` Noah Goldstein
  2021-04-02 20:35   ` H.J. Lu
  2021-04-02 22:08 ` [PATCH v6 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S H.J. Lu
  1 sibling, 1 reply; 7+ messages in thread
From: Noah Goldstein @ 2021-04-02 20:26 UTC (permalink / raw)
  To: libc-alpha

From: noah <goldstein.w.n@gmail.com>

No Bug. This commit expands the range of tests / benchmarks for
memmove and memcpy. The test expansion is mostly in the vein of
increasing the maximum size, increasing the number of unique
alignments tested, and testing both source < destination and vice
versa. The benchmark expansion is just to increase the number of
unique alignments. test-memcpy, test-memccpy, test-mempcpy,
test-memmove, and tst-memmove-overflow all pass.
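
[Editorial note, not part of the submitted patch: a simplified,
standalone illustration of the new test shape in do_test1/do_test2.  It
uses plain malloc and the libc memcpy where the real test uses mmap with
a PROT_NONE guard page and iterates over every implementation with
FOR_EACH_IMPL; the point is only the second pass, which swaps which half
of the buffer is source and which is destination so both orderings are
exercised.]

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main (void)
{
  size_t size = 0x100000;
  size_t count = size / sizeof (uint32_t);
  uint32_t *buf = malloc (2 * size);
  if (buf == NULL)
    return 1;
  uint32_t *dest = buf;		/* first half  */
  uint32_t *src = buf + count;	/* second half */
  for (int repeats = 0; repeats < 2; repeats++)
    {
      for (size_t i = 0; i < count; i++)
	src[i] = (uint32_t) i;
      memset (dest, -1, size);
      memcpy (dest, src, size);
      for (size_t i = 0; i < count; i++)
	if (dest[i] != src[i])
	  {
	    printf ("mismatch at offset %zu\n", i);
	    return 1;
	  }
      /* Second pass: copy in the other direction.  */
      dest = src;
      src = buf;
    }
  free (buf);
  return 0;
}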

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 benchtests/bench-memcpy-large.c |  8 +++-
 string/test-memcpy.c            | 61 ++++++++++++++++------------
 string/test-memmove.c           | 70 ++++++++++++++++++++-------------
 3 files changed, 83 insertions(+), 56 deletions(-)

diff --git a/benchtests/bench-memcpy-large.c b/benchtests/bench-memcpy-large.c
index 3df1575514..efb9627b1e 100644
--- a/benchtests/bench-memcpy-large.c
+++ b/benchtests/bench-memcpy-large.c
@@ -57,11 +57,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= 4095;
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= 4095;
   if (align2 + len >= page_size)
     return;
 
@@ -113,6 +113,10 @@ test_main (void)
       do_test (&json_ctx, 0, 3, i + 15);
       do_test (&json_ctx, 3, 0, i + 31);
       do_test (&json_ctx, 3, 5, i + 63);
+      do_test (&json_ctx, 0, 127, i);
+      do_test (&json_ctx, 0, 255, i);
+      do_test (&json_ctx, 0, 256, i);
+      do_test (&json_ctx, 0, 4064, i);
     }
 
   json_array_end (&json_ctx);
diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index 2e9c6bd099..c9dfc88fed 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -82,11 +82,11 @@ do_test (size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= 4095;
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= 4095;
   if (align2 + len >= page_size)
     return;
 
@@ -213,11 +213,9 @@ do_random_tests (void)
 }
 
 static void
-do_test1 (void)
+do_test1 (size_t size)
 {
-  size_t size = 0x100000;
   void *large_buf;
-
   large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
 		    MAP_PRIVATE | MAP_ANON, -1, 0);
   if (large_buf == MAP_FAILED)
@@ -233,27 +231,32 @@ do_test1 (void)
   uint32_t *dest = large_buf;
   uint32_t *src = large_buf + size + page_size;
   size_t i;
-
-  for (i = 0; i < arrary_size; i++)
-    src[i] = (uint32_t) i;
-
-  FOR_EACH_IMPL (impl, 0)
+  size_t repeats;
+  for(repeats = 0; repeats < 2; repeats++)
     {
-      memset (dest, -1, size);
-      CALL (impl, (char *) dest, (char *) src, size);
       for (i = 0; i < arrary_size; i++)
-	if (dest[i] != src[i])
-	  {
-	    error (0, 0,
-		   "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
-		   impl->name, dest, src, i);
-	    ret = 1;
-	    break;
-	  }
+        src[i] = (uint32_t) i;
+
+      FOR_EACH_IMPL (impl, 0)
+        {
+            printf ("\t\tRunning: %s\n", impl->name);
+          memset (dest, -1, size);
+          CALL (impl, (char *) dest, (char *) src, size);
+          for (i = 0; i < arrary_size; i++)
+        if (dest[i] != src[i])
+          {
+            error (0, 0,
+               "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
+               impl->name, dest, src, i);
+            ret = 1;
+            munmap ((void *) large_buf, size * 2 + page_size);
+            return;
+          }
+        }
+      dest = src;
+      src = large_buf;
     }
-
-  munmap ((void *) dest, size);
-  munmap ((void *) src, size);
+  munmap ((void *) large_buf, size * 2 + page_size);
 }
 
 int
@@ -275,7 +278,6 @@ test_main (void)
       do_test (0, i, 1 << i);
       do_test (i, i, 1 << i);
     }
-
   for (i = 0; i < 32; ++i)
     {
       do_test (0, 0, i);
@@ -294,12 +296,19 @@ test_main (void)
       do_test (i, i, 16 * i);
     }
 
+  for (i = 19; i <= 25; ++i)
+    {
+      do_test (255, 0, 1 << i);
+      do_test (0, 255, i);
+      do_test (0, 4000, i);
+    }
+
   do_test (0, 0, getpagesize ());
 
   do_random_tests ();
 
-  do_test1 ();
-
+  do_test1 (0x100000);
+  do_test1 (0x2000000);
   return ret;
 }
 
diff --git a/string/test-memmove.c b/string/test-memmove.c
index 2e3ce75b9b..ff8099d12f 100644
--- a/string/test-memmove.c
+++ b/string/test-memmove.c
@@ -247,7 +247,7 @@ do_random_tests (void)
 }
 
 static void
-do_test2 (void)
+do_test2 (size_t offset)
 {
   size_t size = 0x20000000;
   uint32_t * large_buf;
@@ -268,33 +268,45 @@ do_test2 (void)
     }
 
   size_t bytes_move = 0x80000000 - (uintptr_t) large_buf;
+  if (bytes_move + offset * sizeof (uint32_t) > size)
+    {
+      munmap ((void *) large_buf, size);
+      return;
+    }
   size_t arr_size = bytes_move / sizeof (uint32_t);
   size_t i;
-
-  FOR_EACH_IMPL (impl, 0)
-    {
-      for (i = 0; i < arr_size; i++)
-        large_buf[i] = (uint32_t) i;
-
-      uint32_t * dst = &large_buf[33];
-
-#ifdef TEST_BCOPY
-      CALL (impl, (char *) large_buf, (char *) dst, bytes_move);
-#else
-      CALL (impl, (char *) dst, (char *) large_buf, bytes_move);
-#endif
-
-      for (i = 0; i < arr_size; i++)
-	{
-	  if (dst[i] != (uint32_t) i)
-	    {
-	      error (0, 0,
-		     "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
-		     impl->name, dst, large_buf, i);
-	      ret = 1;
-	      break;
-	    }
-	}
+  size_t repeats;
+  uint32_t * src = large_buf;
+  uint32_t * dst = &large_buf[offset];
+  for (repeats = 0; repeats < 2; ++repeats)
+    {      
+      FOR_EACH_IMPL (impl, 0)
+        {
+          for (i = 0; i < arr_size; i++)
+            src[i] = (uint32_t) i;
+
+
+    #ifdef TEST_BCOPY
+          CALL (impl, (char *) src, (char *) dst, bytes_move);
+    #else
+          CALL (impl, (char *) dst, (char *) src, bytes_move);
+    #endif
+
+          for (i = 0; i < arr_size; i++)
+        {
+          if (dst[i] != (uint32_t) i)
+            {
+              error (0, 0,
+                 "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
+                 impl->name, dst, large_buf, i);
+              ret = 1;
+              munmap ((void *) large_buf, size);
+              return;
+            }
+        }
+        }
+      src = dst;
+      dst = large_buf;
     }
 
   munmap ((void *) large_buf, size);
@@ -340,8 +352,10 @@ test_main (void)
 
   do_random_tests ();
 
-  do_test2 ();
-
+  do_test2 (33);
+  do_test2 (0x200000);
+  do_test2 (0x4000000 - 1);
+  do_test2 (0x4000000);
   return ret;
 }
 
-- 
2.29.2


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v6 2/2] x86: Expanding test-memmove.c, test-memcpy.c, bench-memcpy-large.c
  2021-04-02 20:26 ` [PATCH v6 2/2] x86: Expanding test-memmove.c, test-memcpy.c, bench-memcpy-large.c Noah Goldstein
@ 2021-04-02 20:35   ` H.J. Lu
  2021-04-02 21:35     ` Noah Goldstein
  0 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2021-04-02 20:35 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Fri, Apr 2, 2021 at 1:26 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> From: noah <goldstein.w.n@gmail.com>
>
> No Bug. This commit expands the range of tests / benchmarks for
> memmove and memcpy. The test expansion is mostly in the vein of
> increasing the maximum size, increasing the number of unique
> alignments tested, and testing both source < destination and vice
> versa. The benchmark expansion is just to increase the number of
> unique alignments. test-memcpy, test-memccpy, test-mempcpy,
> test-memmove, and tst-memmove-overflow all pass.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  benchtests/bench-memcpy-large.c |  8 +++-
>  string/test-memcpy.c            | 61 ++++++++++++++++------------
>  string/test-memmove.c           | 70 ++++++++++++++++++++-------------
>  3 files changed, 83 insertions(+), 56 deletions(-)
>
> diff --git a/benchtests/bench-memcpy-large.c b/benchtests/bench-memcpy-large.c
> index 3df1575514..efb9627b1e 100644
> --- a/benchtests/bench-memcpy-large.c
> +++ b/benchtests/bench-memcpy-large.c
> @@ -57,11 +57,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
>    size_t i, j;
>    char *s1, *s2;
>
> -  align1 &= 63;
> +  align1 &= 4095;
>    if (align1 + len >= page_size)
>      return;
>
> -  align2 &= 63;
> +  align2 &= 4095;
>    if (align2 + len >= page_size)
>      return;
>
> @@ -113,6 +113,10 @@ test_main (void)
>        do_test (&json_ctx, 0, 3, i + 15);
>        do_test (&json_ctx, 3, 0, i + 31);
>        do_test (&json_ctx, 3, 5, i + 63);
> +      do_test (&json_ctx, 0, 127, i);
> +      do_test (&json_ctx, 0, 255, i);
> +      do_test (&json_ctx, 0, 256, i);
> +      do_test (&json_ctx, 0, 4064, i);
>      }
>
>    json_array_end (&json_ctx);
> diff --git a/string/test-memcpy.c b/string/test-memcpy.c
> index 2e9c6bd099..c9dfc88fed 100644
> --- a/string/test-memcpy.c
> +++ b/string/test-memcpy.c
> @@ -82,11 +82,11 @@ do_test (size_t align1, size_t align2, size_t len)
>    size_t i, j;
>    char *s1, *s2;
>
> -  align1 &= 63;
> +  align1 &= 4095;
>    if (align1 + len >= page_size)
>      return;
>
> -  align2 &= 63;
> +  align2 &= 4095;
>    if (align2 + len >= page_size)
>      return;
>
> @@ -213,11 +213,9 @@ do_random_tests (void)
>  }
>
>  static void
> -do_test1 (void)
> +do_test1 (size_t size)
>  {
> -  size_t size = 0x100000;
>    void *large_buf;
> -
>    large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
>                     MAP_PRIVATE | MAP_ANON, -1, 0);
>    if (large_buf == MAP_FAILED)
> @@ -233,27 +231,32 @@ do_test1 (void)
>    uint32_t *dest = large_buf;
>    uint32_t *src = large_buf + size + page_size;
>    size_t i;
> -
> -  for (i = 0; i < arrary_size; i++)
> -    src[i] = (uint32_t) i;
> -
> -  FOR_EACH_IMPL (impl, 0)
> +  size_t repeats;
> +  for(repeats = 0; repeats < 2; repeats++)
>      {
> -      memset (dest, -1, size);
> -      CALL (impl, (char *) dest, (char *) src, size);
>        for (i = 0; i < arrary_size; i++)
> -       if (dest[i] != src[i])
> -         {
> -           error (0, 0,
> -                  "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> -                  impl->name, dest, src, i);
> -           ret = 1;
> -           break;
> -         }
> +        src[i] = (uint32_t) i;
> +
> +      FOR_EACH_IMPL (impl, 0)
> +        {
> +            printf ("\t\tRunning: %s\n", impl->name);
> +          memset (dest, -1, size);
> +          CALL (impl, (char *) dest, (char *) src, size);
> +          for (i = 0; i < arrary_size; i++)
> +        if (dest[i] != src[i])
> +          {
> +            error (0, 0,
> +               "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> +               impl->name, dest, src, i);
> +            ret = 1;
> +            munmap ((void *) large_buf, size * 2 + page_size);
> +            return;
> +          }
> +        }
> +      dest = src;
> +      src = large_buf;
>      }
> -
> -  munmap ((void *) dest, size);
> -  munmap ((void *) src, size);
> +  munmap ((void *) large_buf, size * 2 + page_size);
>  }
>
>  int
> @@ -275,7 +278,6 @@ test_main (void)
>        do_test (0, i, 1 << i);
>        do_test (i, i, 1 << i);
>      }
> -
>    for (i = 0; i < 32; ++i)
>      {
>        do_test (0, 0, i);
> @@ -294,12 +296,19 @@ test_main (void)
>        do_test (i, i, 16 * i);
>      }
>
> +  for (i = 19; i <= 25; ++i)
> +    {
> +      do_test (255, 0, 1 << i);
> +      do_test (0, 255, i);
> +      do_test (0, 4000, i);
> +    }
> +
>    do_test (0, 0, getpagesize ());
>
>    do_random_tests ();
>
> -  do_test1 ();
> -
> +  do_test1 (0x100000);
> +  do_test1 (0x2000000);
>    return ret;
>  }
>
> diff --git a/string/test-memmove.c b/string/test-memmove.c
> index 2e3ce75b9b..ff8099d12f 100644
> --- a/string/test-memmove.c
> +++ b/string/test-memmove.c
> @@ -247,7 +247,7 @@ do_random_tests (void)
>  }
>
>  static void
> -do_test2 (void)
> +do_test2 (size_t offset)
>  {
>    size_t size = 0x20000000;
>    uint32_t * large_buf;
> @@ -268,33 +268,45 @@ do_test2 (void)
>      }
>
>    size_t bytes_move = 0x80000000 - (uintptr_t) large_buf;
> +  if (bytes_move + offset * sizeof (uint32_t) > size)
> +    {
> +      munmap ((void *) large_buf, size);
> +      return;
> +    }
>    size_t arr_size = bytes_move / sizeof (uint32_t);
>    size_t i;
> -
> -  FOR_EACH_IMPL (impl, 0)
> -    {
> -      for (i = 0; i < arr_size; i++)
> -        large_buf[i] = (uint32_t) i;
> -
> -      uint32_t * dst = &large_buf[33];
> -
> -#ifdef TEST_BCOPY
> -      CALL (impl, (char *) large_buf, (char *) dst, bytes_move);
> -#else
> -      CALL (impl, (char *) dst, (char *) large_buf, bytes_move);
> -#endif
> -
> -      for (i = 0; i < arr_size; i++)
> -       {
> -         if (dst[i] != (uint32_t) i)
> -           {
> -             error (0, 0,
> -                    "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> -                    impl->name, dst, large_buf, i);
> -             ret = 1;
> -             break;
> -           }
> -       }
> +  size_t repeats;
> +  uint32_t * src = large_buf;
> +  uint32_t * dst = &large_buf[offset];
> +  for (repeats = 0; repeats < 2; ++repeats)
> +    {
            ^  Trailing whitespaces.

> +      FOR_EACH_IMPL (impl, 0)
> +        {
> +          for (i = 0; i < arr_size; i++)
> +            src[i] = (uint32_t) i;
> +
> +
> +    #ifdef TEST_BCOPY
> +          CALL (impl, (char *) src, (char *) dst, bytes_move);
> +    #else
> +          CALL (impl, (char *) dst, (char *) src, bytes_move);
> +    #endif
> +
> +          for (i = 0; i < arr_size; i++)
> +        {
> +          if (dst[i] != (uint32_t) i)
> +            {
> +              error (0, 0,
> +                 "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> +                 impl->name, dst, large_buf, i);
> +              ret = 1;
> +              munmap ((void *) large_buf, size);
> +              return;
> +            }
> +        }
> +        }
> +      src = dst;
> +      dst = large_buf;
>      }
>
>    munmap ((void *) large_buf, size);
> @@ -340,8 +352,10 @@ test_main (void)
>
>    do_random_tests ();
>
> -  do_test2 ();
> -
> +  do_test2 (33);
> +  do_test2 (0x200000);
> +  do_test2 (0x4000000 - 1);
> +  do_test2 (0x4000000);
>    return ret;
>  }
>
> --
> 2.29.2
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v6 2/2] x86: Expanding test-memmove.c, test-memcpy.c, bench-memcpy-large.c
  2021-04-02 20:35   ` H.J. Lu
@ 2021-04-02 21:35     ` Noah Goldstein
  0 siblings, 0 replies; 7+ messages in thread
From: Noah Goldstein @ 2021-04-02 21:35 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Fri, Apr 2, 2021 at 4:36 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Apr 2, 2021 at 1:26 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > From: noah <goldstein.w.n@gmail.com>
> >
> > No Bug. This commit expands the range of tests / benchmarks for
> > memmove and memcpy. The test expansion is mostly in the vein of
> > increasing the maximum size, increasing the number of unique
> > alignments tested, and testing both source < destination and vice
> > versa. The benchmark expansion is just to increase the number of
> > unique alignments. test-memcpy, test-memccpy, test-mempcpy,
> > test-memmove, and tst-memmove-overflow all pass.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> >  benchtests/bench-memcpy-large.c |  8 +++-
> >  string/test-memcpy.c            | 61 ++++++++++++++++------------
> >  string/test-memmove.c           | 70 ++++++++++++++++++++-------------
> >  3 files changed, 83 insertions(+), 56 deletions(-)
> >
> > diff --git a/benchtests/bench-memcpy-large.c b/benchtests/bench-memcpy-large.c
> > index 3df1575514..efb9627b1e 100644
> > --- a/benchtests/bench-memcpy-large.c
> > +++ b/benchtests/bench-memcpy-large.c
> > @@ -57,11 +57,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
> >    size_t i, j;
> >    char *s1, *s2;
> >
> > -  align1 &= 63;
> > +  align1 &= 4095;
> >    if (align1 + len >= page_size)
> >      return;
> >
> > -  align2 &= 63;
> > +  align2 &= 4095;
> >    if (align2 + len >= page_size)
> >      return;
> >
> > @@ -113,6 +113,10 @@ test_main (void)
> >        do_test (&json_ctx, 0, 3, i + 15);
> >        do_test (&json_ctx, 3, 0, i + 31);
> >        do_test (&json_ctx, 3, 5, i + 63);
> > +      do_test (&json_ctx, 0, 127, i);
> > +      do_test (&json_ctx, 0, 255, i);
> > +      do_test (&json_ctx, 0, 256, i);
> > +      do_test (&json_ctx, 0, 4064, i);
> >      }
> >
> >    json_array_end (&json_ctx);
> > diff --git a/string/test-memcpy.c b/string/test-memcpy.c
> > index 2e9c6bd099..c9dfc88fed 100644
> > --- a/string/test-memcpy.c
> > +++ b/string/test-memcpy.c
> > @@ -82,11 +82,11 @@ do_test (size_t align1, size_t align2, size_t len)
> >    size_t i, j;
> >    char *s1, *s2;
> >
> > -  align1 &= 63;
> > +  align1 &= 4095;
> >    if (align1 + len >= page_size)
> >      return;
> >
> > -  align2 &= 63;
> > +  align2 &= 4095;
> >    if (align2 + len >= page_size)
> >      return;
> >
> > @@ -213,11 +213,9 @@ do_random_tests (void)
> >  }
> >
> >  static void
> > -do_test1 (void)
> > +do_test1 (size_t size)
> >  {
> > -  size_t size = 0x100000;
> >    void *large_buf;
> > -
> >    large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
> >                     MAP_PRIVATE | MAP_ANON, -1, 0);
> >    if (large_buf == MAP_FAILED)
> > @@ -233,27 +231,32 @@ do_test1 (void)
> >    uint32_t *dest = large_buf;
> >    uint32_t *src = large_buf + size + page_size;
> >    size_t i;
> > -
> > -  for (i = 0; i < arrary_size; i++)
> > -    src[i] = (uint32_t) i;
> > -
> > -  FOR_EACH_IMPL (impl, 0)
> > +  size_t repeats;
> > +  for(repeats = 0; repeats < 2; repeats++)
> >      {
> > -      memset (dest, -1, size);
> > -      CALL (impl, (char *) dest, (char *) src, size);
> >        for (i = 0; i < arrary_size; i++)
> > -       if (dest[i] != src[i])
> > -         {
> > -           error (0, 0,
> > -                  "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> > -                  impl->name, dest, src, i);
> > -           ret = 1;
> > -           break;
> > -         }
> > +        src[i] = (uint32_t) i;
> > +
> > +      FOR_EACH_IMPL (impl, 0)
> > +        {
> > +            printf ("\t\tRunning: %s\n", impl->name);
> > +          memset (dest, -1, size);
> > +          CALL (impl, (char *) dest, (char *) src, size);
> > +          for (i = 0; i < arrary_size; i++)
> > +        if (dest[i] != src[i])
> > +          {
> > +            error (0, 0,
> > +               "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> > +               impl->name, dest, src, i);
> > +            ret = 1;
> > +            munmap ((void *) large_buf, size * 2 + page_size);
> > +            return;
> > +          }
> > +        }
> > +      dest = src;
> > +      src = large_buf;
> >      }
> > -
> > -  munmap ((void *) dest, size);
> > -  munmap ((void *) src, size);
> > +  munmap ((void *) large_buf, size * 2 + page_size);
> >  }
> >
> >  int
> > @@ -275,7 +278,6 @@ test_main (void)
> >        do_test (0, i, 1 << i);
> >        do_test (i, i, 1 << i);
> >      }
> > -
> >    for (i = 0; i < 32; ++i)
> >      {
> >        do_test (0, 0, i);
> > @@ -294,12 +296,19 @@ test_main (void)
> >        do_test (i, i, 16 * i);
> >      }
> >
> > +  for (i = 19; i <= 25; ++i)
> > +    {
> > +      do_test (255, 0, 1 << i);
> > +      do_test (0, 255, i);
> > +      do_test (0, 4000, i);
> > +    }
> > +
> >    do_test (0, 0, getpagesize ());
> >
> >    do_random_tests ();
> >
> > -  do_test1 ();
> > -
> > +  do_test1 (0x100000);
> > +  do_test1 (0x2000000);
> >    return ret;
> >  }
> >
> > diff --git a/string/test-memmove.c b/string/test-memmove.c
> > index 2e3ce75b9b..ff8099d12f 100644
> > --- a/string/test-memmove.c
> > +++ b/string/test-memmove.c
> > @@ -247,7 +247,7 @@ do_random_tests (void)
> >  }
> >
> >  static void
> > -do_test2 (void)
> > +do_test2 (size_t offset)
> >  {
> >    size_t size = 0x20000000;
> >    uint32_t * large_buf;
> > @@ -268,33 +268,45 @@ do_test2 (void)
> >      }
> >
> >    size_t bytes_move = 0x80000000 - (uintptr_t) large_buf;
> > +  if (bytes_move + offset * sizeof (uint32_t) > size)
> > +    {
> > +      munmap ((void *) large_buf, size);
> > +      return;
> > +    }
> >    size_t arr_size = bytes_move / sizeof (uint32_t);
> >    size_t i;
> > -
> > -  FOR_EACH_IMPL (impl, 0)
> > -    {
> > -      for (i = 0; i < arr_size; i++)
> > -        large_buf[i] = (uint32_t) i;
> > -
> > -      uint32_t * dst = &large_buf[33];
> > -
> > -#ifdef TEST_BCOPY
> > -      CALL (impl, (char *) large_buf, (char *) dst, bytes_move);
> > -#else
> > -      CALL (impl, (char *) dst, (char *) large_buf, bytes_move);
> > -#endif
> > -
> > -      for (i = 0; i < arr_size; i++)
> > -       {
> > -         if (dst[i] != (uint32_t) i)
> > -           {
> > -             error (0, 0,
> > -                    "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> > -                    impl->name, dst, large_buf, i);
> > -             ret = 1;
> > -             break;
> > -           }
> > -       }
> > +  size_t repeats;
> > +  uint32_t * src = large_buf;
> > +  uint32_t * dst = &large_buf[offset];
> > +  for (repeats = 0; repeats < 2; ++repeats)
> > +    {
>             ^  Trailing whitespaces.

fixed.
>
> > +      FOR_EACH_IMPL (impl, 0)
> > +        {
> > +          for (i = 0; i < arr_size; i++)
> > +            src[i] = (uint32_t) i;
> > +
> > +
> > +    #ifdef TEST_BCOPY
> > +          CALL (impl, (char *) src, (char *) dst, bytes_move);
> > +    #else
> > +          CALL (impl, (char *) dst, (char *) src, bytes_move);
> > +    #endif
> > +
> > +          for (i = 0; i < arr_size; i++)
> > +        {
> > +          if (dst[i] != (uint32_t) i)
> > +            {
> > +              error (0, 0,
> > +                 "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
> > +                 impl->name, dst, large_buf, i);
> > +              ret = 1;
> > +              munmap ((void *) large_buf, size);
> > +              return;
> > +            }
> > +        }
> > +        }
> > +      src = dst;
> > +      dst = large_buf;
> >      }
> >
> >    munmap ((void *) large_buf, size);
> > @@ -340,8 +352,10 @@ test_main (void)
> >
> >    do_random_tests ();
> >
> > -  do_test2 ();
> > -
> > +  do_test2 (33);
> > +  do_test2 (0x200000);
> > +  do_test2 (0x4000000 - 1);
> > +  do_test2 (0x4000000);
> >    return ret;
> >  }
> >
> > --
> > 2.29.2
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v6 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
  2021-04-02 20:26 [PATCH v6 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S Noah Goldstein
  2021-04-02 20:26 ` [PATCH v6 2/2] x86: Expanding test-memmove.c, test-memcpy.c, bench-memcpy-large.c Noah Goldstein
@ 2021-04-02 22:08 ` H.J. Lu
  2021-04-02 23:09   ` Noah Goldstein
  1 sibling, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2021-04-02 22:08 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos, hjl.tools

On Fri, Apr 02, 2021 at 04:26:42PM -0400, Noah Goldstein wrote:
> From: noah <goldstein.w.n@gmail.com>
> 
> No Bug. This commit updates the large memcpy case (no overlap). The
> update is to perform memcpy on either 2 or 4 contiguous pages at
> once. This 1) helps to alleviate the effects of false memory aliasing
> when destination and source have a close 4k alignment and 2) in most
> cases and for most DRAM units is a modestly more efficient access
> pattern. These changes are a clear performance improvement for
> VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
> test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
> pass.
> 
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  .../multiarch/memmove-vec-unaligned-erms.S    | 307 ++++++++++++++----
>  1 file changed, 245 insertions(+), 62 deletions(-)
> 
> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index 897a3d9762..9c7f95c653 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -35,7 +35,16 @@
>        __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
>     7. If size >= __x86_shared_non_temporal_threshold and there is no
>        overlap between destination and source, use non-temporal store
> -      instead of aligned store.  */
> +      instead of aligned store copying from either 2 or 4 pages at
> +      once.
> +   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
> +      and source and destination do not page alias, copy from 2 pages
> +      at once using non-temporal stores. Page aliasing in this case is
> +      considered true if destination's page alignment - source's page
> +      alignment is less than 8 * VEC_SIZE.
> +   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
> +      and destination do page alias copy from 4 pages at once using
> +      non-temporal stores.  */
>  
>  #include <sysdep.h>
>  
> @@ -67,6 +76,34 @@
>  # endif
>  #endif
>  
> +#ifndef PAGE_SIZE
> +# define PAGE_SIZE 4096
> +#endif
> +
> +#if PAGE_SIZE != 4096
> +# error Unsupported PAGE_SIZE
> +#endif
> +
> +#ifndef LOG_PAGE_SIZE
> +# define LOG_PAGE_SIZE 12
> +#endif
> +
> +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
> +# error Invalid LOG_PAGE_SIZE
> +#endif
> +
> +/* Byte per page for large_memcpy inner loop.  */
> +#if VEC_SIZE == 64
> +# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
> +#else
> +# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
> +#endif
> +
> +/* Amount to shift rdx by to compare for memcpy_large_4x.  */
> +#ifndef LOG_4X_MEMCPY_THRESH
> +# define LOG_4X_MEMCPY_THRESH 4
> +#endif
> +
>  /* Avoid short distance rep movsb only with non-SSE vector.  */
>  #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
>  # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
> @@ -106,6 +143,28 @@
>  # error Unsupported PREFETCH_SIZE!
>  #endif
>  
> +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
> +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
> +	VMOVU	(offset)base, vec0; \
> +	VMOVU	((offset) + VEC_SIZE)base, vec1;
> +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
> +	VMOVNT  vec0, (offset)base; \
> +	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
> +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
> +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> +	VMOVU	(offset)base, vec0; \
> +	VMOVU	((offset) + VEC_SIZE)base, vec1; \
> +	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
> +	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
> +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> +	VMOVNT	vec0, (offset)base; \
> +	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
> +	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
> +	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
> +#else
> +# error Invalid LARGE_LOAD_SIZE
> +#endif
> +
>  #ifndef SECTION
>  # error SECTION is not defined!
>  #endif
> @@ -393,6 +452,15 @@ L(last_4x_vec):
>  	VZEROUPPER_RETURN
>  
>  L(more_8x_vec):
> +	/* Check if non-temporal move candidate.  */
> +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> +	/* Check non-temporal store threshold.  */
> +	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> +	ja	L(large_memcpy_2x)
> +#endif
> +	/* Entry if rdx is greater than non-temporal threshold but there
> +       is overlap.  */
> +L(more_8x_vec_check):
>  	cmpq	%rsi, %rdi
>  	ja	L(more_8x_vec_backward)
>  	/* Source == destination is less common.  */
> @@ -419,11 +487,6 @@ L(more_8x_vec):
>  	subq	%r8, %rdi
>  	/* Adjust length.  */
>  	addq	%r8, %rdx
> -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -	/* Check non-temporal store threshold.  */
> -	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
> -	ja	L(large_forward)
> -#endif
>  L(loop_4x_vec_forward):
>  	/* Copy 4 * VEC a time forward.  */
>  	VMOVU	(%rsi), %VEC(0)
> @@ -470,11 +533,6 @@ L(more_8x_vec_backward):
>  	subq	%r8, %r9
>  	/* Adjust length.  */
>  	subq	%r8, %rdx
> -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -	/* Check non-temporal store threshold.  */
> -	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
> -	ja	L(large_backward)
> -#endif
>  L(loop_4x_vec_backward):
>  	/* Copy 4 * VEC a time backward.  */
>  	VMOVU	(%rcx), %VEC(0)
> @@ -500,72 +558,197 @@ L(loop_4x_vec_backward):
>  	VZEROUPPER_RETURN
>  
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -L(large_forward):
> +L(large_memcpy_2x):
> +	/* Compute absolute value of difference between source and
> +	   destination.  */
> +	movq	%rdi, %r9
> +	subq	%rsi, %r9
> +	movq	%r9, %r8
> +	leaq	-1(%r9), %rcx
> +	sarq	$63, %r8
> +	xorq	%r8, %r9
> +	subq	%r8, %r9
>  	/* Don't use non-temporal store if there is overlap between
> -	   destination and source since destination may be in cache
> -	   when source is loaded.  */
> -	leaq    (%rdi, %rdx), %r10
> -	cmpq    %r10, %rsi
> -	jb	L(loop_4x_vec_forward)
> -L(loop_large_forward):
> +	   destination and source since destination may be in cache when
> +	   source is loaded.  */
> +	cmpq	%r9, %rdx
> +	ja	L(more_8x_vec_check)
> +
> +	/* Cache align destination. First store the first 64 bytes then
> +	   adjust alignments.  */
> +	VMOVU	(%rsi), %VEC(8)
> +#if VEC_SIZE < 64
> +	VMOVU	VEC_SIZE(%rsi), %VEC(9)
> +#if VEC_SIZE < 32
> +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
> +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
> +#endif
> +#endif
> +	VMOVU	%VEC(8), (%rdi)
> +#if VEC_SIZE < 64
> +	VMOVU	%VEC(9), VEC_SIZE(%rdi)
> +#if VEC_SIZE < 32
> +	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
> +	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
> +#endif
> +#endif
> +	/* Adjust source, destination, and size.  */
> +	movq	%rdi, %r8
> +	andq	$63, %r8
> +	/* Get the negative of offset for alignment.  */
> +	subq	$64, %r8
> +	/* Adjust source.  */
> +	subq	%r8, %rsi
> +	/* Adjust destination which should be aligned now.  */
> +	subq	%r8, %rdi
> +	/* Adjust length.  */
> +	addq	%r8, %rdx
> +
> +	/* Test if source and destination addresses will alias. If they do,
> +	   the larger pipeline in large_memcpy_4x alleviates the
> +	   performance drop.  */
> +	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
> +	jz	L(large_memcpy_4x)
> +
> +	movq	%rdx, %r10
> +	shrq	$LOG_4X_MEMCPY_THRESH, %r10
> +	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
> +	jae	L(large_memcpy_4x)
> +
> +	/* edx will store remainder size for copying tail.  */
> +	andl	$(PAGE_SIZE * 2 - 1), %edx
> +	/* r10 stores outer loop counter.  */
> +	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
> +	/* Copy 4x VEC at a time from 2 pages.  */
> +L(loop_large_memcpy_2x_outer):
> +	/* ecx stores inner loop counter.  */
> +	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> +L(loop_large_memcpy_2x_inner):
> +	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> +	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
> +	/* Load vectors from rsi.  */
> +	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> +	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +	addq	$LARGE_LOAD_SIZE, %rsi
> +	/* Non-temporal store vectors to rdi.  */
> +	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> +	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +	addq	$LARGE_LOAD_SIZE, %rdi
> +	decl	%ecx
> +	jnz	L(loop_large_memcpy_2x_inner)
> +	addq	$PAGE_SIZE, %rdi
> +	addq	$PAGE_SIZE, %rsi
> +	decq	%r10
> +	jne	L(loop_large_memcpy_2x_outer)
> +
> +	/* Check if only last 4 loads are needed.  */
> +	cmpl	$(VEC_SIZE * 4), %edx
> +	jbe	L(large_memcpy_2x_end)
> +
> +	/* Handle the last 2 * PAGE_SIZE bytes.  */
> +L(loop_large_memcpy_2x_tail):
>  	/* Copy 4 * VEC a time forward with non-temporal stores.  */
> -	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> -	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
> +	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
>  	VMOVU	(%rsi), %VEC(0)
>  	VMOVU	VEC_SIZE(%rsi), %VEC(1)
>  	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
>  	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
> -	addq	$PREFETCHED_LOAD_SIZE, %rsi
> -	subq	$PREFETCHED_LOAD_SIZE, %rdx
> +	addq	$(VEC_SIZE * 4), %rsi
> +	subl	$(VEC_SIZE * 4), %edx
>  	VMOVNT	%VEC(0), (%rdi)
>  	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
>  	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
>  	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
> -	addq	$PREFETCHED_LOAD_SIZE, %rdi
> -	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
> -	ja	L(loop_large_forward)
> +	addq	$(VEC_SIZE * 4), %rdi
> +	cmpl	$(VEC_SIZE * 4), %edx
> +	ja	L(loop_large_memcpy_2x_tail)
> +
> +L(large_memcpy_2x_end):
>  	sfence
>  	/* Store the last 4 * VEC.  */
> -	VMOVU	%VEC(5), (%rcx)
> -	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
> -	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
> -	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
> -	/* Store the first VEC.  */
> -	VMOVU	%VEC(4), (%r11)
> +	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> +	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
> +
> +	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> +	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> +	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> +	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
>  	VZEROUPPER_RETURN
>  
> -L(large_backward):
> -	/* Don't use non-temporal store if there is overlap between
> -	   destination and source since destination may be in cache
> -	   when source is loaded.  */
> -	leaq    (%rcx, %rdx), %r10
> -	cmpq    %r10, %r9
> -	jb	L(loop_4x_vec_backward)
> -L(loop_large_backward):
> -	/* Copy 4 * VEC a time backward with non-temporal stores.  */
> -	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
> -	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
> -	VMOVU	(%rcx), %VEC(0)
> -	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
> -	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
> -	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
> -	subq	$PREFETCHED_LOAD_SIZE, %rcx
> -	subq	$PREFETCHED_LOAD_SIZE, %rdx
> -	VMOVNT	%VEC(0), (%r9)
> -	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
> -	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
> -	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
> -	subq	$PREFETCHED_LOAD_SIZE, %r9
> -	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
> -	ja	L(loop_large_backward)
> +L(large_memcpy_4x):
> +	movq	%rdx, %r10
> +	/* edx will store remainder size for copying tail.  */
> +	andl	$(PAGE_SIZE * 4 - 1), %edx
> +	/* r10 stores outer loop counter.  */
> +	shrq	$(LOG_PAGE_SIZE + 2), %r10
> +	/* Copy 4x VEC at a time from 4 pages.  */
> +L(loop_large_memcpy_4x_outer):
> +	/* ecx stores inner loop counter.  */
> +	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> +L(loop_large_memcpy_4x_inner):
> +	/* Only one prefetch set per page as doing 4 pages gives more time
> +	   for the prefetcher to keep up.  */
> +	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
> +	/* Load vectors from rsi.  */
> +	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> +	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> +	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> +	addq	$LARGE_LOAD_SIZE, %rsi
> +	/* Non-temporal store vectors to rdi.  */
> +	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> +	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> +	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> +	addq	$LARGE_LOAD_SIZE, %rdi
> +	decl	%ecx
> +	jnz	L(loop_large_memcpy_4x_inner)
> +	addq	$(PAGE_SIZE * 3), %rdi
> +	addq	$(PAGE_SIZE * 3), %rsi
> +	decq	%r10
> +	jne	L(loop_large_memcpy_4x_outer)
> +
> +	/* Check if only last 4 loads are needed.  */
> +	cmpl	$(VEC_SIZE * 4), %edx
> +	jbe	L(large_memcpy_4x_end)
> +
> +	/* Handle the last 4 * PAGE_SIZE bytes.  */
> +L(loop_large_memcpy_4x_tail):
> +	/* Copy 4 * VEC a time forward with non-temporal stores.  */
> +	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> +	VMOVU	(%rsi), %VEC(0)
> +	VMOVU	VEC_SIZE(%rsi), %VEC(1)
> +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
> +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
> +	addq	$(VEC_SIZE * 4), %rsi
> +	subl	$(VEC_SIZE * 4), %edx
> +	VMOVNT	%VEC(0), (%rdi)
> +	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
> +	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
> +	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
> +	addq	$(VEC_SIZE * 4), %rdi
> +	cmpl	$(VEC_SIZE * 4), %edx
> +	ja	L(loop_large_memcpy_4x_tail)
> +
> +L(large_memcpy_4x_end):
>  	sfence
> -	/* Store the first 4 * VEC.  */
> -	VMOVU	%VEC(4), (%rdi)
> -	VMOVU	%VEC(5), VEC_SIZE(%rdi)
> -	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
> -	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
> -	/* Store the last VEC.  */
> -	VMOVU	%VEC(8), (%r11)
> +	/* Store the last 4 * VEC.  */
> +	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> +	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
> +
> +	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> +	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> +	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> +	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
>  	VZEROUPPER_RETURN
>  #endif
>  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> -- 
> 2.29.2
> 

I am comparing some __memmove_avx_unaligned_erms numbers in
bench-memmove-large on Tiger Lake.  Most numbers are similar,
except for one case.

Without this patch:

   length=32775, align1=0, align2=64:       455.62

With this patch,

   length=32775, align1=0, align2=64:      1613.81

Did you see similar results?

H.J.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v6 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
  2021-04-02 22:08 ` [PATCH v6 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S H.J. Lu
@ 2021-04-02 23:09   ` Noah Goldstein
  2021-04-02 23:15     ` H.J. Lu
  0 siblings, 1 reply; 7+ messages in thread
From: Noah Goldstein @ 2021-04-02 23:09 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Fri, Apr 2, 2021 at 6:09 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Apr 02, 2021 at 04:26:42PM -0400, Noah Goldstein wrote:
> > From: noah <goldstein.w.n@gmail.com>
> >
> > No Bug. This commit updates the large memcpy case (no overlap). The
> > update is to perform memcpy on either 2 or 4 contiguous pages at
> > once. This 1) helps to alleviate the effects of false memory aliasing
> > when destination and source have a close 4k alignment and 2) in most
> > cases and for most DRAM units is a modestly more efficient access
> > pattern. These changes are a clear performance improvement for
> > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
> > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
> > pass.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> >  .../multiarch/memmove-vec-unaligned-erms.S    | 307 ++++++++++++++----
> >  1 file changed, 245 insertions(+), 62 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > index 897a3d9762..9c7f95c653 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > @@ -35,7 +35,16 @@
> >        __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
> >     7. If size >= __x86_shared_non_temporal_threshold and there is no
> >        overlap between destination and source, use non-temporal store
> > -      instead of aligned store.  */
> > +      instead of aligned store copying from either 2 or 4 pages at
> > +      once.
> > +   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
> > +      and source and destination do not page alias, copy from 2 pages
> > +      at once using non-temporal stores. Page aliasing in this case is
> > +      considered true if destination's page alignment - source's page
> > +      alignment is less than 8 * VEC_SIZE.
> > +   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
> > +      and destination do page alias copy from 4 pages at once using
> > +      non-temporal stores.  */
> >
> >  #include <sysdep.h>
> >
> > @@ -67,6 +76,34 @@
> >  # endif
> >  #endif
> >
> > +#ifndef PAGE_SIZE
> > +# define PAGE_SIZE 4096
> > +#endif
> > +
> > +#if PAGE_SIZE != 4096
> > +# error Unsupported PAGE_SIZE
> > +#endif
> > +
> > +#ifndef LOG_PAGE_SIZE
> > +# define LOG_PAGE_SIZE 12
> > +#endif
> > +
> > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
> > +# error Invalid LOG_PAGE_SIZE
> > +#endif
> > +
> > +/* Byte per page for large_memcpy inner loop.  */
> > +#if VEC_SIZE == 64
> > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
> > +#else
> > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
> > +#endif
> > +
> > +/* Amount to shift rdx by to compare for memcpy_large_4x.  */
> > +#ifndef LOG_4X_MEMCPY_THRESH
> > +# define LOG_4X_MEMCPY_THRESH 4
> > +#endif
> > +
> >  /* Avoid short distance rep movsb only with non-SSE vector.  */
> >  #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
> >  # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
> > @@ -106,6 +143,28 @@
> >  # error Unsupported PREFETCH_SIZE!
> >  #endif
> >
> > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
> > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
> > +     VMOVU   (offset)base, vec0; \
> > +     VMOVU   ((offset) + VEC_SIZE)base, vec1;
> > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
> > +     VMOVNT  vec0, (offset)base; \
> > +     VMOVNT  vec1, ((offset) + VEC_SIZE)base;
> > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
> > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > +     VMOVU   (offset)base, vec0; \
> > +     VMOVU   ((offset) + VEC_SIZE)base, vec1; \
> > +     VMOVU   ((offset) + VEC_SIZE * 2)base, vec2; \
> > +     VMOVU   ((offset) + VEC_SIZE * 3)base, vec3;
> > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > +     VMOVNT  vec0, (offset)base; \
> > +     VMOVNT  vec1, ((offset) + VEC_SIZE)base; \
> > +     VMOVNT  vec2, ((offset) + VEC_SIZE * 2)base; \
> > +     VMOVNT  vec3, ((offset) + VEC_SIZE * 3)base;
> > +#else
> > +# error Invalid LARGE_LOAD_SIZE
> > +#endif
> > +
> >  #ifndef SECTION
> >  # error SECTION is not defined!
> >  #endif
> > @@ -393,6 +452,15 @@ L(last_4x_vec):
> >       VZEROUPPER_RETURN
> >
> >  L(more_8x_vec):
> > +     /* Check if non-temporal move candidate.  */
> > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > +     /* Check non-temporal store threshold.  */
> > +     cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > +     ja      L(large_memcpy_2x)
> > +#endif
> > +     /* Entry if rdx is greater than non-temporal threshold but there
> > +       is overlap.  */
> > +L(more_8x_vec_check):
> >       cmpq    %rsi, %rdi
> >       ja      L(more_8x_vec_backward)
> >       /* Source == destination is less common.  */
> > @@ -419,11 +487,6 @@ L(more_8x_vec):
> >       subq    %r8, %rdi
> >       /* Adjust length.  */
> >       addq    %r8, %rdx
> > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > -     /* Check non-temporal store threshold.  */
> > -     cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > -     ja      L(large_forward)
> > -#endif
> >  L(loop_4x_vec_forward):
> >       /* Copy 4 * VEC a time forward.  */
> >       VMOVU   (%rsi), %VEC(0)
> > @@ -470,11 +533,6 @@ L(more_8x_vec_backward):
> >       subq    %r8, %r9
> >       /* Adjust length.  */
> >       subq    %r8, %rdx
> > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > -     /* Check non-temporal store threshold.  */
> > -     cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > -     ja      L(large_backward)
> > -#endif
> >  L(loop_4x_vec_backward):
> >       /* Copy 4 * VEC a time backward.  */
> >       VMOVU   (%rcx), %VEC(0)
> > @@ -500,72 +558,197 @@ L(loop_4x_vec_backward):
> >       VZEROUPPER_RETURN
> >
> >  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > -L(large_forward):
> > +L(large_memcpy_2x):
> > +     /* Compute absolute value of difference between source and
> > +        destination.  */
> > +     movq    %rdi, %r9
> > +     subq    %rsi, %r9
> > +     movq    %r9, %r8
> > +     leaq    -1(%r9), %rcx
> > +     sarq    $63, %r8
> > +     xorq    %r8, %r9
> > +     subq    %r8, %r9
> >       /* Don't use non-temporal store if there is overlap between
> > -        destination and source since destination may be in cache
> > -        when source is loaded.  */
> > -     leaq    (%rdi, %rdx), %r10
> > -     cmpq    %r10, %rsi
> > -     jb      L(loop_4x_vec_forward)
> > -L(loop_large_forward):
> > +        destination and source since destination may be in cache when
> > +        source is loaded.  */
> > +     cmpq    %r9, %rdx
> > +     ja      L(more_8x_vec_check)
> > +
> > +     /* Cache align destination. First store the first 64 bytes then
> > +        adjust alignments.  */
> > +     VMOVU   (%rsi), %VEC(8)
> > +#if VEC_SIZE < 64
> > +     VMOVU   VEC_SIZE(%rsi), %VEC(9)
> > +#if VEC_SIZE < 32
> > +     VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(10)
> > +     VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(11)
> > +#endif
> > +#endif
> > +     VMOVU   %VEC(8), (%rdi)
> > +#if VEC_SIZE < 64
> > +     VMOVU   %VEC(9), VEC_SIZE(%rdi)
> > +#if VEC_SIZE < 32
> > +     VMOVU   %VEC(10), (VEC_SIZE * 2)(%rdi)
> > +     VMOVU   %VEC(11), (VEC_SIZE * 3)(%rdi)
> > +#endif
> > +#endif
> > +     /* Adjust source, destination, and size.  */
> > +     movq    %rdi, %r8
> > +     andq    $63, %r8
> > +     /* Get the negative of offset for alignment.  */
> > +     subq    $64, %r8
> > +     /* Adjust source.  */
> > +     subq    %r8, %rsi
> > +     /* Adjust destination which should be aligned now.  */
> > +     subq    %r8, %rdi
> > +     /* Adjust length.  */
> > +     addq    %r8, %rdx
> > +
> > +     /* Test if source and destination addresses will alias. If they do
> > +        the larger pipeline in large_memcpy_4x alleviated the
> > +        performance drop.  */
> > +     testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
> > +     jz      L(large_memcpy_4x)
> > +
> > +     movq    %rdx, %r10
> > +     shrq    $LOG_4X_MEMCPY_THRESH, %r10
> > +     cmp     __x86_shared_non_temporal_threshold(%rip), %r10
> > +     jae     L(large_memcpy_4x)
> > +
> > +     /* edx will store remainder size for copying tail.  */
> > +     andl    $(PAGE_SIZE * 2 - 1), %edx
> > +     /* r10 stores outer loop counter.  */
> > +     shrq    $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
> > +     /* Copy 4x VEC at a time from 2 pages.  */
> > +L(loop_large_memcpy_2x_outer):
> > +     /* ecx stores inner loop counter.  */
> > +     movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > +L(loop_large_memcpy_2x_inner):
> > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
> > +     /* Load vectors from rsi.  */
> > +     LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > +     LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +     addq    $LARGE_LOAD_SIZE, %rsi
> > +     /* Non-temporal store vectors to rdi.  */
> > +     STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > +     STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +     addq    $LARGE_LOAD_SIZE, %rdi
> > +     decl    %ecx
> > +     jnz     L(loop_large_memcpy_2x_inner)
> > +     addq    $PAGE_SIZE, %rdi
> > +     addq    $PAGE_SIZE, %rsi
> > +     decq    %r10
> > +     jne     L(loop_large_memcpy_2x_outer)
> > +
> > +     /* Check if only last 4 loads are needed.  */
> > +     cmpl    $(VEC_SIZE * 4), %edx
> > +     jbe     L(large_memcpy_2x_end)
> > +
> > +     /* Handle the last 2 * PAGE_SIZE bytes.  */
> > +L(loop_large_memcpy_2x_tail):
> >       /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > -     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > -     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
> > +     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> >       VMOVU   (%rsi), %VEC(0)
> >       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> >       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> >       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > -     addq    $PREFETCHED_LOAD_SIZE, %rsi
> > -     subq    $PREFETCHED_LOAD_SIZE, %rdx
> > +     addq    $(VEC_SIZE * 4), %rsi
> > +     subl    $(VEC_SIZE * 4), %edx
> >       VMOVNT  %VEC(0), (%rdi)
> >       VMOVNT  %VEC(1), VEC_SIZE(%rdi)
> >       VMOVNT  %VEC(2), (VEC_SIZE * 2)(%rdi)
> >       VMOVNT  %VEC(3), (VEC_SIZE * 3)(%rdi)
> > -     addq    $PREFETCHED_LOAD_SIZE, %rdi
> > -     cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> > -     ja      L(loop_large_forward)
> > +     addq    $(VEC_SIZE * 4), %rdi
> > +     cmpl    $(VEC_SIZE * 4), %edx
> > +     ja      L(loop_large_memcpy_2x_tail)
> > +
> > +L(large_memcpy_2x_end):
> >       sfence
> >       /* Store the last 4 * VEC.  */
> > -     VMOVU   %VEC(5), (%rcx)
> > -     VMOVU   %VEC(6), -VEC_SIZE(%rcx)
> > -     VMOVU   %VEC(7), -(VEC_SIZE * 2)(%rcx)
> > -     VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
> > -     /* Store the first VEC.  */
> > -     VMOVU   %VEC(4), (%r11)
> > +     VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > +     VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > +     VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > +     VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > +
> > +     VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > +     VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > +     VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +     VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> >       VZEROUPPER_RETURN
> >
> > -L(large_backward):
> > -     /* Don't use non-temporal store if there is overlap between
> > -        destination and source since destination may be in cache
> > -        when source is loaded.  */
> > -     leaq    (%rcx, %rdx), %r10
> > -     cmpq    %r10, %r9
> > -     jb      L(loop_4x_vec_backward)
> > -L(loop_large_backward):
> > -     /* Copy 4 * VEC a time backward with non-temporal stores.  */
> > -     PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
> > -     PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
> > -     VMOVU   (%rcx), %VEC(0)
> > -     VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> > -     VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> > -     VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> > -     subq    $PREFETCHED_LOAD_SIZE, %rcx
> > -     subq    $PREFETCHED_LOAD_SIZE, %rdx
> > -     VMOVNT  %VEC(0), (%r9)
> > -     VMOVNT  %VEC(1), -VEC_SIZE(%r9)
> > -     VMOVNT  %VEC(2), -(VEC_SIZE * 2)(%r9)
> > -     VMOVNT  %VEC(3), -(VEC_SIZE * 3)(%r9)
> > -     subq    $PREFETCHED_LOAD_SIZE, %r9
> > -     cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> > -     ja      L(loop_large_backward)
> > +L(large_memcpy_4x):
> > +     movq    %rdx, %r10
> > +     /* edx will store remainder size for copying tail.  */
> > +     andl    $(PAGE_SIZE * 4 - 1), %edx
> > +     /* r10 stores outer loop counter.  */
> > +     shrq    $(LOG_PAGE_SIZE + 2), %r10
> > +     /* Copy 4x VEC at a time from 4 pages.  */
> > +L(loop_large_memcpy_4x_outer):
> > +     /* ecx stores inner loop counter.  */
> > +     movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > +L(loop_large_memcpy_4x_inner):
> > +     /* Only one prefetch set per page as doing 4 pages give more time
> > +        for prefetcher to keep up.  */
> > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
> > +     /* Load vectors from rsi.  */
> > +     LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > +     LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +     LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > +     LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > +     addq    $LARGE_LOAD_SIZE, %rsi
> > +     /* Non-temporal store vectors to rdi.  */
> > +     STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > +     STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +     STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > +     STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > +     addq    $LARGE_LOAD_SIZE, %rdi
> > +     decl    %ecx
> > +     jnz     L(loop_large_memcpy_4x_inner)
> > +     addq    $(PAGE_SIZE * 3), %rdi
> > +     addq    $(PAGE_SIZE * 3), %rsi
> > +     decq    %r10
> > +     jne     L(loop_large_memcpy_4x_outer)
> > +
> > +     /* Check if only last 4 loads are needed.  */
> > +     cmpl    $(VEC_SIZE * 4), %edx
> > +     jbe     L(large_memcpy_4x_end)
> > +
> > +     /* Handle the last 4  * PAGE_SIZE bytes.  */
> > +L(loop_large_memcpy_4x_tail):
> > +     /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > +     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> > +     VMOVU   (%rsi), %VEC(0)
> > +     VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +     VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > +     VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > +     addq    $(VEC_SIZE * 4), %rsi
> > +     subl    $(VEC_SIZE * 4), %edx
> > +     VMOVNT  %VEC(0), (%rdi)
> > +     VMOVNT  %VEC(1), VEC_SIZE(%rdi)
> > +     VMOVNT  %VEC(2), (VEC_SIZE * 2)(%rdi)
> > +     VMOVNT  %VEC(3), (VEC_SIZE * 3)(%rdi)
> > +     addq    $(VEC_SIZE * 4), %rdi
> > +     cmpl    $(VEC_SIZE * 4), %edx
> > +     ja      L(loop_large_memcpy_4x_tail)
> > +
> > +L(large_memcpy_4x_end):
> >       sfence
> > -     /* Store the first 4 * VEC.  */
> > -     VMOVU   %VEC(4), (%rdi)
> > -     VMOVU   %VEC(5), VEC_SIZE(%rdi)
> > -     VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
> > -     VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
> > -     /* Store the last VEC.  */
> > -     VMOVU   %VEC(8), (%r11)
> > +     /* Store the last 4 * VEC.  */
> > +     VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > +     VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > +     VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > +     VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > +
> > +     VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > +     VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > +     VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +     VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> >       VZEROUPPER_RETURN
> >  #endif
> >  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> > --
> > 2.29.2
> >
>
> I am comparing some __memmove_avx_unaligned_erms numbers in
> bench-memmove-large on Tiger Lake.  Most numbers are similar,
> except for one case.
>
> Without this patch:
>
>    length=32775, align1=0, align2=64:       455.62
>
> With this patch,
>
>    length=32775, align1=0, align2=64:      1613.81
>
> Did you see similar results?
>

Somewhat. (These results are from Ice Lake.)
The numbers below are from 100 runs.

        Mean    Median  Min     Max
New,    1131.8, 822.0,  799.5,  2317.6
Cur,    995.0,  852.7,  818.0,  1691.9

So this patch is definitely more volatile (see the Max). I think the issue
is actually loop-alignment related: I was able to see the same
volatility in the current version if I nop-padded before the loop. Do
you still see these results if you throw a .p2align 4 in front of the
loop?
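
Concretely, what I have in mind is just an alignment directive in front of
the inner copy loops in memmove-vec-unaligned-erms.S. A minimal, untested
sketch (label names taken from this patch; .p2align 4 pads with nops up to
the next 16-byte boundary):

	...
	/* Keep the hot inner loop 16-byte aligned so its placement does
	   not depend on how much code happens to precede it.  */
	.p2align 4
L(loop_large_memcpy_2x_inner):
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	...

and the same before L(loop_large_memcpy_4x_inner).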



> H.J.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v6 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
  2021-04-02 23:09   ` Noah Goldstein
@ 2021-04-02 23:15     ` H.J. Lu
  0 siblings, 0 replies; 7+ messages in thread
From: H.J. Lu @ 2021-04-02 23:15 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Fri, Apr 2, 2021 at 4:10 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Apr 2, 2021 at 6:09 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Apr 02, 2021 at 04:26:42PM -0400, Noah Goldstein wrote:
> > > [... full patch body trimmed; it is quoted in full in the previous message ...]
> >
> > I am comparing some __memmove_avx_unaligned_erms numbers in
> > bench-memmove-large on Tiger Lake.  Most numbers are similar,
> > except for one case.
> >
> > Without this patch:
> >
> >    length=32775, align1=0, align2=64:       455.62
> >
> > With this patch,
> >
> >    length=32775, align1=0, align2=64:      1613.81
> >
> > Did you see similar results?
> >
>
> Somewhat. (These results are from Ice Lake.)
> The numbers below are from 100 runs.
>
>         Mean    Median  Min     Max
> New,    1131.8, 822.0,  799.5,  2317.6
> Cur,    995.0,  852.7,  818.0,  1691.9
>
> So this patch is definitely more volatile (see the Max). I think the issue
> is actually loop-alignment related: I was able to see the same
> volatility in the current version if I nop-padded before the loop. Do
> you still see these results if you throw a .p2align 4 in front of the
> loop?

Please update your patch so that it doesn't introduce performance
regressions in the glibc benchmarks.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-04-02 23:15 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-02 20:26 [PATCH v6 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S Noah Goldstein
2021-04-02 20:26 ` [PATCH v6 2/2] x86: Expanding test-memmove.c, test-memcpy.c, bench-memcpy-large.c Noah Goldstein
2021-04-02 20:35   ` H.J. Lu
2021-04-02 21:35     ` Noah Goldstein
2021-04-02 22:08 ` [PATCH v6 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S H.J. Lu
2021-04-02 23:09   ` Noah Goldstein
2021-04-02 23:15     ` H.J. Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).