public inbox for libc-alpha@sourceware.org
* [PATCH v4 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
@ 2021-03-30 21:38 noah
  2021-03-30 21:38 ` [PATCH v4 2/2] x86: Expanding test-memmove.c, test-memcpy.c, bench-memcpy-large.c noah
  2021-04-02  1:53 ` [PATCH v4 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S H.J. Lu
  0 siblings, 2 replies; 4+ messages in thread
From: noah @ 2021-03-30 21:38 UTC (permalink / raw)
  To: libc-alpha

No Bug. This commit updates the large memcpy case (no overlap). The
update is to perform memcpy on either 2 or 4 contiguous pages at
once. This 1) helps to alleviate the effects of false memory aliasing
when destination and source have a close 4k alignment and 2) is, in
most cases and for most DRAM units, a modestly more efficient access
pattern. These changes are a clear performance improvement for
VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
pass.
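
For reference, here is a minimal C sketch of the path selection
described above. It is illustrative only: the real logic is written in
assembly in memmove-vec-unaligned-erms.S, the names and constants below
are made up for exposition, and the aliasing test approximates the one
the assembly performs on the pointer difference.

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32                /* e.g. the AVX2 build.  */
#define LOG_4X_MEMCPY_THRESH 4     /* 4x path once size >= 16 * nt_threshold.  */

enum copy_path
{
  PATH_TEMPORAL,      /* regular 4x-VEC forward/backward loops  */
  PATH_NT_2X_PAGES,   /* non-temporal stores, 2 pages per outer iteration  */
  PATH_NT_4X_PAGES    /* non-temporal stores, 4 pages per outer iteration  */
};

enum copy_path
choose_path (const char *dst, const char *src, size_t size,
             size_t nt_threshold)
{
  uintptr_t d = (uintptr_t) dst, s = (uintptr_t) src;
  uintptr_t abs_diff = d > s ? d - s : s - d;

  /* Small copies, and overlapping copies (destination may already be
     in cache when the source is loaded), stay on the regular temporal
     loops.  */
  if (size <= nt_threshold || size > abs_diff)
    return PATH_TEMPORAL;

  /* "Page aliasing": destination and source page offsets differ by
     less than 8 * VEC_SIZE, so 4k false aliasing is likely.  */
  int alias = ((d - s) & (PAGE_SIZE - 1)) < 8 * VEC_SIZE;

  /* Very large or page-aliasing copies use the 4-page loop; everything
     else uses the 2-page loop.  */
  if (alias || (size >> LOG_4X_MEMCPY_THRESH) >= nt_threshold)
    return PATH_NT_4X_PAGES;

  return PATH_NT_2X_PAGES;
}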

Signed-off-by: noah <goldstein.w.n@gmail.com>
---
 .../multiarch/memmove-vec-unaligned-erms.S    | 326 ++++++++++++++----
 1 file changed, 258 insertions(+), 68 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 897a3d9762..dae3e2bac5 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -35,7 +35,16 @@
       __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
    7. If size >= __x86_shared_non_temporal_threshold and there is no
       overlap between destination and source, use non-temporal store
-      instead of aligned store.  */
+      instead of aligned store copying from either 2 or 4 pages at
+      once.
+   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
+      and source and destination do not page alias, copy from 2 pages
+      at once using non-temporal stores. Page aliasing in this case is
+      considered true if destination's page alignment - sources' page
+      alignment is less than 8 * VEC_SIZE.
+   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
+      and destination do page alias copy from 4 pages at once using
+      non-temporal stores.  */
 
 #include <sysdep.h>
 
@@ -67,6 +76,35 @@
 # endif
 #endif
 
+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+    
+#if PAGE_SIZE != 4096
+# error Unsupported PAGE_SIZE
+#endif
+    
+#ifndef LOG_PAGE_SIZE
+# define LOG_PAGE_SIZE 12    
+#endif
+    
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
+# error Invalid LOG_PAGE_SIZE    
+#endif
+
+/* Byte per page for large_memcpy inner loop.  */    
+#if VEC_SIZE == 64
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
+#else
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
+#endif
+
+    
+/* Amount to shift rdx by to compare for memcpy_large_4x.  */    
+#ifndef LOG_4X_MEMCPY_THRESH
+# define LOG_4X_MEMCPY_THRESH 4
+#endif
+    
 /* Avoid short distance rep movsb only with non-SSE vector.  */
 #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
 # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
@@ -106,6 +144,28 @@
 # error Unsupported PREFETCH_SIZE!
 #endif
 
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
+    VMOVU   (offset)base, vec0; \
+    VMOVU   ((offset) + VEC_SIZE)base, vec1;
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
+    VMOVNT  vec0, (offset)base; \
+    VMOVNT  vec1, ((offset) + VEC_SIZE)base;
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+    VMOVU   (offset)base, vec0; \
+    VMOVU   ((offset) + VEC_SIZE)base, vec1; \
+    VMOVU   ((offset) + VEC_SIZE * 2)base, vec2; \
+    VMOVU   ((offset) + VEC_SIZE * 3)base, vec3; 
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+    VMOVNT  vec0, (offset)base; \
+    VMOVNT  vec1, ((offset) + VEC_SIZE)base; \
+    VMOVNT  vec2, ((offset) + VEC_SIZE * 2)base; \
+    VMOVNT  vec3, ((offset) + VEC_SIZE * 3)base;
+#else
+# error Invalid LARGE_LOAD_SIZE
+#endif
+    
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -393,6 +453,15 @@ L(last_4x_vec):
 	VZEROUPPER_RETURN
 
 L(more_8x_vec):
+    /* Check if non-temporal move candidate.  */    
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+	/* Check non-temporal store threshold.  */
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	ja	L(large_memcpy_2x)
+#endif
+    /* Entry if rdx is greater than non-temporal threshold but there
+       is overlap.  */
+L(more_8x_vec_check):    
 	cmpq	%rsi, %rdi
 	ja	L(more_8x_vec_backward)
 	/* Source == destination is less common.  */
@@ -419,11 +488,6 @@ L(more_8x_vec):
 	subq	%r8, %rdi
 	/* Adjust length.  */
 	addq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_forward)
-#endif
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
 	VMOVU	(%rsi), %VEC(0)
@@ -470,11 +534,6 @@ L(more_8x_vec_backward):
 	subq	%r8, %r9
 	/* Adjust length.  */
 	subq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_backward)
-#endif
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
 	VMOVU	(%rcx), %VEC(0)
@@ -500,72 +559,203 @@ L(loop_4x_vec_backward):
 	VZEROUPPER_RETURN
 
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
+L(large_memcpy_2x):
+	/* Compute absolute value of difference between source and
+	   destination.  */
+	movq    %rdi, %r9
+	subq    %rsi, %r9
+	movq    %r9, %r8
+	leaq    -1(%r9), %rcx
+	sarq    $63, %r8
+	xorq    %r8, %r9
+	subq    %r8, %r9
 	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rdi, %rdx), %r10
-	cmpq    %r10, %rsi
-	jb	L(loop_4x_vec_forward)
-L(loop_large_forward):
-	/* Copy 4 * VEC a time forward with non-temporal stores.  */
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+	   destination and source since destination may be in cache when
+	   source is loaded.  */
+	cmpq    %r9, %rdx
+	ja L(more_8x_vec_check)
+
+	/* Cache align destination. First store the first 64 bytes then
+	   adjust alignments.  */
+	VMOVU	(%rsi), %VEC(8)
+#if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(9)
+#if VEC_SIZE < 32    
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
+#endif
+#endif
+	VMOVU	%VEC(8), (%rdi)
+#if VEC_SIZE < 64
+	VMOVU	%VEC(9), VEC_SIZE(%rdi)
+#if VEC_SIZE < 32    
+	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
+#endif
+#endif
+	/* Adjust source, destination, and size.  */
+	MOVQ	%rdi, %r8
+	andq	$63, %r8
+	/* Get the negative of offset for alignment.  */
+	subq	$64, %r8
+	/* Adjust source.  */
+	subq	%r8, %rsi
+	/* Adjust destination which should be aligned now.  */
+	subq	%r8, %rdi
+	/* Adjust length.  */
+	addq	%r8, %rdx
+
+	/* Test if source and destination addresses will alias. If they do
+	   the larger pipeline in large_memcpy_4x alleviated the
+	   performance drop.  */
+	testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
+	jz  L(large_memcpy_4x)    
+
+	movq    %rdx, %r10
+	shrq    $LOG_4X_MEMCPY_THRESH, %r10
+	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
+	jae L(large_memcpy_4x)
+
+	/* edx will store remainder size for copying tail.  */
+	andl    $(PAGE_SIZE * 2 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq    $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+	/* Copy 4x VEC at a time from 2 pages.  */
+	.p2align 4
+L(loop_large_memcpy_2x_outer):
+	/* ecx stores inner loop counter.  */
+	movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_2x_inner):
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	addq	$LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	addq    $LARGE_LOAD_SIZE, %rdi
+	decl    %ecx
+	jnz L(loop_large_memcpy_2x_inner)
+	addq    $PAGE_SIZE, %rdi
+	addq    $PAGE_SIZE, %rsi
+	decq    %r10
+	jne L(loop_large_memcpy_2x_outer)
+	sfence
+
+	/* Check if only last 4 loads are needed.  */
+	cmpl    $(VEC_SIZE * 4), %edx
+	jbe L(large_memcpy_2x_end)
+
+	/* Handle the last 2 * PAGE_SIZE bytes. Use temporal stores
+	   here. The region will fit in cache and it should fit user
+	   expectations for the tail of the memcpy region to be hot.  */
+L(loop_large_memcpy_2x_tail):
+	/* Copy 4 * VEC a time forward with temporal stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$PREFETCHED_LOAD_SIZE, %rsi
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%rdi)
-	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
-	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$PREFETCHED_LOAD_SIZE, %rdi
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_forward)
-	sfence
+	addq	$(VEC_SIZE * 4), %rsi
+	subl    $(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	addq	$(VEC_SIZE * 4), %rdi
+	cmpl    $(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_2x_tail)
+
+L(large_memcpy_2x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
-	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
-L(large_backward):
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rcx, %rdx), %r10
-	cmpq    %r10, %r9
-	jb	L(loop_4x_vec_backward)
-L(loop_large_backward):
-	/* Copy 4 * VEC a time backward with non-temporal stores.  */
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$PREFETCHED_LOAD_SIZE, %rcx
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%r9)
-	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
-	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$PREFETCHED_LOAD_SIZE, %r9
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_backward)
+L(large_memcpy_4x):
+	movq    %rdx, %r10
+	/* edx will store remainder size for copying tail.  */
+	andl    $(PAGE_SIZE * 4 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq    $(LOG_PAGE_SIZE + 2), %r10
+	/* Copy 4x VEC at a time from 4 pages.  */
+	.p2align 4
+L(loop_large_memcpy_4x_outer):
+	/* ecx stores inner loop counter.  */
+	movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_4x_inner):
+	/* Only one prefetch set per page as doing 4 pages give more time
+	   for prefetcher to keep up.  */
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	addq	$LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	addq    $LARGE_LOAD_SIZE, %rdi
+	decl    %ecx
+	jnz L(loop_large_memcpy_4x_inner)
+	addq    $(PAGE_SIZE * 3), %rdi
+	addq    $(PAGE_SIZE * 3), %rsi
+	decq    %r10
+	jne L(loop_large_memcpy_4x_outer)
 	sfence
-	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
-	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+
+	/* Check if only last 4 loads are needed.  */
+	cmpl    $(VEC_SIZE * 4), %edx
+	jbe L(large_memcpy_4x_end)
+
+	/* Handle the last 4  * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_4x_tail):
+	/* Copy 4 * VEC a time forward with temporal stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	addq	$(VEC_SIZE * 4), %rsi
+	subl    $(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	addq	$(VEC_SIZE * 4), %rdi
+	cmpl    $(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_4x_tail)
+
+L(large_memcpy_4x_end):
+	/* Store the last 4 * VEC.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-- 
2.29.2



* [PATCH v4 2/2] x86: Expanding test-memmove.c, test-memcpy.c, bench-memcpy-large.c
  2021-03-30 21:38 [PATCH v4 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S noah
@ 2021-03-30 21:38 ` noah
  2021-04-02  1:53 ` [PATCH v4 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S H.J. Lu
  1 sibling, 0 replies; 4+ messages in thread
From: noah @ 2021-03-30 21:38 UTC (permalink / raw)
  To: libc-alpha

No Bug. This commit expands the range of tests / benchmarks for
memmove and memcpy. The test expansion is mostly in the vein of
increasing the maximum size, increasing the number of unique
alignments tested, and testing both source < destination and vice
versa. The benchmark expansion is just to increase the number of
unique alignments. test-memcpy, test-memccpy, test-mempcpy,
test-memmove, and tst-memmove-overflow all pass.
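
To illustrate the "source < destination and vice versa" coverage, here
is a minimal standalone sketch of the direction swap the updated
drivers perform. memcpy stands in for the CALL (impl, ...) dispatch
used by the real tests, and the mmap/setup of the two disjoint regions
is elided.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* lo and hi are two disjoint mapped regions of n uint32_t each, with
   lo at the lower address.  */
void
copy_both_directions (uint32_t *lo, uint32_t *hi, size_t n)
{
  uint32_t *dst = lo, *src = hi;
  for (int repeats = 0; repeats < 2; ++repeats)
    {
      for (size_t i = 0; i < n; ++i)
        src[i] = (uint32_t) i;
      memcpy (dst, src, n * sizeof (uint32_t));
      for (size_t i = 0; i < n; ++i)
        if (dst[i] != (uint32_t) i)
          printf ("mismatch at offset %zu\n", i);
      /* Second pass: swap the buffers so the copy direction flips.  */
      dst = hi;
      src = lo;
    }
}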

Signed-off-by: noah <goldstein.w.n@gmail.com>
---
 benchtests/bench-memcpy-large.c |  8 +++-
 string/test-memcpy.c            | 61 ++++++++++++++++------------
 string/test-memmove.c           | 70 ++++++++++++++++++++-------------
 3 files changed, 83 insertions(+), 56 deletions(-)

diff --git a/benchtests/bench-memcpy-large.c b/benchtests/bench-memcpy-large.c
index 3df1575514..efb9627b1e 100644
--- a/benchtests/bench-memcpy-large.c
+++ b/benchtests/bench-memcpy-large.c
@@ -57,11 +57,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= 4095;
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= 4095;
   if (align2 + len >= page_size)
     return;
 
@@ -113,6 +113,10 @@ test_main (void)
       do_test (&json_ctx, 0, 3, i + 15);
       do_test (&json_ctx, 3, 0, i + 31);
       do_test (&json_ctx, 3, 5, i + 63);
+      do_test (&json_ctx, 0, 127, i);
+      do_test (&json_ctx, 0, 255, i);
+      do_test (&json_ctx, 0, 256, i);
+      do_test (&json_ctx, 0, 4064, i);
     }
 
   json_array_end (&json_ctx);
diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index 2e9c6bd099..c9dfc88fed 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -82,11 +82,11 @@ do_test (size_t align1, size_t align2, size_t len)
   size_t i, j;
   char *s1, *s2;
 
-  align1 &= 63;
+  align1 &= 4095;
   if (align1 + len >= page_size)
     return;
 
-  align2 &= 63;
+  align2 &= 4095;
   if (align2 + len >= page_size)
     return;
 
@@ -213,11 +213,9 @@ do_random_tests (void)
 }
 
 static void
-do_test1 (void)
+do_test1 (size_t size)
 {
-  size_t size = 0x100000;
   void *large_buf;
-
   large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE,
 		    MAP_PRIVATE | MAP_ANON, -1, 0);
   if (large_buf == MAP_FAILED)
@@ -233,27 +231,32 @@ do_test1 (void)
   uint32_t *dest = large_buf;
   uint32_t *src = large_buf + size + page_size;
   size_t i;
-
-  for (i = 0; i < arrary_size; i++)
-    src[i] = (uint32_t) i;
-
-  FOR_EACH_IMPL (impl, 0)
+  size_t repeats;
+  for(repeats = 0; repeats < 2; repeats++)
     {
-      memset (dest, -1, size);
-      CALL (impl, (char *) dest, (char *) src, size);
       for (i = 0; i < arrary_size; i++)
-	if (dest[i] != src[i])
-	  {
-	    error (0, 0,
-		   "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
-		   impl->name, dest, src, i);
-	    ret = 1;
-	    break;
-	  }
+        src[i] = (uint32_t) i;
+
+      FOR_EACH_IMPL (impl, 0)
+        {
+            printf ("\t\tRunning: %s\n", impl->name);
+          memset (dest, -1, size);
+          CALL (impl, (char *) dest, (char *) src, size);
+          for (i = 0; i < arrary_size; i++)
+        if (dest[i] != src[i])
+          {
+            error (0, 0,
+               "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
+               impl->name, dest, src, i);
+            ret = 1;
+            munmap ((void *) large_buf, size * 2 + page_size);
+            return;
+          }
+        }
+      dest = src;
+      src = large_buf;
     }
-
-  munmap ((void *) dest, size);
-  munmap ((void *) src, size);
+  munmap ((void *) large_buf, size * 2 + page_size);
 }
 
 int
@@ -275,7 +278,6 @@ test_main (void)
       do_test (0, i, 1 << i);
       do_test (i, i, 1 << i);
     }
-
   for (i = 0; i < 32; ++i)
     {
       do_test (0, 0, i);
@@ -294,12 +296,19 @@ test_main (void)
       do_test (i, i, 16 * i);
     }
 
+  for (i = 19; i <= 25; ++i)
+    {
+      do_test (255, 0, 1 << i);
+      do_test (0, 255, i);
+      do_test (0, 4000, i);
+    }
+
   do_test (0, 0, getpagesize ());
 
   do_random_tests ();
 
-  do_test1 ();
-
+  do_test1 (0x100000);
+  do_test1 (0x2000000);
   return ret;
 }
 
diff --git a/string/test-memmove.c b/string/test-memmove.c
index 2e3ce75b9b..ff8099d12f 100644
--- a/string/test-memmove.c
+++ b/string/test-memmove.c
@@ -247,7 +247,7 @@ do_random_tests (void)
 }
 
 static void
-do_test2 (void)
+do_test2 (size_t offset)
 {
   size_t size = 0x20000000;
   uint32_t * large_buf;
@@ -268,33 +268,45 @@ do_test2 (void)
     }
 
   size_t bytes_move = 0x80000000 - (uintptr_t) large_buf;
+  if (bytes_move + offset * sizeof (uint32_t) > size)
+    {
+      munmap ((void *) large_buf, size);
+      return;
+    }
   size_t arr_size = bytes_move / sizeof (uint32_t);
   size_t i;
-
-  FOR_EACH_IMPL (impl, 0)
-    {
-      for (i = 0; i < arr_size; i++)
-        large_buf[i] = (uint32_t) i;
-
-      uint32_t * dst = &large_buf[33];
-
-#ifdef TEST_BCOPY
-      CALL (impl, (char *) large_buf, (char *) dst, bytes_move);
-#else
-      CALL (impl, (char *) dst, (char *) large_buf, bytes_move);
-#endif
-
-      for (i = 0; i < arr_size; i++)
-	{
-	  if (dst[i] != (uint32_t) i)
-	    {
-	      error (0, 0,
-		     "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
-		     impl->name, dst, large_buf, i);
-	      ret = 1;
-	      break;
-	    }
-	}
+  size_t repeats;
+  uint32_t * src = large_buf;
+  uint32_t * dst = &large_buf[offset];
+  for (repeats = 0; repeats < 2; ++repeats)
+    {      
+      FOR_EACH_IMPL (impl, 0)
+        {
+          for (i = 0; i < arr_size; i++)
+            src[i] = (uint32_t) i;
+
+
+    #ifdef TEST_BCOPY
+          CALL (impl, (char *) src, (char *) dst, bytes_move);
+    #else
+          CALL (impl, (char *) dst, (char *) src, bytes_move);
+    #endif
+
+          for (i = 0; i < arr_size; i++)
+        {
+          if (dst[i] != (uint32_t) i)
+            {
+              error (0, 0,
+                 "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"",
+                 impl->name, dst, large_buf, i);
+              ret = 1;
+              munmap ((void *) large_buf, size);
+              return;
+            }
+        }
+        }
+      src = dst;
+      dst = large_buf;
     }
 
   munmap ((void *) large_buf, size);
@@ -340,8 +352,10 @@ test_main (void)
 
   do_random_tests ();
 
-  do_test2 ();
-
+  do_test2 (33);
+  do_test2 (0x200000);
+  do_test2 (0x4000000 - 1);
+  do_test2 (0x4000000);
   return ret;
 }
 
-- 
2.29.2



* Re: [PATCH v4 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
  2021-03-30 21:38 [PATCH v4 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S noah
  2021-03-30 21:38 ` [PATCH v4 2/2] x86: Expanding test-memmove.c, test-memcpy.c, bench-memcpy-large.c noah
@ 2021-04-02  1:53 ` H.J. Lu
  2021-04-02  4:15   ` Noah Goldstein
  1 sibling, 1 reply; 4+ messages in thread
From: H.J. Lu @ 2021-04-02  1:53 UTC (permalink / raw)
  To: noah; +Cc: GNU C Library, Carlos O'Donell

On Tue, Mar 30, 2021 at 2:38 PM noah <goldstein.w.n@gmail.com> wrote:
>
> No Bug. This commit updates the large memcpy case (no overlap). The
> update is to perform memcpy on either 2 or 4 contiguous pages at
> once. This 1) helps to alleviate the effects of false memory aliasing
> when destination and source have a close 4k alignment and 2) is, in
> most cases and for most DRAM units, a modestly more efficient access
> pattern. These changes are a clear performance improvement for
> VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
> test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
> pass.
>
> Signed-off-by: noah <goldstein.w.n@gmail.com>
> ---
>  .../multiarch/memmove-vec-unaligned-erms.S    | 326 ++++++++++++++----
>  1 file changed, 258 insertions(+), 68 deletions(-)

1. There are many trailing whitespaces.
2. Replace "jccSPACElabel" with "jccTABlabel".
3. Replace "insnSPACEoperand" with "insnTABoperand" if needed.

> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index 897a3d9762..dae3e2bac5 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -35,7 +35,16 @@
>        __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
>     7. If size >= __x86_shared_non_temporal_threshold and there is no
>        overlap between destination and source, use non-temporal store
> -      instead of aligned store.  */
> +      instead of aligned store copying from either 2 or 4 pages at
> +      once.
> +   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
> +      and source and destination do not page alias, copy from 2 pages
> +      at once using non-temporal stores. Page aliasing in this case is
> +      considered true if destination's page alignment - sources' page
> +      alignment is less than 8 * VEC_SIZE.
> +   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
> +      and destination do page alias copy from 4 pages at once using
> +      non-temporal stores.  */
>
>  #include <sysdep.h>
>
> @@ -67,6 +76,35 @@
>  # endif
>  #endif
>
> +#ifndef PAGE_SIZE
> +# define PAGE_SIZE 4096
> +#endif
> +
> +#if PAGE_SIZE != 4096
> +# error Unsupported PAGE_SIZE
> +#endif
> +
> +#ifndef LOG_PAGE_SIZE
> +# define LOG_PAGE_SIZE 12
> +#endif
> +
> +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
> +# error Invalid LOG_PAGE_SIZE
> +#endif
> +
> +/* Byte per page for large_memcpy inner loop.  */
> +#if VEC_SIZE == 64
> +# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
> +#else
> +# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
> +#endif
> +
> +
> +/* Amount to shift rdx by to compare for memcpy_large_4x.  */
> +#ifndef LOG_4X_MEMCPY_THRESH
> +# define LOG_4X_MEMCPY_THRESH 4
> +#endif
> +
>  /* Avoid short distance rep movsb only with non-SSE vector.  */
>  #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
>  # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
> @@ -106,6 +144,28 @@
>  # error Unsupported PREFETCH_SIZE!
>  #endif
>
> +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
> +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
> +    VMOVU   (offset)base, vec0; \
> +    VMOVU   ((offset) + VEC_SIZE)base, vec1;
> +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
> +    VMOVNT  vec0, (offset)base; \
> +    VMOVNT  vec1, ((offset) + VEC_SIZE)base;
> +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
> +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> +    VMOVU   (offset)base, vec0; \
> +    VMOVU   ((offset) + VEC_SIZE)base, vec1; \
> +    VMOVU   ((offset) + VEC_SIZE * 2)base, vec2; \
> +    VMOVU   ((offset) + VEC_SIZE * 3)base, vec3;
> +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> +    VMOVNT  vec0, (offset)base; \
> +    VMOVNT  vec1, ((offset) + VEC_SIZE)base; \
> +    VMOVNT  vec2, ((offset) + VEC_SIZE * 2)base; \
> +    VMOVNT  vec3, ((offset) + VEC_SIZE * 3)base;
> +#else
> +# error Invalid LARGE_LOAD_SIZE
> +#endif
> +
>  #ifndef SECTION
>  # error SECTION is not defined!
>  #endif
> @@ -393,6 +453,15 @@ L(last_4x_vec):
>         VZEROUPPER_RETURN
>
>  L(more_8x_vec):
> +    /* Check if non-temporal move candidate.  */
> +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> +       /* Check non-temporal store threshold.  */
> +       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> +       ja      L(large_memcpy_2x)
> +#endif
> +    /* Entry if rdx is greater than non-temporal threshold but there
> +       is overlap.  */
> +L(more_8x_vec_check):
>         cmpq    %rsi, %rdi
>         ja      L(more_8x_vec_backward)
>         /* Source == destination is less common.  */
> @@ -419,11 +488,6 @@ L(more_8x_vec):
>         subq    %r8, %rdi
>         /* Adjust length.  */
>         addq    %r8, %rdx
> -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -       /* Check non-temporal store threshold.  */
> -       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> -       ja      L(large_forward)
> -#endif
>  L(loop_4x_vec_forward):
>         /* Copy 4 * VEC a time forward.  */
>         VMOVU   (%rsi), %VEC(0)
> @@ -470,11 +534,6 @@ L(more_8x_vec_backward):
>         subq    %r8, %r9
>         /* Adjust length.  */
>         subq    %r8, %rdx
> -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -       /* Check non-temporal store threshold.  */
> -       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> -       ja      L(large_backward)
> -#endif
>  L(loop_4x_vec_backward):
>         /* Copy 4 * VEC a time backward.  */
>         VMOVU   (%rcx), %VEC(0)
> @@ -500,72 +559,203 @@ L(loop_4x_vec_backward):
>         VZEROUPPER_RETURN
>
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -L(large_forward):
> +L(large_memcpy_2x):
> +       /* Compute absolute value of difference between source and
> +          destination.  */
> +       movq    %rdi, %r9
> +       subq    %rsi, %r9
> +       movq    %r9, %r8
> +       leaq    -1(%r9), %rcx
> +       sarq    $63, %r8
> +       xorq    %r8, %r9
> +       subq    %r8, %r9
>         /* Don't use non-temporal store if there is overlap between
> -          destination and source since destination may be in cache
> -          when source is loaded.  */
> -       leaq    (%rdi, %rdx), %r10
> -       cmpq    %r10, %rsi
> -       jb      L(loop_4x_vec_forward)
> -L(loop_large_forward):
> -       /* Copy 4 * VEC a time forward with non-temporal stores.  */
> -       PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> -       PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
> +          destination and source since destination may be in cache when
> +          source is loaded.  */
> +       cmpq    %r9, %rdx
> +       ja L(more_8x_vec_check)
> +
> +       /* Cache align destination. First store the first 64 bytes then
> +          adjust alignments.  */
> +       VMOVU   (%rsi), %VEC(8)
> +#if VEC_SIZE < 64
> +       VMOVU   VEC_SIZE(%rsi), %VEC(9)
> +#if VEC_SIZE < 32
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(10)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(11)
> +#endif
> +#endif
> +       VMOVU   %VEC(8), (%rdi)
> +#if VEC_SIZE < 64
> +       VMOVU   %VEC(9), VEC_SIZE(%rdi)
> +#if VEC_SIZE < 32
> +       VMOVU   %VEC(10), (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VEC(11), (VEC_SIZE * 3)(%rdi)
> +#endif
> +#endif
> +       /* Adjust source, destination, and size.  */
> +       MOVQ    %rdi, %r8

movq

> +       andq    $63, %r8
> +       /* Get the negative of offset for alignment.  */
> +       subq    $64, %r8
> +       /* Adjust source.  */
> +       subq    %r8, %rsi
> +       /* Adjust destination which should be aligned now.  */
> +       subq    %r8, %rdi
> +       /* Adjust length.  */
> +       addq    %r8, %rdx
> +
> +       /* Test if source and destination addresses will alias. If they do
> +          the larger pipeline in large_memcpy_4x alleviated the
> +          performance drop.  */
> +       testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
> +       jz  L(large_memcpy_4x)
> +
> +       movq    %rdx, %r10
> +       shrq    $LOG_4X_MEMCPY_THRESH, %r10
> +       cmp     __x86_shared_non_temporal_threshold(%rip), %r10
> +       jae L(large_memcpy_4x)
> +
> +       /* edx will store remainder size for copying tail.  */
> +       andl    $(PAGE_SIZE * 2 - 1), %edx
> +       /* r10 stores outer loop counter.  */
> +       shrq    $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
> +       /* Copy 4x VEC at a time from 2 pages.  */
> +       .p2align 4

If you drop .p2align, will it show up on the glibc benchtest?

> +L(loop_large_memcpy_2x_outer):
> +       /* ecx stores inner loop counter.  */
> +       movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> +L(loop_large_memcpy_2x_inner):
> +       PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> +       PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> +       PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> +       PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
> +       /* Load vectors from rsi.  */
> +       LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> +       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +       addq    $LARGE_LOAD_SIZE, %rsi
> +       /* Non-temporal store vectors to rdi.  */
> +       STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> +       STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +       addq    $LARGE_LOAD_SIZE, %rdi
> +       decl    %ecx
> +       jnz L(loop_large_memcpy_2x_inner)
> +       addq    $PAGE_SIZE, %rdi
> +       addq    $PAGE_SIZE, %rsi
> +       decq    %r10
> +       jne L(loop_large_memcpy_2x_outer)
> +       sfence
> +
> +       /* Check if only last 4 loads are needed.  */
> +       cmpl    $(VEC_SIZE * 4), %edx
> +       jbe L(large_memcpy_2x_end)
> +
> +       /* Handle the last 2 * PAGE_SIZE bytes. Use temporal stores
> +          here. The region will fit in cache and it should fit user
> +          expectations for the tail of the memcpy region to be hot.  */
> +L(loop_large_memcpy_2x_tail):
> +       /* Copy 4 * VEC a time forward with temporal stores.  */
> +       PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> +       PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
>         VMOVU   (%rsi), %VEC(0)
>         VMOVU   VEC_SIZE(%rsi), %VEC(1)
>         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
>         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> -       addq    $PREFETCHED_LOAD_SIZE, %rsi
> -       subq    $PREFETCHED_LOAD_SIZE, %rdx
> -       VMOVNT  %VEC(0), (%rdi)
> -       VMOVNT  %VEC(1), VEC_SIZE(%rdi)
> -       VMOVNT  %VEC(2), (VEC_SIZE * 2)(%rdi)
> -       VMOVNT  %VEC(3), (VEC_SIZE * 3)(%rdi)
> -       addq    $PREFETCHED_LOAD_SIZE, %rdi
> -       cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> -       ja      L(loop_large_forward)
> -       sfence
> +       addq    $(VEC_SIZE * 4), %rsi
> +       subl    $(VEC_SIZE * 4), %edx
> +       VMOVA   %VEC(0), (%rdi)
> +       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> +       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +       addq    $(VEC_SIZE * 4), %rdi
> +       cmpl    $(VEC_SIZE * 4), %edx
> +       ja      L(loop_large_memcpy_2x_tail)
> +
> +L(large_memcpy_2x_end):
>         /* Store the last 4 * VEC.  */
> -       VMOVU   %VEC(5), (%rcx)
> -       VMOVU   %VEC(6), -VEC_SIZE(%rcx)
> -       VMOVU   %VEC(7), -(VEC_SIZE * 2)(%rcx)
> -       VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
> -       /* Store the first VEC.  */
> -       VMOVU   %VEC(4), (%r11)
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> +
> +       VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> +       VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> +       VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
>         VZEROUPPER_RETURN
>
> -L(large_backward):
> -       /* Don't use non-temporal store if there is overlap between
> -          destination and source since destination may be in cache
> -          when source is loaded.  */
> -       leaq    (%rcx, %rdx), %r10
> -       cmpq    %r10, %r9
> -       jb      L(loop_4x_vec_backward)
> -L(loop_large_backward):
> -       /* Copy 4 * VEC a time backward with non-temporal stores.  */
> -       PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
> -       PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
> -       VMOVU   (%rcx), %VEC(0)
> -       VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> -       VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> -       VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> -       subq    $PREFETCHED_LOAD_SIZE, %rcx
> -       subq    $PREFETCHED_LOAD_SIZE, %rdx
> -       VMOVNT  %VEC(0), (%r9)
> -       VMOVNT  %VEC(1), -VEC_SIZE(%r9)
> -       VMOVNT  %VEC(2), -(VEC_SIZE * 2)(%r9)
> -       VMOVNT  %VEC(3), -(VEC_SIZE * 3)(%r9)
> -       subq    $PREFETCHED_LOAD_SIZE, %r9
> -       cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> -       ja      L(loop_large_backward)
> +L(large_memcpy_4x):
> +       movq    %rdx, %r10
> +       /* edx will store remainder size for copying tail.  */
> +       andl    $(PAGE_SIZE * 4 - 1), %edx
> +       /* r10 stores outer loop counter.  */
> +       shrq    $(LOG_PAGE_SIZE + 2), %r10
> +       /* Copy 4x VEC at a time from 4 pages.  */
> +       .p2align 4

If you drop .p2align, will it show up on the glibc benchtest?

> +L(loop_large_memcpy_4x_outer):
> +       /* ecx stores inner loop counter.  */
> +       movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> +L(loop_large_memcpy_4x_inner):
> +       /* Only one prefetch set per page as doing 4 pages give more time
> +          for prefetcher to keep up.  */
> +       PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> +       PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> +       PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> +       PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
> +       /* Load vectors from rsi.  */
> +       LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> +       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +       LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> +       LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> +       addq    $LARGE_LOAD_SIZE, %rsi
> +       /* Non-temporal store vectors to rdi.  */
> +       STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> +       STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +       STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> +       STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> +       addq    $LARGE_LOAD_SIZE, %rdi
> +       decl    %ecx
> +       jnz L(loop_large_memcpy_4x_inner)
> +       addq    $(PAGE_SIZE * 3), %rdi
> +       addq    $(PAGE_SIZE * 3), %rsi
> +       decq    %r10
> +       jne L(loop_large_memcpy_4x_outer)
>         sfence
> -       /* Store the first 4 * VEC.  */
> -       VMOVU   %VEC(4), (%rdi)
> -       VMOVU   %VEC(5), VEC_SIZE(%rdi)
> -       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
> -       VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
> -       /* Store the last VEC.  */
> -       VMOVU   %VEC(8), (%r11)
> +
> +       /* Check if only last 4 loads are needed.  */
> +       cmpl    $(VEC_SIZE * 4), %edx
> +       jbe L(large_memcpy_4x_end)
> +
> +       /* Handle the last 4  * PAGE_SIZE bytes.  */
> +L(loop_large_memcpy_4x_tail):
> +       /* Copy 4 * VEC a time forward with temporal stores.  */
> +       PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> +       PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> +       VMOVU   (%rsi), %VEC(0)
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> +       addq    $(VEC_SIZE * 4), %rsi
> +       subl    $(VEC_SIZE * 4), %edx
> +       VMOVA   %VEC(0), (%rdi)
> +       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> +       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +       addq    $(VEC_SIZE * 4), %rdi
> +       cmpl    $(VEC_SIZE * 4), %edx
> +       ja      L(loop_large_memcpy_4x_tail)
> +
> +L(large_memcpy_4x_end):
> +       /* Store the last 4 * VEC.  */
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> +
> +       VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> +       VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> +       VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
>         VZEROUPPER_RETURN
>  #endif
>  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> --
> 2.29.2
>


-- 
H.J.


* Re: [PATCH v4 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
  2021-04-02  1:53 ` [PATCH v4 1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S H.J. Lu
@ 2021-04-02  4:15   ` Noah Goldstein
  0 siblings, 0 replies; 4+ messages in thread
From: Noah Goldstein @ 2021-04-02  4:15 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Thu, Apr 1, 2021 at 9:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Mar 30, 2021 at 2:38 PM noah <goldstein.w.n@gmail.com> wrote:
> >
> > No Bug. This commit updates the large memcpy case (no overlap). The
> > update is to perform memcpy on either 2 or 4 contiguous pages at
> > once. This 1) helps to alleviate the effects of false memory aliasing
> > when destination and source have a close 4k alignment and 2) is, in
> > most cases and for most DRAM units, a modestly more efficient access
> > pattern. These changes are a clear performance improvement for
> > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
> > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
> > pass.
> >
> > Signed-off-by: noah <goldstein.w.n@gmail.com>
> > ---
> >  .../multiarch/memmove-vec-unaligned-erms.S    | 326 ++++++++++++++----
> >  1 file changed, 258 insertions(+), 68 deletions(-)
>
> 1. There are many trailing whitespaces.
done.
> 2. Replace "jccSPACElabel" with "jccTABlabel".
done.
> 3. Replace "insnSPACEoperand" with "insnTABoperand" if needed.
done.
>
> > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > index 897a3d9762..dae3e2bac5 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > @@ -35,7 +35,16 @@
> >        __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
> >     7. If size >= __x86_shared_non_temporal_threshold and there is no
> >        overlap between destination and source, use non-temporal store
> > -      instead of aligned store.  */
> > +      instead of aligned store copying from either 2 or 4 pages at
> > +      once.
> > +   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
> > +      and source and destination do not page alias, copy from 2 pages
> > +      at once using non-temporal stores. Page aliasing in this case is
> > +      considered true if destination's page alignment - sources' page
> > +      alignment is less than 8 * VEC_SIZE.
> > +   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
> > +      and destination do page alias copy from 4 pages at once using
> > +      non-temporal stores.  */
> >
> >  #include <sysdep.h>
> >
> > @@ -67,6 +76,35 @@
> >  # endif
> >  #endif
> >
> > +#ifndef PAGE_SIZE
> > +# define PAGE_SIZE 4096
> > +#endif
> > +
> > +#if PAGE_SIZE != 4096
> > +# error Unsupported PAGE_SIZE
> > +#endif
> > +
> > +#ifndef LOG_PAGE_SIZE
> > +# define LOG_PAGE_SIZE 12
> > +#endif
> > +
> > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
> > +# error Invalid LOG_PAGE_SIZE
> > +#endif
> > +
> > +/* Byte per page for large_memcpy inner loop.  */
> > +#if VEC_SIZE == 64
> > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
> > +#else
> > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
> > +#endif
> > +
> > +
> > +/* Amount to shift rdx by to compare for memcpy_large_4x.  */
> > +#ifndef LOG_4X_MEMCPY_THRESH
> > +# define LOG_4X_MEMCPY_THRESH 4
> > +#endif
> > +
> >  /* Avoid short distance rep movsb only with non-SSE vector.  */
> >  #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
> >  # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
> > @@ -106,6 +144,28 @@
> >  # error Unsupported PREFETCH_SIZE!
> >  #endif
> >
> > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
> > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
> > +    VMOVU   (offset)base, vec0; \
> > +    VMOVU   ((offset) + VEC_SIZE)base, vec1;
> > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
> > +    VMOVNT  vec0, (offset)base; \
> > +    VMOVNT  vec1, ((offset) + VEC_SIZE)base;
> > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
> > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > +    VMOVU   (offset)base, vec0; \
> > +    VMOVU   ((offset) + VEC_SIZE)base, vec1; \
> > +    VMOVU   ((offset) + VEC_SIZE * 2)base, vec2; \
> > +    VMOVU   ((offset) + VEC_SIZE * 3)base, vec3;
> > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > +    VMOVNT  vec0, (offset)base; \
> > +    VMOVNT  vec1, ((offset) + VEC_SIZE)base; \
> > +    VMOVNT  vec2, ((offset) + VEC_SIZE * 2)base; \
> > +    VMOVNT  vec3, ((offset) + VEC_SIZE * 3)base;
> > +#else
> > +# error Invalid LARGE_LOAD_SIZE
> > +#endif
> > +
> >  #ifndef SECTION
> >  # error SECTION is not defined!
> >  #endif
> > @@ -393,6 +453,15 @@ L(last_4x_vec):
> >         VZEROUPPER_RETURN
> >
> >  L(more_8x_vec):
> > +    /* Check if non-temporal move candidate.  */
> > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > +       /* Check non-temporal store threshold.  */
> > +       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > +       ja      L(large_memcpy_2x)
> > +#endif
> > +    /* Entry if rdx is greater than non-temporal threshold but there
> > +       is overlap.  */
> > +L(more_8x_vec_check):
> >         cmpq    %rsi, %rdi
> >         ja      L(more_8x_vec_backward)
> >         /* Source == destination is less common.  */
> > @@ -419,11 +488,6 @@ L(more_8x_vec):
> >         subq    %r8, %rdi
> >         /* Adjust length.  */
> >         addq    %r8, %rdx
> > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > -       /* Check non-temporal store threshold.  */
> > -       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > -       ja      L(large_forward)
> > -#endif
> >  L(loop_4x_vec_forward):
> >         /* Copy 4 * VEC a time forward.  */
> >         VMOVU   (%rsi), %VEC(0)
> > @@ -470,11 +534,6 @@ L(more_8x_vec_backward):
> >         subq    %r8, %r9
> >         /* Adjust length.  */
> >         subq    %r8, %rdx
> > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > -       /* Check non-temporal store threshold.  */
> > -       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > -       ja      L(large_backward)
> > -#endif
> >  L(loop_4x_vec_backward):
> >         /* Copy 4 * VEC a time backward.  */
> >         VMOVU   (%rcx), %VEC(0)
> > @@ -500,72 +559,203 @@ L(loop_4x_vec_backward):
> >         VZEROUPPER_RETURN
> >
> >  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > -L(large_forward):
> > +L(large_memcpy_2x):
> > +       /* Compute absolute value of difference between source and
> > +          destination.  */
> > +       movq    %rdi, %r9
> > +       subq    %rsi, %r9
> > +       movq    %r9, %r8
> > +       leaq    -1(%r9), %rcx
> > +       sarq    $63, %r8
> > +       xorq    %r8, %r9
> > +       subq    %r8, %r9
> >         /* Don't use non-temporal store if there is overlap between
> > -          destination and source since destination may be in cache
> > -          when source is loaded.  */
> > -       leaq    (%rdi, %rdx), %r10
> > -       cmpq    %r10, %rsi
> > -       jb      L(loop_4x_vec_forward)
> > -L(loop_large_forward):
> > -       /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > -       PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > -       PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
> > +          destination and source since destination may be in cache when
> > +          source is loaded.  */
> > +       cmpq    %r9, %rdx
> > +       ja L(more_8x_vec_check)
> > +
> > +       /* Cache align destination. First store the first 64 bytes then
> > +          adjust alignments.  */
> > +       VMOVU   (%rsi), %VEC(8)
> > +#if VEC_SIZE < 64
> > +       VMOVU   VEC_SIZE(%rsi), %VEC(9)
> > +#if VEC_SIZE < 32
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(10)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(11)
> > +#endif
> > +#endif
> > +       VMOVU   %VEC(8), (%rdi)
> > +#if VEC_SIZE < 64
> > +       VMOVU   %VEC(9), VEC_SIZE(%rdi)
> > +#if VEC_SIZE < 32
> > +       VMOVU   %VEC(10), (VEC_SIZE * 2)(%rdi)
> > +       VMOVU   %VEC(11), (VEC_SIZE * 3)(%rdi)
> > +#endif
> > +#endif
> > +       /* Adjust source, destination, and size.  */
> > +       MOVQ    %rdi, %r8
>
> movq
done.
>
> > +       andq    $63, %r8
> > +       /* Get the negative of offset for alignment.  */
> > +       subq    $64, %r8
> > +       /* Adjust source.  */
> > +       subq    %r8, %rsi
> > +       /* Adjust destination which should be aligned now.  */
> > +       subq    %r8, %rdi
> > +       /* Adjust length.  */
> > +       addq    %r8, %rdx
> > +
> > +       /* Test if source and destination addresses will alias. If they do
> > +          the larger pipeline in large_memcpy_4x alleviated the
> > +          performance drop.  */
> > +       testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
> > +       jz  L(large_memcpy_4x)
> > +
> > +       movq    %rdx, %r10
> > +       shrq    $LOG_4X_MEMCPY_THRESH, %r10
> > +       cmp     __x86_shared_non_temporal_threshold(%rip), %r10
> > +       jae L(large_memcpy_4x)
> > +
> > +       /* edx will store remainder size for copying tail.  */
> > +       andl    $(PAGE_SIZE * 2 - 1), %edx
> > +       /* r10 stores outer loop counter.  */
> > +       shrq    $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
> > +       /* Copy 4x VEC at a time from 2 pages.  */
> > +       .p2align 4
>
> If you drop .p2align, will it show up on the glibc benchtest?
No. Dropped it.
>
> > +L(loop_large_memcpy_2x_outer):
> > +       /* ecx stores inner loop counter.  */
> > +       movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > +L(loop_large_memcpy_2x_inner):
> > +       PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > +       PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > +       PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > +       PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
> > +       /* Load vectors from rsi.  */
> > +       LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > +       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +       addq    $LARGE_LOAD_SIZE, %rsi
> > +       /* Non-temporal store vectors to rdi.  */
> > +       STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > +       STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +       addq    $LARGE_LOAD_SIZE, %rdi
> > +       decl    %ecx
> > +       jnz L(loop_large_memcpy_2x_inner)
> > +       addq    $PAGE_SIZE, %rdi
> > +       addq    $PAGE_SIZE, %rsi
> > +       decq    %r10
> > +       jne L(loop_large_memcpy_2x_outer)
> > +       sfence
> > +
> > +       /* Check if only last 4 loads are needed.  */
> > +       cmpl    $(VEC_SIZE * 4), %edx
> > +       jbe L(large_memcpy_2x_end)
> > +
> > +       /* Handle the last 2 * PAGE_SIZE bytes. Use temporal stores
> > +          here. The region will fit in cache and it should fit user
> > +          expectations for the tail of the memcpy region to be hot.  */
> > +L(loop_large_memcpy_2x_tail):
> > +       /* Copy 4 * VEC a time forward with temporal stores.  */
> > +       PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> > +       PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> >         VMOVU   (%rsi), %VEC(0)
> >         VMOVU   VEC_SIZE(%rsi), %VEC(1)
> >         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> >         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > -       addq    $PREFETCHED_LOAD_SIZE, %rsi
> > -       subq    $PREFETCHED_LOAD_SIZE, %rdx
> > -       VMOVNT  %VEC(0), (%rdi)
> > -       VMOVNT  %VEC(1), VEC_SIZE(%rdi)
> > -       VMOVNT  %VEC(2), (VEC_SIZE * 2)(%rdi)
> > -       VMOVNT  %VEC(3), (VEC_SIZE * 3)(%rdi)
> > -       addq    $PREFETCHED_LOAD_SIZE, %rdi
> > -       cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> > -       ja      L(loop_large_forward)
> > -       sfence
> > +       addq    $(VEC_SIZE * 4), %rsi
> > +       subl    $(VEC_SIZE * 4), %edx
> > +       VMOVA   %VEC(0), (%rdi)
> > +       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > +       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > +       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > +       addq    $(VEC_SIZE * 4), %rdi
> > +       cmpl    $(VEC_SIZE * 4), %edx
> > +       ja      L(loop_large_memcpy_2x_tail)
> > +
> > +L(large_memcpy_2x_end):
> >         /* Store the last 4 * VEC.  */
> > -       VMOVU   %VEC(5), (%rcx)
> > -       VMOVU   %VEC(6), -VEC_SIZE(%rcx)
> > -       VMOVU   %VEC(7), -(VEC_SIZE * 2)(%rcx)
> > -       VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
> > -       /* Store the first VEC.  */
> > -       VMOVU   %VEC(4), (%r11)
> > +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > +
> > +       VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > +       VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > +       VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +       VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> >         VZEROUPPER_RETURN
> >
> > -L(large_backward):
> > -       /* Don't use non-temporal store if there is overlap between
> > -          destination and source since destination may be in cache
> > -          when source is loaded.  */
> > -       leaq    (%rcx, %rdx), %r10
> > -       cmpq    %r10, %r9
> > -       jb      L(loop_4x_vec_backward)
> > -L(loop_large_backward):
> > -       /* Copy 4 * VEC a time backward with non-temporal stores.  */
> > -       PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
> > -       PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
> > -       VMOVU   (%rcx), %VEC(0)
> > -       VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> > -       VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> > -       VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> > -       subq    $PREFETCHED_LOAD_SIZE, %rcx
> > -       subq    $PREFETCHED_LOAD_SIZE, %rdx
> > -       VMOVNT  %VEC(0), (%r9)
> > -       VMOVNT  %VEC(1), -VEC_SIZE(%r9)
> > -       VMOVNT  %VEC(2), -(VEC_SIZE * 2)(%r9)
> > -       VMOVNT  %VEC(3), -(VEC_SIZE * 3)(%r9)
> > -       subq    $PREFETCHED_LOAD_SIZE, %r9
> > -       cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> > -       ja      L(loop_large_backward)
> > +L(large_memcpy_4x):
> > +       movq    %rdx, %r10
> > +       /* edx will store remainder size for copying tail.  */
> > +       andl    $(PAGE_SIZE * 4 - 1), %edx
> > +       /* r10 stores outer loop counter.  */
> > +       shrq    $(LOG_PAGE_SIZE + 2), %r10
> > +       /* Copy 4x VEC at a time from 4 pages.  */
> > +       .p2align 4
>
> If you drop .p2align, will it show up on the glibc benchtest?
No. Dropped it.
>
> > +L(loop_large_memcpy_4x_outer):
> > +       /* ecx stores inner loop counter.  */
> > +       movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > +L(loop_large_memcpy_4x_inner):
> > +       /* Only one prefetch set per page as doing 4 pages give more time
> > +          for prefetcher to keep up.  */
> > +       PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > +       PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > +       PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> > +       PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
> > +       /* Load vectors from rsi.  */
> > +       LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > +       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +       LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > +       LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > +       addq    $LARGE_LOAD_SIZE, %rsi
> > +       /* Non-temporal store vectors to rdi.  */
> > +       STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > +       STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +       STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > +       STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > +       addq    $LARGE_LOAD_SIZE, %rdi
> > +       decl    %ecx
> > +       jnz L(loop_large_memcpy_4x_inner)
> > +       addq    $(PAGE_SIZE * 3), %rdi
> > +       addq    $(PAGE_SIZE * 3), %rsi
> > +       decq    %r10
> > +       jne L(loop_large_memcpy_4x_outer)
> >         sfence
> > -       /* Store the first 4 * VEC.  */
> > -       VMOVU   %VEC(4), (%rdi)
> > -       VMOVU   %VEC(5), VEC_SIZE(%rdi)
> > -       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
> > -       VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
> > -       /* Store the last VEC.  */
> > -       VMOVU   %VEC(8), (%r11)
> > +
> > +       /* Check if only last 4 loads are needed.  */
> > +       cmpl    $(VEC_SIZE * 4), %edx
> > +       jbe L(large_memcpy_4x_end)
> > +
> > +       /* Handle the last 4  * PAGE_SIZE bytes.  */
> > +L(loop_large_memcpy_4x_tail):
> > +       /* Copy 4 * VEC a time forward with temporal stores.  */
> > +       PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> > +       PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> > +       VMOVU   (%rsi), %VEC(0)
> > +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > +       addq    $(VEC_SIZE * 4), %rsi
> > +       subl    $(VEC_SIZE * 4), %edx
> > +       VMOVA   %VEC(0), (%rdi)
> > +       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > +       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > +       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > +       addq    $(VEC_SIZE * 4), %rdi
> > +       cmpl    $(VEC_SIZE * 4), %edx
> > +       ja      L(loop_large_memcpy_4x_tail)
> > +
> > +L(large_memcpy_4x_end):
> > +       /* Store the last 4 * VEC.  */
> > +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > +
> > +       VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > +       VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > +       VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +       VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> >         VZEROUPPER_RETURN
> >  #endif
> >  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> > --
> > 2.29.2
> >
>
>
> --
> H.J.
