From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: 
Received: by sourceware.org (Postfix, from userid 7852) id E7AF93857C49; Mon, 2 May 2022 21:29:58 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org E7AF93857C49
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Sunil Pandey
To: glibc-cvs@sourceware.org
Subject: [glibc/release/2.33/master] x86: Improve memset-vec-unaligned-erms.S
X-Act-Checkin: glibc
X-Git-Author: Noah Goldstein
X-Git-Refname: refs/heads/release/2.33/master
X-Git-Oldrev: 903190e981e995f9f10063fd717ce72ab1c6cb04
X-Git-Newrev: 6903448d936ee49e09db3d13830373f47f58c791
Message-Id: <20220502212958.E7AF93857C49@sourceware.org>
Date: Mon, 2 May 2022 21:29:58 +0000 (GMT)
X-BeenThere: glibc-cvs@sourceware.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Glibc-cvs mailing list
List-Unsubscribe: ,
List-Archive: 
List-Help: 
List-Subscribe: ,
X-List-Received-Date: Mon, 02 May 2022 21:29:59 -0000

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=6903448d936ee49e09db3d13830373f47f58c791

commit 6903448d936ee49e09db3d13830373f47f58c791
Author: Noah Goldstein
Date:   Thu May 20 13:13:51 2021 -0400

    x86: Improve memset-vec-unaligned-erms.S

    No bug. This commit makes a few small improvements to
    memset-vec-unaligned-erms.S. The changes are
    1) only aligning to 64 instead of 128. Either alignment will perform
       equally well in a loop and 128 just increases the odds of having
       to do an extra iteration which can be significant overhead for
       small values.
    2) Align some targets and the loop.
    3) Remove an ALU from the alignment process.
    4) Reorder the last 4x VEC so that they are stored after the loop.
    5) Move the condition for leq 8x VEC to before the alignment process.
    test-memset and test-wmemset are both passing.

    Signed-off-by: Noah Goldstein
    Reviewed-by: H.J. Lu
    (cherry picked from commit 6abf27980a947f9b6e514d6b33b83059d39566ae)

Diff:
---
 .../x86_64/multiarch/memset-vec-unaligned-erms.S | 50 ++++++++++++----------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 08cfa49bd1..ff196844a0 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
        VMOVU %VEC(0), (%rdi)
        VZEROUPPER_RETURN

+       .p2align 4
 L(stosb_more_2x_vec):
        cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
        ja L(stosb)
+#else
+       .p2align 4
 #endif
 L(more_2x_vec):
-       cmpq $(VEC_SIZE * 4), %rdx
-       ja L(loop_start)
+       /* Stores to first 2x VEC before cmp as any path forward will
+          require it.  */
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(0), VEC_SIZE(%rdi)
-       VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+       cmpq $(VEC_SIZE * 4), %rdx
+       ja L(loop_start)
        VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+       VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
 L(return):
 #if VEC_SIZE > 16
        ZERO_UPPER_VEC_REGISTERS_RETURN
@@ -192,28 +197,29 @@ L(return):
 #endif

 L(loop_start):
-       leaq (VEC_SIZE * 4)(%rdi), %rcx
-       VMOVU %VEC(0), (%rdi)
-       andq $-(VEC_SIZE * 4), %rcx
-       VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
-       VMOVU %VEC(0), VEC_SIZE(%rdi)
-       VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
        VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
-       VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
        VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
-       VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
-       addq %rdi, %rdx
-       andq $-(VEC_SIZE * 4), %rdx
-       cmpq %rdx, %rcx
-       je L(return)
+       cmpq $(VEC_SIZE * 8), %rdx
+       jbe L(loop_end)
+       andq $-(VEC_SIZE * 2), %rdi
+       subq $-(VEC_SIZE * 4), %rdi
+       leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx
+       .p2align 4
 L(loop):
-       VMOVA %VEC(0), (%rcx)
-       VMOVA %VEC(0), VEC_SIZE(%rcx)
-       VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
-       VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
-       addq $(VEC_SIZE * 4), %rcx
-       cmpq %rcx, %rdx
-       jne L(loop)
+       VMOVA %VEC(0), (%rdi)
+       VMOVA %VEC(0), VEC_SIZE(%rdi)
+       VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi)
+       VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi)
+       subq $-(VEC_SIZE * 4), %rdi
+       cmpq %rcx, %rdi
+       jb L(loop)
+L(loop_end):
+       /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
+              rdx as length is also unchanged.  */
+       VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
+       VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
+       VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
+       VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
        VZEROUPPER_SHORT_RETURN

        .p2align 4
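
For readers who find the assembly easier to follow in C, below is a minimal
sketch of the control flow that the patched L(more_2x_vec) / L(loop_start) /
L(loop_end) path implements. It is illustrative only, not glibc code:
VEC_SIZE, store_vec and memset_more_2x_vec are invented names, store_vec
stands in for the unaligned vector store VMOVU %VEC(0), and the value 32
assumes the 32-byte (AVX2) build of the file.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define VEC_SIZE 32   /* Assumes the 32-byte-vector build.  */

/* Stand-in for VMOVU %VEC(0), (p): one unaligned VEC_SIZE-byte store.  */
static void
store_vec (unsigned char *p, int c)
{
  memset (p, c, VEC_SIZE);
}

/* Sketch of the patched path for len > 2 * VEC_SIZE.  ptr plays the role
   of %rax (return value / base), cur of %rdi, len of %rdx.  */
static void *
memset_more_2x_vec (void *dst, int c, size_t len)
{
  unsigned char *ptr = dst;
  unsigned char *cur = dst;

  /* Store the first 2x VEC before any comparison; every path needs them.  */
  store_vec (cur, c);
  store_vec (cur + VEC_SIZE, c);

  if (len <= 4 * VEC_SIZE)
    {
      /* 2x-4x VEC: finish with two (possibly overlapping) tail stores.  */
      store_vec (ptr + len - 2 * VEC_SIZE, c);
      store_vec (ptr + len - VEC_SIZE, c);
      return ptr;
    }

  /* L(loop_start): fill out the first 4x VEC.  */
  store_vec (cur + 2 * VEC_SIZE, c);
  store_vec (cur + 3 * VEC_SIZE, c);

  if (len > 8 * VEC_SIZE)
    {
      /* Align down to 2 * VEC_SIZE (not 4 * VEC_SIZE) and step past the
         4x VEC already written; 'end' plays the role of %rcx.  */
      cur = (unsigned char *) ((uintptr_t) cur & ~(uintptr_t) (2 * VEC_SIZE - 1));
      cur += 4 * VEC_SIZE;
      unsigned char *end = ptr + len - 4 * VEC_SIZE;

      do   /* L(loop): 4x aligned VEC per iteration.  */
        {
          store_vec (cur, c);
          store_vec (cur + VEC_SIZE, c);
          store_vec (cur + 2 * VEC_SIZE, c);
          store_vec (cur + 3 * VEC_SIZE, c);
          cur += 4 * VEC_SIZE;
        }
      while (cur < end);
    }

  /* L(loop_end): the last 4x VEC are always stored after the loop, using
     the unchanged base pointer and length (change 4 in the list above).  */
  store_vec (ptr + len - 4 * VEC_SIZE, c);
  store_vec (ptr + len - 3 * VEC_SIZE, c);
  store_vec (ptr + len - 2 * VEC_SIZE, c);
  store_vec (ptr + len - VEC_SIZE, c);
  return ptr;
}

int
main (void)
{
  unsigned char buf[1024];
  /* Exercise a length above 8x VEC (> 256 bytes for VEC_SIZE 32).  */
  memset_more_2x_vec (buf, 0xab, 300);
  for (size_t i = 0; i < 300; i++)
    assert (buf[i] == 0xab);
  return 0;
}

One encoding detail in the new loop setup: subq $-(VEC_SIZE * 4), %rdi adds
VEC_SIZE * 4, but writing it as subtraction of a negative constant lets -128
(in the 32-byte-vector build) be encoded as a sign-extended 8-bit immediate,
whereas addq $128 would need a 32-bit immediate, so the instruction is
shorter.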