public inbox for libc-alpha@sourceware.org
* [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests
@ 2022-10-29 20:19 Noah Goldstein
  2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Noah Goldstein @ 2022-10-29 20:19 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

len=0 is valid and fairly common, so it should be tested.
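
A quick standalone illustration of the len=0 semantics being covered
(ISO C: comparing zero bytes must report equality no matter what the
buffers hold; the buffer names below are made up for this example and
are not taken from the test harness):

  #include <assert.h>
  #include <string.h>

  int
  main (void)
  {
    char a[1] = { 'x' };
    char b[1] = { 'y' };
    /* Zero bytes compared: equal even though a and b differ.  */
    assert (memcmp (a, b, 0) == 0);
    return 0;
  }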
---
 benchtests/bench-memcmp.c | 18 +++++++++---------
 string/test-memcmp.c      | 16 ++++++++++------
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/benchtests/bench-memcmp.c b/benchtests/bench-memcmp.c
index d64eaa992e..b2816baebe 100644
--- a/benchtests/bench-memcmp.c
+++ b/benchtests/bench-memcmp.c
@@ -63,7 +63,7 @@ IMPL (MEMCMP, 1)
 
 static void
 do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s1,
-	     const CHAR *s2, size_t len, int exp_result)
+	     const CHAR *s2, size_t len)
 {
   size_t i, iters = INNER_LOOP_ITERS_LARGE;
   timing_t start, stop, cur;
@@ -87,9 +87,6 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
   size_t i;
   CHAR *s1, *s2;
 
-  if (len == 0)
-    return;
-
   align1 &= (4096 - CHARBYTES);
   if (align1 + (len + 1) * CHARBYTES >= page_size)
     return;
@@ -111,13 +108,16 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
   for (i = 0; i < len; i++)
     s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % MAX_CHAR;
 
-  s1[len] = align1;
-  s2[len] = align2;
-  s2[len - 1] -= exp_result;
+  if (len)
+    {
+      s1[len] = align1;
+      s2[len] = align2;
+      s2[len - 1] -= exp_result;
+    }
 
   FOR_EACH_IMPL (impl, 0)
     {
-      do_one_test (json_ctx, impl, s1, s2, len, exp_result);
+      do_one_test (json_ctx, impl, s1, s2, len);
     }
 
   json_array_end (json_ctx);
@@ -147,7 +147,7 @@ test_main (void)
   json_array_end (&json_ctx);
 
   json_array_begin (&json_ctx, "results");
-  for (i = 1; i < 32; ++i)
+  for (i = 0; i < 32; ++i)
     {
       do_test (&json_ctx, i * CHARBYTES, i * CHARBYTES, i, 0);
       do_test (&json_ctx, i * CHARBYTES, i * CHARBYTES, i, 1);
diff --git a/string/test-memcmp.c b/string/test-memcmp.c
index 181b689f68..18d8b0d9f1 100644
--- a/string/test-memcmp.c
+++ b/string/test-memcmp.c
@@ -117,9 +117,6 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result)
   size_t i;
   CHAR *s1, *s2;
 
-  if (len == 0)
-    return;
-
   align1 &= (4096 - CHARBYTES);
   if (align1 + (len + 1) * CHARBYTES >= page_size)
     return;
@@ -134,9 +131,16 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result)
   for (i = 0; i < len; i++)
     s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % CHAR__MAX;
 
-  s1[len] = align1;
-  s2[len] = align2;
-  s2[len - 1] -= exp_result;
+  if (len)
+    {
+      s1[len] = align1;
+      s2[len] = align2;
+      s2[len - 1] -= exp_result;
+    }
+  else
+    {
+      exp_result = 0;
+    }
 
   FOR_EACH_IMPL (impl, 0)
     do_one_test (impl, s1, s2, len, exp_result);
-- 
2.34.1



* [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes
  2022-10-29 20:19 [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Noah Goldstein
@ 2022-10-29 20:19 ` Noah Goldstein
  2022-10-31 15:47   ` H.J. Lu
  2022-10-29 20:19 ` [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S " Noah Goldstein
  2022-10-31 13:19 ` [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Siddhesh Poyarekar
  2 siblings, 1 reply; 7+ messages in thread
From: Noah Goldstein @ 2022-10-29 20:19 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

The only change to the existing generated code is `tzcnt` -> `bsf` to
save a byte of code size here and there.
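
As a rough C-level sketch of why the `tzcnt` -> `bsf` swap is safe:
the return paths are only reached after a `jnz` on the mask, and for a
nonzero input both instructions give the index of the lowest set bit
(they only differ for zero input), while `bsf` has the shorter
encoding. The helper and parameter names below are invented for
illustration; this is not the asm itself:

  #include <stdint.h>

  /* mask has bit i set when byte i of s1 and s2 differ and is known
     to be nonzero on entry (hence __builtin_ctz is well defined).  */
  static int
  diff_from_mask (const unsigned char *s1, const unsigned char *s2,
                  uint32_t mask)
  {
    unsigned int i = __builtin_ctz (mask); /* what bsf/tzcnt compute  */
    return (int) s1[i] - (int) s2[i];      /* the movzbl/movzbl/subl path  */
  }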

Rewriting with VMM API allows for memcmp-evex-movbe to be used with
evex512 by including "x86-evex512-vecs.h" at the top.

Complete check passes on x86-64.
---
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 308 +++++++++++--------
 1 file changed, 175 insertions(+), 133 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index bc017768be..f6c379831e 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -62,44 +62,38 @@ Latency:
 #  define MEMCMP	__memcmp_evex_movbe
 # endif
 
-# define VMOVU		vmovdqu64
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
 
 # ifdef USE_AS_WMEMCMP
 #  define VMOVU_MASK	vmovdqu32
 #  define CHAR_SIZE	4
 #  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPTEST	vptestmd
+
+#  define USE_WIDE_CHAR
 # else
 #  define VMOVU_MASK	vmovdqu8
 #  define CHAR_SIZE	1
 #  define VPCMP	vpcmpub
+#  define VPCMPEQ	vpcmpeqb
 #  define VPTEST	vptestmb
 # endif
 
+# include "reg-macros.h"
 
-# define VEC_SIZE	32
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define XMM2		xmm18
-# define YMM0		ymm16
-# define XMM1		xmm17
-# define XMM2		xmm18
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
 
 /* Warning!
            wmemcmp has to use SIGNED comparison for elements.
            memcmp has to use UNSIGNED comparison for elements.
 */
 
-	.section .text.evex,"ax",@progbits
+	.section SECTION(.text), "ax", @progbits
 /* Cache align memcmp entry. This allows for much more thorough
    frontend optimization.  */
 ENTRY_P2ALIGN (MEMCMP, 6)
@@ -111,23 +105,40 @@ ENTRY_P2ALIGN (MEMCMP, 6)
 	/* Fall through for [0, VEC_SIZE] as it's the hottest.  */
 	ja	L(more_1x_vec)
 
-	/* Create mask for CHAR's we want to compare. This allows us to
-	   avoid having to include page cross logic.  */
-	movl	$-1, %ecx
-	bzhil	%edx, %ecx, %ecx
-	kmovd	%ecx, %k2
+	/* Create mask of bytes that are guaranteed to be valid because
+	   of length (edx). Using masked movs allows us to skip checks
+	   for page crosses/zero size.  */
+	mov	$-1, %VRAX
+	bzhi	%VRDX, %VRAX, %VRAX
+	/* NB: A `jz` might be useful here. Page-faults that are
+	   invalidated by predicate execution (the evex mask) can be
+	   very slow.  The expectation is that this is not the norm, so
+	   "most" code will not regularly call 'memcmp' with length = 0
+	   and memory that is not wired up.  */
+	KMOV	%VRAX, %k2
+
+
 
 	/* Safe to load full ymm with mask.  */
-	VMOVU_MASK (%rsi), %YMM2{%k2}
-	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
+	/* Slightly different method for VEC_SIZE == 64 to save a bit of
+	   code size. This allows us to fit L(return_vec_0) entirely in
+	   the first cache line.  */
+# if VEC_SIZE == 64
+	VPCMPEQ	(%rdi), %VMM(2), %k1{%k2}
+	KMOV	%k1, %VRCX
+	sub	%VRCX, %VRAX
+# else
+	VPCMP	$4, (%rdi), %VMM(2), %k1{%k2}
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
+# endif
 	jnz	L(return_vec_0)
 	ret
 
-	.p2align 4
+	.p2align 4,, 11
 L(return_vec_0):
-	tzcntl	%eax, %eax
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_WMEMCMP
 	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -138,33 +149,36 @@ L(return_vec_0):
 	leal	-1(%rdx, %rdx), %eax
 # else
 	movzbl	(%rsi, %rax), %ecx
+#  if VEC_SIZE == 64
+	movb	(%rdi, %rax), %al
+#  else
 	movzbl	(%rdi, %rax), %eax
+#  endif
 	subl	%ecx, %eax
 # endif
 	ret
 
-
-	.p2align 4
+	.p2align 4,, 11
 L(more_1x_vec):
 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %YMM1
+	VMOVU	(%rsi), %VMM(1)
 	/* Use compare not equals to directly check for mismatch.  */
-	VPCMP	$4,(%rdi), %YMM1, %k1
-	kmovd	%k1, %eax
+	VPCMP	$4, (%rdi), %VMM(1), %k1
+	KMOV	%k1, %VRAX
 	/* NB: eax must be destination register if going to
-	   L(return_vec_[0,2]). For L(return_vec_3) destination register
-	   must be ecx.  */
-	testl	%eax, %eax
+	   L(return_vec_[0,2]). For L(return_vec_3) destination
+	   register must be ecx.  */
+	test	%VRAX, %VRAX
 	jnz	L(return_vec_0)
 
 	cmpq	$(CHAR_PER_VEC * 2), %rdx
 	jbe	L(last_1x_vec)
 
 	/* Check second VEC no matter what.  */
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VMOVU	VEC_SIZE(%rsi), %VMM(2)
+	VPCMP	$4, VEC_SIZE(%rdi), %VMM(2), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(return_vec_1)
 
 	/* Less than 4 * VEC.  */
@@ -172,16 +186,16 @@ L(more_1x_vec):
 	jbe	L(last_2x_vec)
 
 	/* Check third and fourth VEC no matter what.  */
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %VMM(3), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(return_vec_2)
 
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
-	kmovd	%k1, %ecx
-	testl	%ecx, %ecx
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
+	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %VMM(4), %k1
+	KMOV	%k1, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(return_vec_3)
 
 	/* Go to 4x VEC loop.  */
@@ -192,8 +206,8 @@ L(more_1x_vec):
 	   branches.  */
 
 	/* Load first two VEC from s2 before adjusting addresses.  */
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(2)
 	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
 	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
 
@@ -202,56 +216,61 @@ L(more_1x_vec):
 
 	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
 	   will have some 1s.  */
-	vpxorq	(%rdi), %YMM1, %YMM1
-	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2
+	vpxorq	(%rdi), %VMM(1), %VMM(1)
+	vpxorq	(VEC_SIZE)(%rdi), %VMM(2), %VMM(2)
 
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	vpxorq	(VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
 
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
-	   oring with YMM1. Result is stored in YMM4.  */
-	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
+	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with VEC(4) while
+	   oring with VEC(1). Result is stored in VEC(4).  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)
 
-	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
-	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	/* Or together VEC(2), VEC(3), and VEC(4) into VEC(4).  */
+	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
 
-	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
+	/* Test VEC(4) against itself. Store any CHAR mismatches in k1.
 	 */
-	VPTEST	%YMM4, %YMM4, %k1
+	VPTEST	%VMM(4), %VMM(4), %k1
 	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
-	kmovd	%k1, %ecx
-	testl	%ecx, %ecx
+	KMOV	%k1, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
 
-	.p2align 4,, 8
+	.p2align 4,, 9
 L(8x_end_return_vec_0_1_2_3):
 	movq	%rdx, %rdi
 L(8x_return_vec_0_1_2_3):
+	/* L(loop_4x_vec) leaves result in `k1` for VEC_SIZE == 64.  */
+# if VEC_SIZE == 64
+	KMOV	%k1, %VRCX
+# endif
 	addq	%rdi, %rsi
 L(return_vec_0_1_2_3):
-	VPTEST	%YMM1, %YMM1, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPTEST	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(return_vec_0)
 
-	VPTEST	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPTEST	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(return_vec_1)
 
-	VPTEST	%YMM3, %YMM3, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPTEST	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(return_vec_2)
+	.p2align 4,, 2
 L(return_vec_3):
 	/* bsf saves 1 byte from tzcnt. This keeps L(return_vec_3) in one
 	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
 	   line.  */
-	bsfl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_WMEMCMP
 	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
 	xorl	%edx, %edx
@@ -266,11 +285,11 @@ L(return_vec_3):
 	ret
 
 
-	.p2align 4
+	.p2align 4,, 8
 L(return_vec_1):
 	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
 	   fetch block.  */
-	bsfl	%eax, %eax
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_WMEMCMP
 	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -284,11 +303,11 @@ L(return_vec_1):
 # endif
 	ret
 
-	.p2align 4,, 10
+	.p2align 4,, 7
 L(return_vec_2):
 	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
 	   fetch block.  */
-	bsfl	%eax, %eax
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_WMEMCMP
 	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -302,7 +321,7 @@ L(return_vec_2):
 # endif
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
 L(more_8x_vec):
 	/* Set end of s1 in rdx.  */
 	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
@@ -316,62 +335,82 @@ L(more_8x_vec):
 
 	.p2align 4
 L(loop_4x_vec):
-	VMOVU	(%rsi, %rdi), %YMM1
-	vpxorq	(%rdi), %YMM1, %YMM1
-	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
-	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
-	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
-	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
-	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
-	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
-	VPTEST	%YMM4, %YMM4, %k1
-	kmovd	%k1, %ecx
-	testl	%ecx, %ecx
+	VMOVU	(%rsi, %rdi), %VMM(1)
+	vpxorq	(%rdi), %VMM(1), %VMM(1)
+	VMOVU	VEC_SIZE(%rsi, %rdi), %VMM(2)
+	vpxorq	VEC_SIZE(%rdi), %VMM(2), %VMM(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
+	vpxorq	(VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)
+	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
+	VPTEST	%VMM(4), %VMM(4), %k1
+	/* If VEC_SIZE == 64 just branch with KTEST. We have free port0
+	   space and it allows the loop to fit in 2x cache lines
+	   instead of 3.  */
+# if VEC_SIZE == 64
+	KTEST	%k1, %k1
+# else
+	KMOV	%k1, %VRCX
+	test	%VRCX, %VRCX
+# endif
 	jnz	L(8x_return_vec_0_1_2_3)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpq	%rdx, %rdi
 	jb	L(loop_4x_vec)
-
 	subq	%rdx, %rdi
 	/* rdi has 4 * VEC_SIZE - remaining length.  */
 	cmpl	$(VEC_SIZE * 3), %edi
-	jae	L(8x_last_1x_vec)
+	jge	L(8x_last_1x_vec)
 	/* Load regardless of branch.  */
-	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
-	cmpl	$(VEC_SIZE * 2), %edi
-	jae	L(8x_last_2x_vec)
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
 
-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
-
-	VMOVU	(%rsi, %rdx), %YMM1
-	vpxorq	(%rdx), %YMM1, %YMM1
+	/* Separate logic as we can only use testb for VEC_SIZE == 64.
+	 */
+# if VEC_SIZE == 64
+	testb	%dil, %dil
+	js	L(8x_last_2x_vec)
+# else
+	cmpl	$(VEC_SIZE * 2), %edi
+	jge	L(8x_last_2x_vec)
+# endif
 
-	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
-	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
-	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
-	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
-	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
-	VPTEST	%YMM4, %YMM4, %k1
-	kmovd	%k1, %ecx
-	testl	%ecx, %ecx
+	vpxorq	(VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(3)
+
+	VMOVU	(%rsi, %rdx), %VMM(1)
+	vpxorq	(%rdx), %VMM(1), %VMM(1)
+
+	VMOVU	VEC_SIZE(%rsi, %rdx), %VMM(2)
+	vpxorq	VEC_SIZE(%rdx), %VMM(2), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %VMM(1), %VMM(4)
+	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
+	VPTEST	%VMM(4), %VMM(4), %k1
+	/* L(8x_end_return_vec_0_1_2_3) expects bitmask to still be in
+	   `k1`  if VEC_SIZE == 64.  */
+# if VEC_SIZE == 64
+	KTEST	%k1, %k1
+# else
+	KMOV	%k1, %VRCX
+	test	%VRCX, %VRCX
+# endif
 	jnz	L(8x_end_return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
 	/* Only entry is from L(more_8x_vec).  */
-	.p2align 4,, 10
+	.p2align 4,, 6
 L(8x_last_2x_vec):
-	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %VMM(3), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(8x_return_vec_2)
-	/* Naturally aligned to 16 bytes.  */
+	.p2align 4,, 5
 L(8x_last_1x_vec):
-	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
-	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
+	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(8x_return_vec_3)
 	ret
 
@@ -383,7 +422,7 @@ L(8x_last_1x_vec):
 L(8x_return_vec_2):
 	subq	$VEC_SIZE, %rdx
 L(8x_return_vec_3):
-	bsfl	%eax, %eax
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_WMEMCMP
 	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 	movl	(VEC_SIZE * 3)(%rax), %ecx
@@ -399,32 +438,34 @@ L(8x_return_vec_3):
 # endif
 	ret
 
-	.p2align 4,, 10
+	.p2align 4,, 8
 L(last_2x_vec):
 	/* Check second to last VEC.  */
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
-	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(return_vec_1_end)
 
 	/* Check last VEC.  */
-	.p2align 4
+	.p2align 4,, 8
 L(last_1x_vec):
-	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
-	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(return_vec_0_end)
 	ret
 
 
-	/* Don't align. Takes 2-fetch blocks either way and aligning
-	   will cause code to spill into another cacheline.  */
+	/* Don't fully align. Takes 2-fetch blocks either way and
+	   aligning will cause code to spill into another cacheline.
+	 */
+	.p2align 4,, 3
 L(return_vec_1_end):
 	/* Use bsf to save code size. This is necessary to have
 	   L(one_or_less) fit in aligning bytes between.  */
-	bsfl	%eax, %eax
+	bsf	%VRAX, %VRAX
 	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
@@ -439,10 +480,11 @@ L(return_vec_1_end):
 # endif
 	ret
 
+	.p2align 4,, 2
 	/* Don't align. Takes 2-fetch blocks either way and aligning
 	   will cause code to spill into another cacheline.  */
 L(return_vec_0_end):
-	tzcntl	%eax, %eax
+	bsf	%VRAX, %VRAX
 	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
@@ -456,7 +498,7 @@ L(return_vec_0_end):
 	subl	%ecx, %eax
 # endif
 	ret
-	/* 1-byte until next cache line.  */
-
+	/* evex256: 2 bytes until next cache line. evex512: 46 bytes
+	   until next cache line.  */
 END (MEMCMP)
 #endif
-- 
2.34.1



* [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S and minor changes
  2022-10-29 20:19 [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Noah Goldstein
  2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein
@ 2022-10-29 20:19 ` Noah Goldstein
  2022-10-31 15:48   ` H.J. Lu
  2022-10-31 13:19 ` [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Siddhesh Poyarekar
  2 siblings, 1 reply; 7+ messages in thread
From: Noah Goldstein @ 2022-10-29 20:19 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Changes to generated code are:
    1. In a few places use `vpcmpeqb` instead of `vpcmpneq` to save a
       byte of code size.
    2. Add a branch for length <= (VEC_SIZE * 6) as opposed to doing
       the entire block of [VEC_SIZE * 4 + 1, VEC_SIZE * 8] in a
       single basic-block (the space to add the extra branch without
       changing code size is bought with the above change).

Change (2) has roughly a 20-25% speedup for sizes in [VEC_SIZE * 4 +
1, VEC_SIZE * 6] and negligible to no cost for [VEC_SIZE * 6 + 1,
VEC_SIZE * 8].

From N=10 runs on Tigerlake:

align1,align2 ,length ,result               ,New Time    ,Cur Time,New Time / Old Time
0     ,0      ,129    ,0                    ,5.404       ,6.887   ,0.785
0     ,0      ,129    ,1                    ,5.308       ,6.826   ,0.778
0     ,0      ,129    ,18446744073709551615 ,5.359       ,6.823   ,0.785
0     ,0      ,161    ,0                    ,5.284       ,6.827   ,0.774
0     ,0      ,161    ,1                    ,5.317       ,6.745   ,0.788
0     ,0      ,161    ,18446744073709551615 ,5.406       ,6.778   ,0.798

0     ,0      ,193    ,0                    ,6.969       ,6.832   ,1.000
0     ,0      ,193    ,1                    ,6.943       ,6.748   ,1.029
0     ,0      ,193    ,18446744073709551615 ,6.997       ,6.728   ,1.011
0     ,0      ,225    ,0                    ,7.144       ,6.746   ,0.989
0     ,0      ,225    ,1                    ,7.218       ,6.683   ,1.003
0     ,0      ,225    ,18446744073709551615 ,6.864       ,6.767   ,0.992
0     ,0      ,256    ,0                    ,5.423       ,5.482   ,0.989
0     ,0      ,256    ,1                    ,5.348       ,5.465   ,0.978
0     ,0      ,256    ,18446744073709551615 ,5.321       ,5.518   ,0.964
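
As a C-level sketch of change (2) above, with a scalar stand-in for
the vector compares (the VEC constant and helper names are invented
for the illustration): once the first 4 vectors are known equal, a
length of at most 6 * VEC is fully covered by the two vectors that end
at s + len, so the other two compares can be skipped.

  #include <stddef.h>
  #include <string.h>

  enum { VEC = 64 };

  /* Scalar stand-in for one vector compare over VEC bytes.  */
  static int
  vec_differs (const unsigned char *a, const unsigned char *b)
  {
    return memcmp (a, b, VEC) != 0;
  }

  /* Tail handling for len in [4 * VEC + 1, 8 * VEC], after the first
     4 vectors already compared equal.  */
  static int
  tail_4x_to_8x (const unsigned char *s1, const unsigned char *s2,
                 size_t len)
  {
    int diff = vec_differs (s1 + len - 1 * VEC, s2 + len - 1 * VEC)
               | vec_differs (s1 + len - 2 * VEC, s2 + len - 2 * VEC);
    if (len > 6 * VEC)
      diff |= vec_differs (s1 + len - 3 * VEC, s2 + len - 3 * VEC)
              | vec_differs (s1 + len - 4 * VEC, s2 + len - 4 * VEC);
    return diff;
  }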

Rewriting with VMM API allows for memcmpeq-evex to be used with
evex512 by including "x86-evex512-vecs.h" at the top.
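
One more note on the VEC_SIZE == 64 configuration: the 64-bit compare
mask has to be folded into the `int` return value. Because __memcmpeq
only promises zero for equal inputs and some nonzero value otherwise,
a population count is sufficient (nonzero exactly when the mask is
nonzero, and it always fits in 32 bits). A hedged C sketch of that
idea, with an invented helper name (this is not the asm itself):

  #include <stdint.h>

  /* mismatch_mask has one bit set per differing byte of the final
     compare; zero means the inputs matched.  */
  static int
  fold_mask (uint64_t mismatch_mask)
  {
    return (int) __builtin_popcountll (mismatch_mask); /* popcntq  */
  }

The `neg`/`sbb` pair in the new TO_32BIT_P1/TO_32BIT_P2 macros serves
the same purpose on the paths where the mask value was already
consumed by a branch.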

Complete check passes on x86-64.
---
 sysdeps/x86_64/multiarch/memcmpeq-evex.S | 255 ++++++++++++++---------
 1 file changed, 155 insertions(+), 100 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
index 41124ef1d3..671d19393e 100644
--- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
+++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
@@ -41,24 +41,53 @@
 #  define MEMCMPEQ	__memcmpeq_evex
 # endif
 
+# ifndef VEC_SIZE
+#  include "x86-evex512-vecs.h"
+# endif
+# include "reg-macros.h"
+
+
+# if VEC_SIZE == 32
+
+#  define TEST_ZERO_VCMP(reg)	inc %VGPR(reg)
+#  define TEST_ZERO(reg)	test %VGPR(reg), %VGPR(reg)
+
+#  define TO_32BIT_P1(reg)	/* Do nothing. */
+#  define TO_32BIT_P2(reg)	/* Do nothing. */
+#  define TO_32BIT(reg)	/* Do nothing. */
+
+#  define VEC_CMP	VPCMPEQ
+
+# elif VEC_SIZE == 64
+
+#  define TEST_ZERO_VCMP(reg)	TEST_ZERO(reg)
+#  define TEST_ZERO(reg)	neg %VGPR(reg)
+
+
+	/* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit
+	   int. We have two methods for this. If the mask was branched
+	   on, we use `neg` for the branch then `sbb` to get the 32-bit
+	   return. If the mask was not branched on, we just use
+	   `popcntq`.  */
+#  define TO_32BIT_P1(reg)	TEST_ZERO(reg)
+#  define TO_32BIT_P2(reg)	sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32)
+#  define TO_32BIT(reg)	popcntq %reg, %reg
+
+#  define VEC_CMP	VPCMPNEQ
+
+# else
+#  error "Unsupported VEC_SIZE"
+# endif
+
+
 # define VMOVU_MASK	vmovdqu8
-# define VMOVU	vmovdqu64
-# define VPCMP	vpcmpub
+# define VPCMPNEQ	vpcmpneqb
+# define VPCMPEQ	vpcmpeqb
 # define VPTEST	vptestmb
 
-# define VEC_SIZE	32
 # define PAGE_SIZE	4096
 
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-
-
-	.section .text.evex, "ax", @progbits
+	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN (MEMCMPEQ, 6)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
@@ -69,47 +98,54 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
 	ja	L(more_1x_vec)
 
 	/* Create mask of bytes that are guaranteed to be valid because
-	   of length (edx). Using masked movs allows us to skip checks for
-	   page crosses/zero size.  */
-	movl	$-1, %ecx
-	bzhil	%edx, %ecx, %ecx
-	kmovd	%ecx, %k2
+	   of length (edx). Using masked movs allows us to skip checks
+	   for page crosses/zero size.  */
+	mov	$-1, %VRAX
+	bzhi	%VRDX, %VRAX, %VRAX
+	/* NB: A `jz` might be useful here. Page-faults that are
+	   invalidated by predicate execution (the evex mask) can be
+	   very slow.  The expectation is that this is not the norm, so
+	   "most" code will not regularly call 'memcmp' with length = 0
+	   and memory that is not wired up.  */
+	KMOV	%VRAX, %k2
 
 	/* Use masked loads as VEC_SIZE could page cross where length
 	   (edx) would not.  */
-	VMOVU_MASK (%rsi), %YMM2{%k2}
-	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
-	kmovd	%k1, %eax
+	VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
+	VPCMPNEQ (%rdi), %VMM(2), %k1{%k2}
+	KMOV	%k1, %VRAX
+	TO_32BIT (VRAX)
 	ret
 
-
+	.p2align 4,, 3
 L(last_1x_vec):
-	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
-	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
-	kmovd	%k1, %eax
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
+	VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	TO_32BIT_P1 (rax)
 L(return_neq0):
+	TO_32BIT_P2 (rax)
 	ret
 
 
-
-	.p2align 4
+	.p2align 4,, 12
 L(more_1x_vec):
 	/* From VEC + 1 to 2 * VEC.  */
-	VMOVU	(%rsi), %YMM1
+	VMOVU	(%rsi), %VMM(1)
 	/* Use compare not equals to directly check for mismatch.  */
-	VPCMP	$4,(%rdi), %YMM1, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VPCMPNEQ (%rdi), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	TEST_ZERO (rax)
 	jnz	L(return_neq0)
 
 	cmpq	$(VEC_SIZE * 2), %rdx
 	jbe	L(last_1x_vec)
 
 	/* Check second VEC no matter what.  */
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VMOVU	VEC_SIZE(%rsi), %VMM(2)
+	VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1
+	KMOV	%k1, %VRAX
+	TEST_ZERO (rax)
 	jnz	L(return_neq0)
 
 	/* Less than 4 * VEC.  */
@@ -117,16 +153,16 @@ L(more_1x_vec):
 	jbe	L(last_2x_vec)
 
 	/* Check third and fourth VEC no matter what.  */
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VEC_CMP	(VEC_SIZE * 2)(%rdi), %VMM(3), %k1
+	KMOV	%k1, %VRAX
+	TEST_ZERO_VCMP (rax)
 	jnz	L(return_neq0)
 
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
+	VEC_CMP	(VEC_SIZE * 3)(%rdi), %VMM(4), %k1
+	KMOV	%k1, %VRAX
+	TEST_ZERO_VCMP (rax)
 	jnz	L(return_neq0)
 
 	/* Go to 4x VEC loop.  */
@@ -136,8 +172,8 @@ L(more_1x_vec):
 	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
 	   branches.  */
 
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
 	addq	%rdx, %rdi
 
 	/* Wait to load from s1 until addresses are adjusted due to
@@ -145,26 +181,32 @@ L(more_1x_vec):
 
 	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
 	   will have some 1s.  */
-	vpxorq	-(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
-	/* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
-	   oring with YMM1. Result is stored in YMM1.  */
-	vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2
-
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
-	vpxorq	-(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
-	VMOVU	-(VEC_SIZE)(%rsi, %rdx), %YMM4
-	vpxorq	-(VEC_SIZE)(%rdi), %YMM4, %YMM4
-
-	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
-	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
-
-	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
-	VPTEST	%YMM4, %YMM4, %k1
-	kmovd	%k1, %eax
+	vpxorq	-(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1)
+	/* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with VEC(2) while
+	   oring with VEC(1). Result is stored in VEC(1).  */
+	vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2)
+
+	cmpl	$(VEC_SIZE * 6), %edx
+	jbe	L(4x_last_2x_vec)
+
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3)
+	vpxorq	-(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3)
+	/* Or together VEC(1), VEC(2), and VEC(3) into VEC(3).  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4)
+	vpxorq	-(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4)
+
+	/* Or together VEC(4), VEC(3), and VEC(2) into VEC(2).  */
+	vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2)
+
+	/* Compare VEC(2) with 0. If any 1s, s1 and s2 don't match.  */
+L(4x_last_2x_vec):
+	VPTEST	%VMM(2), %VMM(2), %k1
+	KMOV	%k1, %VRAX
+	TO_32BIT (VRAX)
 	ret
 
-	.p2align 4
+
+	.p2align 4,, 10
 L(more_8x_vec):
 	/* Set end of s1 in rdx.  */
 	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
@@ -175,67 +217,80 @@ L(more_8x_vec):
 	andq	$-VEC_SIZE, %rdi
 	/* Adjust because first 4x vec were checked already.  */
 	subq	$-(VEC_SIZE * 4), %rdi
-	.p2align 4
+	.p2align 5,, 12
+	.p2align 4,, 8
 L(loop_4x_vec):
-	VMOVU	(%rsi, %rdi), %YMM1
-	vpxorq	(%rdi), %YMM1, %YMM1
+	VMOVU	(%rsi, %rdi), %VMM(1)
+	vpxorq	(%rdi), %VMM(1), %VMM(1)
 
-	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
-	vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2
+	VMOVU	VEC_SIZE(%rsi, %rdi), %VMM(2)
+	vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2)
 
-	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
-	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
+	vpxorq	(VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
 
-	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
-	vpxorq	(VEC_SIZE * 3)(%rdi), %YMM4, %YMM4
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
+	vpxorq	(VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4)
 
-	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
-	VPTEST	%YMM4, %YMM4, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
+	VPTEST	%VMM(4), %VMM(4), %k1
+	KMOV	%k1, %VRAX
+	TEST_ZERO (rax)
 	jnz	L(return_neq2)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpq	%rdx, %rdi
 	jb	L(loop_4x_vec)
 
 	subq	%rdx, %rdi
-	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
-	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
+
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
+	vpxorq	(VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4)
 	/* rdi has 4 * VEC_SIZE - remaining length.  */
-	cmpl	$(VEC_SIZE * 3), %edi
-	jae	L(8x_last_1x_vec)
+
 	/* Load regardless of branch.  */
-	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
-	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
-	   oring with YMM4. Result is stored in YMM4.  */
-	vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
+	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while
+	   oring with VEC(4). Result is stored in VEC(4).  */
+	vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)
+
+	/* Separate logic as we can only use testb for VEC_SIZE == 64.
+	 */
+# if VEC_SIZE == 64
+	testb	%dil, %dil
+	js	L(8x_last_2x_vec)
+# else
 	cmpl	$(VEC_SIZE * 2), %edi
-	jae	L(8x_last_2x_vec)
+	jge	L(8x_last_2x_vec)
+# endif
 
-	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
-	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+	VMOVU	VEC_SIZE(%rsi, %rdx), %VMM(2)
+	vpxorq	VEC_SIZE(%rdx), %VMM(2), %VMM(2)
 
-	VMOVU	(%rsi, %rdx), %YMM1
-	vpxorq	(%rdx), %YMM1, %YMM1
+	VMOVU	(%rsi, %rdx), %VMM(1)
+	vpxorq	(%rdx), %VMM(1), %VMM(1)
 
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
+	vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4)
 L(8x_last_1x_vec):
 L(8x_last_2x_vec):
-	VPTEST	%YMM4, %YMM4, %k1
-	kmovd	%k1, %eax
+	VPTEST	%VMM(4), %VMM(4), %k1
+	KMOV	%k1, %VRAX
+	TO_32BIT_P1 (rax)
 L(return_neq2):
+	TO_32BIT_P2 (rax)
 	ret
 
-	.p2align 4,, 8
+	.p2align 4,, 4
 L(last_2x_vec):
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
-	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
-	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
-	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
-	VPTEST	%YMM2, %YMM2, %k1
-	kmovd	%k1, %eax
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1)
+	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1)
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2)
+	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2)
+	VPTEST	%VMM(2), %VMM(2), %k1
+	KMOV	%k1, %VRAX
+	TO_32BIT (VRAX)
 	ret
 
-    /* 1 Bytes from next cache line. */
+	/* evex256: 1 byte from next cache line. evex512: 15 bytes from
+	   next cache line.  */
 END (MEMCMPEQ)
 #endif
-- 
2.34.1



* Re: [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests
  2022-10-29 20:19 [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Noah Goldstein
  2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein
  2022-10-29 20:19 ` [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S " Noah Goldstein
@ 2022-10-31 13:19 ` Siddhesh Poyarekar
  2 siblings, 0 replies; 7+ messages in thread
From: Siddhesh Poyarekar @ 2022-10-31 13:19 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

LGTM.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

On 2022-10-29 16:19, Noah Goldstein via Libc-alpha wrote:
> [full patch quoted; snipped]


* Re: [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes
  2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein
@ 2022-10-31 15:47   ` H.J. Lu
  0 siblings, 0 replies; 7+ messages in thread
From: H.J. Lu @ 2022-10-31 15:47 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Sat, Oct 29, 2022 at 1:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> [full patch quoted; snipped]
>
LGTM.

Thanks.

-- 
H.J.


* Re: [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S and minor changes
  2022-10-29 20:19 ` [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S " Noah Goldstein
@ 2022-10-31 15:48   ` H.J. Lu
  2022-10-31 16:42     ` Noah Goldstein
  0 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2022-10-31 15:48 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Sat, Oct 29, 2022 at 1:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Changes to generated code are:
>     1. In a few places use `vpcmpeqb` instead of `vpcmpneq` to save a
>        byte of code size.
>     2. Add a branch for length <= (VEC_SIZE * 6) as opposed to doing
>        the entire block of [VEC_SIZE * 4 + 1, VEC_SIZE * 8] in a
>        single basic-block (the space to add the extra branch without
>        changing code size is bought with the above change).
>
> Change (2) has roughly a 20-25% speedup for sizes in [VEC_SIZE * 4 +
> 1, VEC_SIZE * 6] and negligible to no cost for [VEC_SIZE * 6 + 1,
> VEC_SIZE * 8].
>
> From N=10 runs on Tigerlake:
>
> align1,align2 ,length ,result               ,New Time    ,Cur Time,New Time / Old Time
> 0     ,0      ,129    ,0                    ,5.404       ,6.887   ,0.785
> 0     ,0      ,129    ,1                    ,5.308       ,6.826   ,0.778
> 0     ,0      ,129    ,18446744073709551615 ,5.359       ,6.823   ,0.785
> 0     ,0      ,161    ,0                    ,5.284       ,6.827   ,0.774
> 0     ,0      ,161    ,1                    ,5.317       ,6.745   ,0.788
> 0     ,0      ,161    ,18446744073709551615 ,5.406       ,6.778   ,0.798
>
> 0     ,0      ,193    ,0                    ,6.969       ,6.832   ,1.000
> 0     ,0      ,193    ,1                    ,6.943       ,6.748   ,1.029
> 0     ,0      ,193    ,18446744073709551615 ,6.997       ,6.728   ,1.011
> 0     ,0      ,225    ,0                    ,7.144       ,6.746   ,0.989
> 0     ,0      ,225    ,1                    ,7.218       ,6.683   ,1.003
> 0     ,0      ,225    ,18446744073709551615 ,6.864       ,6.767   ,0.992
> 0     ,0      ,256    ,0                    ,5.423       ,5.482   ,0.989
> 0     ,0      ,256    ,1                    ,5.348       ,5.465   ,0.978
> 0     ,0      ,256    ,18446744073709551615 ,5.321       ,5.518   ,0.964
>
> Rewriting with VMM API allows for memcmpeq-evex to be used with
> evex512 by including "x86-evex512-vecs.h" at the top.
>
> Complete check passes on x86-64.
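
For readers skimming the commit message, here is a minimal C sketch of what
change (2) does for the remainder path.  It is an illustrative model only,
not the patch's assembly; the function name and the `vec' parameter are
invented for the example.

#include <stddef.h>
#include <string.h>

/* Sketch of the [4 * VEC + 1, 8 * VEC] remainder after change (2): the
   last two vectors are always folded in, and only lengths above 6 * VEC
   also fold in the 3rd and 4th vectors from the end.  */
static int
tail_4x_to_8x (const unsigned char *s1, const unsigned char *s2,
               size_t len, size_t vec)
{
  int neq = memcmp (s1 + len - 2 * vec, s2 + len - 2 * vec, 2 * vec) != 0;
  if (len <= 6 * vec)           /* The new early exit.  */
    return neq;
  return neq | (memcmp (s1 + len - 4 * vec, s2 + len - 4 * vec,
                        2 * vec) != 0);
}
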
> ---
>  sysdeps/x86_64/multiarch/memcmpeq-evex.S | 255 ++++++++++++++---------
>  1 file changed, 155 insertions(+), 100 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> index 41124ef1d3..671d19393e 100644
> --- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> +++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> @@ -41,24 +41,53 @@
>  #  define MEMCMPEQ     __memcmpeq_evex
>  # endif
>
> +# ifndef VEC_SIZE
> +#  include "x86-evex512-vecs.h"
> +# endif
> +# include "reg-macros.h"
> +
> +
> +# if VEC_SIZE == 32
> +
> +#  define TEST_ZERO_VCMP(reg)  inc %VGPR(reg)
> +#  define TEST_ZERO(reg)       test %VGPR(reg), %VGPR(reg)
> +
> +#  define TO_32BIT_P1(reg)     /* Do nothing. */
> +#  define TO_32BIT_P2(reg)     /* Do nothing. */
> +#  define TO_32BIT(reg)        /* Do nothing. */
> +
> +#  define VEC_CMP      VPCMPEQ
> +
> +# elif VEC_SIZE == 64
> +
> +#  define TEST_ZERO_VCMP(reg)  TEST_ZERO(reg)
> +#  define TEST_ZERO(reg)       neg %VGPR(reg)
> +
> +
> +       /* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit
> +          int. We have two methods for this. If the mask was branched
> +          on, we use `neg` for the branch then `sbb` to get the 32-bit
> +          return. If the mask was not branched on, we just use
> +          `popcntq`.  */
> +#  define TO_32BIT_P1(reg)     TEST_ZERO(reg)
> +#  define TO_32BIT_P2(reg)     sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32)
> +#  define TO_32BIT(reg)        popcntq %reg, %reg
> +
> +#  define VEC_CMP      VPCMPNEQ
> +
> +# else
> +#  error "Unsupported VEC_SIZE"
> +# endif
> +
> +
>  # define VMOVU_MASK    vmovdqu8
> -# define VMOVU vmovdqu64
> -# define VPCMP vpcmpub
> +# define VPCMPNEQ      vpcmpneqb
> +# define VPCMPEQ       vpcmpeqb
>  # define VPTEST        vptestmb
>
> -# define VEC_SIZE      32
>  # define PAGE_SIZE     4096
>
> -# define YMM0          ymm16
> -# define YMM1          ymm17
> -# define YMM2          ymm18
> -# define YMM3          ymm19
> -# define YMM4          ymm20
> -# define YMM5          ymm21
> -# define YMM6          ymm22
> -
> -
> -       .section .text.evex, "ax", @progbits
> +       .section SECTION(.text), "ax", @progbits
>  ENTRY_P2ALIGN (MEMCMPEQ, 6)
>  # ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
> @@ -69,47 +98,54 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
>         ja      L(more_1x_vec)
>
>         /* Create mask of bytes that are guranteed to be valid because
> -          of length (edx). Using masked movs allows us to skip checks for
> -          page crosses/zero size.  */
> -       movl    $-1, %ecx
> -       bzhil   %edx, %ecx, %ecx
> -       kmovd   %ecx, %k2
> +          of length (edx). Using masked movs allows us to skip checks
> +          for page crosses/zero size.  */
> +       mov     $-1, %VRAX
> +       bzhi    %VRDX, %VRAX, %VRAX
> +       /* NB: A `jz` might be useful here. Page-faults that are
> +          invalidated by predicate execution (the evex mask) can be
> +          very slow.  The expectation is this is not the norm, so
> +          "most" code will not regularly call 'memcmp' with length = 0
> +          and memory that is not wired up.  */
> +       KMOV    %VRAX, %k2
>
>         /* Use masked loads as VEC_SIZE could page cross where length
>            (edx) would not.  */
> -       VMOVU_MASK (%rsi), %YMM2{%k2}
> -       VPCMP   $4,(%rdi), %YMM2, %k1{%k2}
> -       kmovd   %k1, %eax
> +       VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
> +       VPCMPNEQ (%rdi), %VMM(2), %k1{%k2}
> +       KMOV    %k1, %VRAX
> +       TO_32BIT (VRAX)
>         ret
>
> -
> +       .p2align 4,, 3
>  L(last_1x_vec):
> -       VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
> -       VPCMP   $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
> -       kmovd   %k1, %eax
> +       VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
> +       VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       TO_32BIT_P1 (rax)
>  L(return_neq0):
> +       TO_32BIT_P2 (rax)
>         ret
>
>
> -
> -       .p2align 4
> +       .p2align 4,, 12
>  L(more_1x_vec):
>         /* From VEC + 1 to 2 * VEC.  */
> -       VMOVU   (%rsi), %YMM1
> +       VMOVU   (%rsi), %VMM(1)
>         /* Use compare not equals to directly check for mismatch.  */
> -       VPCMP   $4,(%rdi), %YMM1, %k1
> -       kmovd   %k1, %eax
> -       testl   %eax, %eax
> +       VPCMPNEQ (%rdi), %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       TEST_ZERO (rax)
>         jnz     L(return_neq0)
>
>         cmpq    $(VEC_SIZE * 2), %rdx
>         jbe     L(last_1x_vec)
>
>         /* Check second VEC no matter what.  */
> -       VMOVU   VEC_SIZE(%rsi), %YMM2
> -       VPCMP   $4, VEC_SIZE(%rdi), %YMM2, %k1
> -       kmovd   %k1, %eax
> -       testl   %eax, %eax
> +       VMOVU   VEC_SIZE(%rsi), %VMM(2)
> +       VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1
> +       KMOV    %k1, %VRAX
> +       TEST_ZERO (rax)
>         jnz     L(return_neq0)
>
>         /* Less than 4 * VEC.  */
> @@ -117,16 +153,16 @@ L(more_1x_vec):
>         jbe     L(last_2x_vec)
>
>         /* Check third and fourth VEC no matter what.  */
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM3
> -       VPCMP   $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
> -       kmovd   %k1, %eax
> -       testl   %eax, %eax
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(3)
> +       VEC_CMP (VEC_SIZE * 2)(%rdi), %VMM(3), %k1
> +       KMOV    %k1, %VRAX
> +       TEST_ZERO_VCMP (rax)
>         jnz     L(return_neq0)
>
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM4
> -       VPCMP   $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
> -       kmovd   %k1, %eax
> -       testl   %eax, %eax
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(4)
> +       VEC_CMP (VEC_SIZE * 3)(%rdi), %VMM(4), %k1
> +       KMOV    %k1, %VRAX
> +       TEST_ZERO_VCMP (rax)
>         jnz     L(return_neq0)
>
>         /* Go to 4x VEC loop.  */
> @@ -136,8 +172,8 @@ L(more_1x_vec):
>         /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
>            branches.  */
>
> -       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
> -       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
> +       VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
>         addq    %rdx, %rdi
>
>         /* Wait to load from s1 until addressed adjust due to
> @@ -145,26 +181,32 @@ L(more_1x_vec):
>
>         /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
>            will have some 1s.  */
> -       vpxorq  -(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
> -       /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
> -          oring with YMM1. Result is stored in YMM1.  */
> -       vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2
> -
> -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
> -       vpxorq  -(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> -       /* Or together YMM1, YMM2, and YMM3 into YMM3.  */
> -       VMOVU   -(VEC_SIZE)(%rsi, %rdx), %YMM4
> -       vpxorq  -(VEC_SIZE)(%rdi), %YMM4, %YMM4
> -
> -       /* Or together YMM2, YMM3, and YMM4 into YMM4.  */
> -       vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> -
> -       /* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
> -       VPTEST  %YMM4, %YMM4, %k1
> -       kmovd   %k1, %eax
> +       vpxorq  -(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1)
> +       /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with VEC(2) while
> +          oring with VEC(1). Result is stored in VEC(1).  */
> +       vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2)
> +
> +       cmpl    $(VEC_SIZE * 6), %edx
> +       jbe     L(4x_last_2x_vec)
> +
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3)
> +       vpxorq  -(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3)
> +       /* Or together VEC(1), VEC(2), and VEC(3) into VEC(3).  */
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4)
> +       vpxorq  -(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4)
> +
> +       /* Or together VEC(4), VEC(3), and VEC(2) into VEC(2).  */
> +       vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2)
> +
> +       /* Compare VEC(4) with 0. If any 1s s1 and s2 don't match.  */
> +L(4x_last_2x_vec):
> +       VPTEST  %VMM(2), %VMM(2), %k1
> +       KMOV    %k1, %VRAX
> +       TO_32BIT (VRAX)
>         ret
>
> -       .p2align 4
> +
> +       .p2align 4,, 10
>  L(more_8x_vec):
>         /* Set end of s1 in rdx.  */
>         leaq    -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
> @@ -175,67 +217,80 @@ L(more_8x_vec):
>         andq    $-VEC_SIZE, %rdi
>         /* Adjust because first 4x vec where check already.  */
>         subq    $-(VEC_SIZE * 4), %rdi
> -       .p2align 4
> +       .p2align 5,, 12
> +       .p2align 4,, 8
>  L(loop_4x_vec):
> -       VMOVU   (%rsi, %rdi), %YMM1
> -       vpxorq  (%rdi), %YMM1, %YMM1
> +       VMOVU   (%rsi, %rdi), %VMM(1)
> +       vpxorq  (%rdi), %VMM(1), %VMM(1)
>
> -       VMOVU   VEC_SIZE(%rsi, %rdi), %YMM2
> -       vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2
> +       VMOVU   VEC_SIZE(%rsi, %rdi), %VMM(2)
> +       vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2)
>
> -       VMOVU   (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
> -       vpxorq  (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
> +       vpxorq  (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
>
> -       VMOVU   (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
> -       vpxorq  (VEC_SIZE * 3)(%rdi), %YMM4, %YMM4
> +       VMOVU   (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
> +       vpxorq  (VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4)
>
> -       vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> -       VPTEST  %YMM4, %YMM4, %k1
> -       kmovd   %k1, %eax
> -       testl   %eax, %eax
> +       vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
> +       VPTEST  %VMM(4), %VMM(4), %k1
> +       KMOV    %k1, %VRAX
> +       TEST_ZERO (rax)
>         jnz     L(return_neq2)
>         subq    $-(VEC_SIZE * 4), %rdi
>         cmpq    %rdx, %rdi
>         jb      L(loop_4x_vec)
>
>         subq    %rdx, %rdi
> -       VMOVU   (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
> -       vpxorq  (VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
> +
> +       VMOVU   (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
> +       vpxorq  (VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4)
>         /* rdi has 4 * VEC_SIZE - remaining length.  */
> -       cmpl    $(VEC_SIZE * 3), %edi
> -       jae     L(8x_last_1x_vec)
> +
>         /* Load regardless of branch.  */
> -       VMOVU   (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
> -       /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
> -          oring with YMM4. Result is stored in YMM4.  */
> -       vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
> +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
> +       /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while
> +          oring with VEC(4). Result is stored in VEC(4).  */
> +       vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)
> +
> +       /* Separate logic as we can only use testb for VEC_SIZE == 64.
> +        */
> +# if VEC_SIZE == 64
> +       testb   %dil, %dil
> +       js      L(8x_last_2x_vec)
> +# else
>         cmpl    $(VEC_SIZE * 2), %edi
> -       jae     L(8x_last_2x_vec)
> +       jge     L(8x_last_2x_vec)
> +# endif
>
> -       VMOVU   VEC_SIZE(%rsi, %rdx), %YMM2
> -       vpxorq  VEC_SIZE(%rdx), %YMM2, %YMM2
> +       VMOVU   VEC_SIZE(%rsi, %rdx), %VMM(2)
> +       vpxorq  VEC_SIZE(%rdx), %VMM(2), %VMM(2)
>
> -       VMOVU   (%rsi, %rdx), %YMM1
> -       vpxorq  (%rdx), %YMM1, %YMM1
> +       VMOVU   (%rsi, %rdx), %VMM(1)
> +       vpxorq  (%rdx), %VMM(1), %VMM(1)
>
> -       vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
> +       vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4)
>  L(8x_last_1x_vec):
>  L(8x_last_2x_vec):
> -       VPTEST  %YMM4, %YMM4, %k1
> -       kmovd   %k1, %eax
> +       VPTEST  %VMM(4), %VMM(4), %k1
> +       KMOV    %k1, %VRAX
> +       TO_32BIT_P1 (rax)
>  L(return_neq2):
> +       TO_32BIT_P2 (rax)
>         ret
>
> -       .p2align 4,, 8
> +       .p2align 4,, 4
>  L(last_2x_vec):
> -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
> -       vpxorq  -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
> -       VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
> -       vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
> -       VPTEST  %YMM2, %YMM2, %k1
> -       kmovd   %k1, %eax
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1)
> +       vpxorq  -(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1)
> +       VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2)
> +       vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2)
> +       VPTEST  %VMM(2), %VMM(2), %k1
> +       KMOV    %k1, %VRAX
> +       TO_32BIT (VRAX)
>         ret
>
> -    /* 1 Bytes from next cache line. */
> +       /* evex256: 1 byte from next cache line. evex512: 15 bytes from
> +          next cache line.  */
>  END (MEMCMPEQ)
>  #endif
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.
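
A few building blocks in the patch above are easy to miss when reading the
diff, so here is a rough C model of them -- a sketch under stated
assumptions, not glibc code, and all helper names are invented.
valid_bytes_mask models the `mov $-1; bzhi' pair that builds the k-mask for
the length <= VEC_SIZE masked-load path.  vpternlog_bitwise models how a
vpternlogd immediate evaluates: for each bit position, the result bit is
bit ((a_bit << 2) | (b_bit << 1) | c_bit) of imm8, with a the destination
register, b the second source and c the third (memory) source, so 0xfe is
a | b | c, 0xde is (a ^ c) | b and 0xf6 is (b ^ c) | a.  The to_32bit_*
helpers model what the VEC_SIZE == 64 TO_32BIT_* macros compute when
reducing the 64-bit k-mask to the 32-bit zero/nonzero return value.

#include <stdint.h>

/* Mask with the low LEN bits set, as `mov $-1, %rax; bzhi %rdx, %rax,
   %rax' produces for the masked-load path (LEN is at most VEC_SIZE
   there).  */
static uint64_t
valid_bytes_mask (unsigned int len)
{
  return len >= 64 ? UINT64_MAX : (UINT64_C (1) << len) - 1;
}

/* Per-bit evaluation of a vpternlogd immediate.  */
static uint64_t
vpternlog_bitwise (uint8_t imm, uint64_t a, uint64_t b, uint64_t c)
{
  uint64_t r = 0;
  for (int i = 0; i < 64; i++)
    {
      unsigned int idx = (((a >> i) & 1) << 2) | (((b >> i) & 1) << 1)
                         | ((c >> i) & 1);
      r |= (uint64_t) ((imm >> idx) & 1) << i;
    }
  return r;
}

/* Branched path: `neg mask' sets CF iff mask != 0, and the following
   `sbb r32, r32' materializes CF as 0 or -1.  */
static uint32_t
to_32bit_branched (uint64_t mask)
{
  return mask != 0 ? UINT32_MAX : 0;
}

/* Non-branched path: `popcntq' yields a count that is nonzero exactly
   when the mask is nonzero.  */
static uint32_t
to_32bit_popcnt (uint64_t mask)
{
  return (uint32_t) __builtin_popcountll (mask);
}

With this model, vpternlog_bitwise (0xfe, a, b, c) is simply a | b | c,
which is why a single vpternlogd can fold three xor results ahead of the
final vptestmb.
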

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S and minor changes
  2022-10-31 15:48   ` H.J. Lu
@ 2022-10-31 16:42     ` Noah Goldstein
  0 siblings, 0 replies; 7+ messages in thread
From: Noah Goldstein @ 2022-10-31 16:42 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Mon, Oct 31, 2022 at 10:48 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Oct 29, 2022 at 1:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Changes to generated code are:
> >     1. In a few places use `vpcmpeqb` instead of `vpcmpneq` to save a
> >        byte of code size.
> >     2. Add a branch for length <= (VEC_SIZE * 6) as opposed to doing
> >        the entire block of [VEC_SIZE * 4 + 1, VEC_SIZE * 8] in a
> >        single basic-block (the space to add the extra branch without
> >        changing code size is bought with the above change).
> >
> > Change (2) has roughly a 20-25% speedup for sizes in [VEC_SIZE * 4 +
> > 1, VEC_SIZE * 6] and negligible to no-cost for [VEC_SIZE * 6 + 1,
> > VEC_SIZE * 8]
> >
> > From N=10 runs on Tigerlake:
> >
> > align1,align2 ,length ,result               ,New Time    ,Cur Time,New Time / Old Time
> > 0     ,0      ,129    ,0                    ,5.404       ,6.887   ,0.785
> > 0     ,0      ,129    ,1                    ,5.308       ,6.826   ,0.778
> > 0     ,0      ,129    ,18446744073709551615 ,5.359       ,6.823   ,0.785
> > 0     ,0      ,161    ,0                    ,5.284       ,6.827   ,0.774
> > 0     ,0      ,161    ,1                    ,5.317       ,6.745   ,0.788
> > 0     ,0      ,161    ,18446744073709551615 ,5.406       ,6.778   ,0.798
> >
> > 0     ,0      ,193    ,0                    ,6.969       ,6.832   ,1.000
> > 0     ,0      ,193    ,1                    ,6.943       ,6.748   ,1.029
> > 0     ,0      ,193    ,18446744073709551615 ,6.997       ,6.728   ,1.011
> > 0     ,0      ,225    ,0                    ,7.144       ,6.746   ,0.989
> > 0     ,0      ,225    ,1                    ,7.218       ,6.683   ,1.003
> > 0     ,0      ,225    ,18446744073709551615 ,6.864       ,6.767   ,0.992
> > 0     ,0      ,256    ,0                    ,5.423       ,5.482   ,0.989
> > 0     ,0      ,256    ,1                    ,5.348       ,5.465   ,0.978
> > 0     ,0      ,256    ,18446744073709551615 ,5.321       ,5.518   ,0.964
> >
> > Rewriting with VMM API allows for memcmpeq-evex to be used with
> > evex512 by including "x86-evex512-vecs.h" at the top.
> >
> > Complete check passes on x86-64.
> > ---
> >  sysdeps/x86_64/multiarch/memcmpeq-evex.S | 255 ++++++++++++++---------
> >  1 file changed, 155 insertions(+), 100 deletions(-)
> >
> > [...]
>
> LGTM.
>
> Thanks.
>
> --
> H.J.

Err, I realized I had messed up my extraction script when consolidating these
numbers (the ratios are correct; I had pulled in the wrong fields when
grabbing the actual times).

Correct numbers are:
align1,align2 ,length ,result               ,memcmpeq-v2 ,current ,memcmpeq-v2/current
0     ,0      ,129    ,0                    ,5.404       ,6.887   ,0.785
0     ,0      ,129    ,1                    ,5.308       ,6.826   ,0.778
0     ,0      ,129    ,18446744073709551615 ,5.359       ,6.823   ,0.785
0     ,0      ,161    ,0                    ,5.284       ,6.827   ,0.774
0     ,0      ,161    ,1                    ,5.317       ,6.745   ,0.788
0     ,0      ,161    ,18446744073709551615 ,5.406       ,6.778   ,0.798

0     ,0      ,193    ,0                    ,6.804       ,6.802   ,1.000
0     ,0      ,193    ,1                    ,6.950       ,6.754   ,1.029
0     ,0      ,193    ,18446744073709551615 ,6.792       ,6.719   ,1.011
0     ,0      ,225    ,0                    ,6.625       ,6.699   ,0.989
0     ,0      ,225    ,1                    ,6.776       ,6.735   ,1.003
0     ,0      ,225    ,18446744073709551615 ,6.758       ,6.738   ,0.992
0     ,0      ,256    ,0                    ,5.402       ,5.462   ,0.989
0     ,0      ,256    ,1                    ,5.364       ,5.483   ,0.978
0     ,0      ,256    ,18446744073709551615 ,5.341       ,5.539   ,0.964

Will update the commit message before pushing.
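
(A quick spot check of the corrected rows: 6.804 / 6.802 ≈ 1.000 and
5.402 / 5.462 ≈ 0.989, matching the ratio column, so the ratios are
indeed unchanged.)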

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-10-31 16:42 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-29 20:19 [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Noah Goldstein
2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein
2022-10-31 15:47   ` H.J. Lu
2022-10-29 20:19 ` [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S " Noah Goldstein
2022-10-31 15:48   ` H.J. Lu
2022-10-31 16:42     ` Noah Goldstein
2022-10-31 13:19 ` [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Siddhesh Poyarekar
