public inbox for libc-alpha@sourceware.org
* x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
@ 2023-11-01 22:16 Noah Goldstein
  2023-11-07 17:51 ` Noah Goldstein
                   ` (5 more replies)
  0 siblings, 6 replies; 21+ messages in thread
From: Noah Goldstein @ 2023-11-01 22:16 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
was missing the CPU_FEATURE checks for VBMI2 in the
ifunc/ifunc-impl-list.

The fix is either to add those checks or change the logic to not use
`vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
implementation is usable on SKX.

The new implementation is a bit slower, but this is in a cold path so it's
probably okay.
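
For readers who don't have the EVEX idioms paged in, the reworked page-cross
path boils down to: load the last VEC_SIZE-aligned block of the current page
(always safe to read), shift the zero-CHAR and match bitmasks right by the
misalignment so lanes before the start of the string drop out, mask off
matches past the first NUL, and take the highest remaining bit. Below is a
rough C model of the byte (strrchr) variant only; `match_mask` and
`page_cross` are illustrative names, not the real macros, and the deliberate
over-read before `s` is only legitimate at the asm level because the aligned
load cannot cross a page.

```c
#include <stdint.h>
#include <stddef.h>

#define VEC_SIZE 32

/* Bit i of the result is set when block[i] == c.  Stands in for
   VPTESTN (c == 0) or VPCMPEQ plus KMOV on a 32-byte vector.  */
static uint32_t
match_mask (const unsigned char *block, unsigned char c)
{
  uint32_t m = 0;
  for (int i = 0; i < VEC_SIZE; i++)
    if (block[i] == c)
      m |= (uint32_t) 1 << i;
  return m;
}

/* Page-cross path: read the VEC_SIZE-aligned block containing s (the
   last aligned block of the page in the cross case), then shift out
   lanes that lie before s.  Returns 1 and stores the answer in *ret
   when the string ends inside this block, 0 when the caller should
   fall through to the normal path.  */
static int
page_cross (const char *s, unsigned char c, char **ret)
{
  const unsigned char *aligned
    = (const unsigned char *) ((uintptr_t) s & ~(uintptr_t) (VEC_SIZE - 1));
  unsigned int shift = (uintptr_t) s - (uintptr_t) aligned;

  uint32_t zeros = match_mask (aligned, 0) >> shift;     /* shrx */
  if (zeros == 0)
    return 0;                      /* no NUL here: take the main path */

  uint32_t matches = match_mask (aligned, c) >> shift;   /* shrx */
  matches &= zeros ^ (zeros - 1);  /* blsmsk: bits up to the first NUL */
  if (matches == 0)
    {
      *ret = NULL;
      return 1;
    }
  *ret = (char *) s + (31 - __builtin_clz (matches));    /* bsr */
  return 1;
}
```

In the real code the `zeros == 0` case branches back to
L(page_cross_continue), and the VEC_SIZE == 64 build jumps to the shared
L(page_cross_return) logic instead of returning inline.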
---
 sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
index cd6a0a870a..b174d43208 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
@@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jg	L(cross_page_boundary)
-
+L(page_cross_continue):
 	VMOVU	(%rdi), %VMM(1)
 	/* k0 has a 1 for each zero CHAR in YMM1.  */
 	VPTESTN	%VMM(1), %VMM(1), %k0
@@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	test	%VGPR(rsi), %VGPR(rsi)
 	jz	L(aligned_more)
 	/* fallthrough: zero CHAR in first VEC.  */
-L(page_cross_return):
+
 	/* K1 has a 1 for each search CHAR match in VEC(1).  */
 	VPCMPEQ	%VMATCH, %VMM(1), %k1
 	KMOV	%k1, %VGPR(rax)
+L(page_cross_return):
 	/* Build mask up until first zero CHAR (used to mask of
 	   potential search CHAR matches past the end of the string).  */
 	blsmsk	%VGPR(rsi), %VGPR(rsi)
@@ -167,7 +168,6 @@ L(first_vec_x1_return):
 
 	.p2align 4,, 12
 L(aligned_more):
-L(page_cross_continue):
 	/* Need to keep original pointer incase VEC(1) has last match.  */
 	movq	%rdi, %rsi
 	andq	$-VEC_SIZE, %rdi
@@ -340,34 +340,56 @@ L(return_new_match_ret):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 4
 L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer will all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guaranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
 	xorq	%rdi, %rax
-	mov	$-1, %VRDX
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
-	VPTESTN	%VMM(6), %VMM(6), %k0
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
 	KMOV	%k0, %VRSI
 
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
 	movl	%edi, %ecx
-	and	$(VEC_SIZE - 1), %ecx
+	andl	$(VEC_SIZE - 1), %ecx
 	shrl	$2, %ecx
 # endif
-	shlx	%SHIFT_REG, %VRDX, %VRDX
+	shrx	%SHIFT_REG, %VRSI, %VRSI
 
-# ifdef USE_AS_WCSRCHR
-	kmovw	%edx, %k1
+	test	%VRSI, %VRSI
+	jz	L(page_cross_continue)
+
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrx	%SHIFT_REG, %VRAX, %VRAX
+	/* For VEC_SIZE == 64 we are just at the end of a cacheline here
+	   so to save code-size just re-use return logic for first VEC.
+	   This is relatively cold code (page cross).  */
+# if VEC_SIZE == 64
+	jmp	L(page_cross_return)
+    /* 6 bytes from cache-line.  */
 # else
-	KMOV	%VRDX, %k1
+	/* Check if any search CHAR match in range.  */
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
+	jz	L(ret2)
+	bsr	%VRAX, %VRAX
+#  ifdef USE_AS_WCSRCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	addq	%rdi, %rax
+#  endif
+L(ret2):
+	ret
+	/* 3 bytes from cache-line.  */
 # endif
-
-	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
-	/* We could technically just jmp back after the vpcompress but
-	   it doesn't save any 16-byte blocks.  */
-	shrx	%SHIFT_REG, %VRSI, %VRSI
-	test	%VRSI, %VRSI
-	jnz	L(page_cross_return)
-	jmp	L(page_cross_continue)
-	/* 1-byte from cache line.  */
 END(STRRCHR)
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-01 22:16 x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S Noah Goldstein
@ 2023-11-07 17:51 ` Noah Goldstein
  2023-11-07 18:25   ` Florian Weimer
  2023-11-08 19:04 ` Sunil Pandey
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2023-11-07 17:51 UTC (permalink / raw)
  To: libc-alpha; +Cc: hjl.tools, carlos

On Wed, Nov 1, 2023 at 5:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> was missing the CPU_FEATURE checks for VBMI2 in the
> ifunc/ifunc-impl-list.
>
> The fix is either to add those checks or change the logic to not use
> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> implementation is usable on SKX.
>
> New implementation is a bit slower, but this is in a cold path so its
> probably okay.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
>  1 file changed, 43 insertions(+), 21 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> index cd6a0a870a..b174d43208 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         jg      L(cross_page_boundary)
> -
> +L(page_cross_continue):
>         VMOVU   (%rdi), %VMM(1)
>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>         VPTESTN %VMM(1), %VMM(1), %k0
> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         test    %VGPR(rsi), %VGPR(rsi)
>         jz      L(aligned_more)
>         /* fallthrough: zero CHAR in first VEC.  */
> -L(page_cross_return):
> +
>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>         VPCMPEQ %VMATCH, %VMM(1), %k1
>         KMOV    %k1, %VGPR(rax)
> +L(page_cross_return):
>         /* Build mask up until first zero CHAR (used to mask of
>            potential search CHAR matches past the end of the string).  */
>         blsmsk  %VGPR(rsi), %VGPR(rsi)
> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
>
>         .p2align 4,, 12
>  L(aligned_more):
> -L(page_cross_continue):
>         /* Need to keep original pointer incase VEC(1) has last match.  */
>         movq    %rdi, %rsi
>         andq    $-VEC_SIZE, %rdi
> @@ -340,34 +340,56 @@ L(return_new_match_ret):
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 4
>  L(cross_page_boundary):
> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> +          rax` sets pointer will all page offset bits cleared so
> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> +          before page cross (guaranteed to be safe to read). Doing this
> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> +          a bit of code size.  */
>         xorq    %rdi, %rax
> -       mov     $-1, %VRDX
> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> -       VPTESTN %VMM(6), %VMM(6), %k0
> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
>         KMOV    %k0, %VRSI
>
> +       /* Shift out zero CHAR matches that are before the beginning of
> +          src (rdi).  */
>  # ifdef USE_AS_WCSRCHR
>         movl    %edi, %ecx
> -       and     $(VEC_SIZE - 1), %ecx
> +       andl    $(VEC_SIZE - 1), %ecx
>         shrl    $2, %ecx
>  # endif
> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>
> -# ifdef USE_AS_WCSRCHR
> -       kmovw   %edx, %k1
> +       test    %VRSI, %VRSI
> +       jz      L(page_cross_continue)
> +
> +       /* Found zero CHAR so need to test for search CHAR.  */
> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       /* Shift out search CHAR matches that are before the beginning of
> +          src (rdi).  */
> +       shrx    %SHIFT_REG, %VRAX, %VRAX
> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
> +          so to save code-size just re-use return logic for first VEC.
> +          This is relatively cold code (page cross).  */
> +# if VEC_SIZE == 64
> +       jmp     L(page_cross_return)
> +    /* 6 bytes from cache-line.  */
>  # else
> -       KMOV    %VRDX, %k1
> +       /* Check if any search CHAR match in range.  */
> +       blsmsk  %VRSI, %VRSI
> +       and     %VRSI, %VRAX
> +       jz      L(ret2)
> +       bsr     %VRAX, %VRAX
> +#  ifdef USE_AS_WCSRCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       addq    %rdi, %rax
> +#  endif
> +L(ret2):
> +       ret
> +       /* 3 bytes from cache-line.  */
>  # endif
> -
> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> -       /* We could technically just jmp back after the vpcompress but
> -          it doesn't save any 16-byte blocks.  */
> -       shrx    %SHIFT_REG, %VRSI, %VRSI
> -       test    %VRSI, %VRSI
> -       jnz     L(page_cross_return)
> -       jmp     L(page_cross_continue)
> -       /* 1-byte from cache line.  */
>  END(STRRCHR)
>  #endif
> --
> 2.34.1
>
ping

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-07 17:51 ` Noah Goldstein
@ 2023-11-07 18:25   ` Florian Weimer
  2023-11-08  5:22     ` Sunil Pandey
  0 siblings, 1 reply; 21+ messages in thread
From: Florian Weimer @ 2023-11-07 18:25 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, Sunil K Pandey

* Noah Goldstein:

> On Wed, Nov 1, 2023 at 5:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
>> was missing the CPU_FEATURE checks for VBMI2 in the
>> ifunc/ifunc-impl-list.
>>
>> The fix is either to add those checks or change the logic to not use
>> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
>> implementation is usable on SKX.
>>
>> New implementation is a bit slower, but this is in a cold path so its
>> probably okay.
>> ---
>>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
>>  1 file changed, 43 insertions(+), 21 deletions(-)
>>
>> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> index cd6a0a870a..b174d43208 100644
>> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         andl    $(PAGE_SIZE - 1), %eax
>>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>>         jg      L(cross_page_boundary)
>> -
>> +L(page_cross_continue):
>>         VMOVU   (%rdi), %VMM(1)
>>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>>         VPTESTN %VMM(1), %VMM(1), %k0
>> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         test    %VGPR(rsi), %VGPR(rsi)
>>         jz      L(aligned_more)
>>         /* fallthrough: zero CHAR in first VEC.  */
>> -L(page_cross_return):
>> +
>>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>>         VPCMPEQ %VMATCH, %VMM(1), %k1
>>         KMOV    %k1, %VGPR(rax)
>> +L(page_cross_return):
>>         /* Build mask up until first zero CHAR (used to mask of
>>            potential search CHAR matches past the end of the string).  */
>>         blsmsk  %VGPR(rsi), %VGPR(rsi)
>> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
>>
>>         .p2align 4,, 12
>>  L(aligned_more):
>> -L(page_cross_continue):
>>         /* Need to keep original pointer incase VEC(1) has last match.  */
>>         movq    %rdi, %rsi
>>         andq    $-VEC_SIZE, %rdi
>> @@ -340,34 +340,56 @@ L(return_new_match_ret):
>>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>>         ret
>>
>> -       .p2align 4,, 4
>>  L(cross_page_boundary):
>> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
>> +          rax` sets pointer will all page offset bits cleared so
>> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
>> +          before page cross (guaranteed to be safe to read). Doing this
>> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
>> +          a bit of code size.  */
>>         xorq    %rdi, %rax
>> -       mov     $-1, %VRDX
>> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
>> -       VPTESTN %VMM(6), %VMM(6), %k0
>> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
>> +       VPTESTN %VMM(1), %VMM(1), %k0
>>         KMOV    %k0, %VRSI
>>
>> +       /* Shift out zero CHAR matches that are before the beginning of
>> +          src (rdi).  */
>>  # ifdef USE_AS_WCSRCHR
>>         movl    %edi, %ecx
>> -       and     $(VEC_SIZE - 1), %ecx
>> +       andl    $(VEC_SIZE - 1), %ecx
>>         shrl    $2, %ecx
>>  # endif
>> -       shlx    %SHIFT_REG, %VRDX, %VRDX
>> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>>
>> -# ifdef USE_AS_WCSRCHR
>> -       kmovw   %edx, %k1
>> +       test    %VRSI, %VRSI
>> +       jz      L(page_cross_continue)
>> +
>> +       /* Found zero CHAR so need to test for search CHAR.  */
>> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
>> +       KMOV    %k1, %VRAX
>> +       /* Shift out search CHAR matches that are before the beginning of
>> +          src (rdi).  */
>> +       shrx    %SHIFT_REG, %VRAX, %VRAX
>> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
>> +          so to save code-size just re-use return logic for first VEC.
>> +          This is relatively cold code (page cross).  */
>> +# if VEC_SIZE == 64
>> +       jmp     L(page_cross_return)
>> +    /* 6 bytes from cache-line.  */
>>  # else
>> -       KMOV    %VRDX, %k1
>> +       /* Check if any search CHAR match in range.  */
>> +       blsmsk  %VRSI, %VRSI
>> +       and     %VRSI, %VRAX
>> +       jz      L(ret2)
>> +       bsr     %VRAX, %VRAX
>> +#  ifdef USE_AS_WCSRCHR
>> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>> +#  else
>> +       addq    %rdi, %rax
>> +#  endif
>> +L(ret2):
>> +       ret
>> +       /* 3 bytes from cache-line.  */
>>  # endif
>> -
>> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
>> -       /* We could technically just jmp back after the vpcompress but
>> -          it doesn't save any 16-byte blocks.  */
>> -       shrx    %SHIFT_REG, %VRSI, %VRSI
>> -       test    %VRSI, %VRSI
>> -       jnz     L(page_cross_return)
>> -       jmp     L(page_cross_continue)
>> -       /* 1-byte from cache line.  */
>>  END(STRRCHR)
>>  #endif
>> --
>> 2.34.1
>>
> ping

Sunil, could you review this?

Thanks,
Florian


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-07 18:25   ` Florian Weimer
@ 2023-11-08  5:22     ` Sunil Pandey
  0 siblings, 0 replies; 21+ messages in thread
From: Sunil Pandey @ 2023-11-08  5:22 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Noah Goldstein, libc-alpha

On Tue, Nov 7, 2023 at 10:25 AM Florian Weimer <fweimer@redhat.com> wrote:

> * Noah Goldstein:
>
> > On Wed, Nov 1, 2023 at 5:17 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >>
> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> >> was missing the CPU_FEATURE checks for VBMI2 in the
> >> ifunc/ifunc-impl-list.
> >>
> >> The fix is either to add those checks or change the logic to not use
> >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> >> implementation is usable on SKX.
> >>
> >> New implementation is a bit slower, but this is in a cold path so its
> >> probably okay.
> >> ---
> >>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
> >>  1 file changed, 43 insertions(+), 21 deletions(-)
> >>
> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> index cd6a0a870a..b174d43208 100644
> >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >>         andl    $(PAGE_SIZE - 1), %eax
> >>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> >>         jg      L(cross_page_boundary)
> >> -
> >> +L(page_cross_continue):
> >>         VMOVU   (%rdi), %VMM(1)
> >>         /* k0 has a 1 for each zero CHAR in YMM1.  */
> >>         VPTESTN %VMM(1), %VMM(1), %k0
> >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >>         test    %VGPR(rsi), %VGPR(rsi)
> >>         jz      L(aligned_more)
> >>         /* fallthrough: zero CHAR in first VEC.  */
> >> -L(page_cross_return):
> >> +
> >>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
> >>         VPCMPEQ %VMATCH, %VMM(1), %k1
> >>         KMOV    %k1, %VGPR(rax)
> >> +L(page_cross_return):
> >>         /* Build mask up until first zero CHAR (used to mask of
> >>            potential search CHAR matches past the end of the string).
> */
> >>         blsmsk  %VGPR(rsi), %VGPR(rsi)
> >> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
> >>
> >>         .p2align 4,, 12
> >>  L(aligned_more):
> >> -L(page_cross_continue):
> >>         /* Need to keep original pointer incase VEC(1) has last match.
> */
> >>         movq    %rdi, %rsi
> >>         andq    $-VEC_SIZE, %rdi
> >> @@ -340,34 +340,56 @@ L(return_new_match_ret):
> >>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >>         ret
> >>
> >> -       .p2align 4,, 4
> >>  L(cross_page_boundary):
> >> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> >> +          rax` sets pointer will all page offset bits cleared so
> >> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> >> +          before page cross (guaranteed to be safe to read). Doing this
> >> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> >> +          a bit of code size.  */
> >>         xorq    %rdi, %rax
> >> -       mov     $-1, %VRDX
> >> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> >> -       VPTESTN %VMM(6), %VMM(6), %k0
> >> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> >> +       VPTESTN %VMM(1), %VMM(1), %k0
> >>         KMOV    %k0, %VRSI
> >>
> >> +       /* Shift out zero CHAR matches that are before the beginning of
> >> +          src (rdi).  */
> >>  # ifdef USE_AS_WCSRCHR
> >>         movl    %edi, %ecx
> >> -       and     $(VEC_SIZE - 1), %ecx
> >> +       andl    $(VEC_SIZE - 1), %ecx
> >>         shrl    $2, %ecx
> >>  # endif
> >> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> >> +       shrx    %SHIFT_REG, %VRSI, %VRSI
> >>
> >> -# ifdef USE_AS_WCSRCHR
> >> -       kmovw   %edx, %k1
> >> +       test    %VRSI, %VRSI
> >> +       jz      L(page_cross_continue)
> >> +
> >> +       /* Found zero CHAR so need to test for search CHAR.  */
> >> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
> >> +       KMOV    %k1, %VRAX
> >> +       /* Shift out search CHAR matches that are before the beginning
> of
> >> +          src (rdi).  */
> >> +       shrx    %SHIFT_REG, %VRAX, %VRAX
> >> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
> >> +          so to save code-size just re-use return logic for first VEC.
> >> +          This is relatively cold code (page cross).  */
> >> +# if VEC_SIZE == 64
> >> +       jmp     L(page_cross_return)
> >> +    /* 6 bytes from cache-line.  */
> >>  # else
> >> -       KMOV    %VRDX, %k1
> >> +       /* Check if any search CHAR match in range.  */
> >> +       blsmsk  %VRSI, %VRSI
> >> +       and     %VRSI, %VRAX
> >> +       jz      L(ret2)
> >> +       bsr     %VRAX, %VRAX
> >> +#  ifdef USE_AS_WCSRCHR
> >> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >> +#  else
> >> +       addq    %rdi, %rax
> >> +#  endif
> >> +L(ret2):
> >> +       ret
> >> +       /* 3 bytes from cache-line.  */
> >>  # endif
> >> -
> >> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> >> -       /* We could technically just jmp back after the vpcompress but
> >> -          it doesn't save any 16-byte blocks.  */
> >> -       shrx    %SHIFT_REG, %VRSI, %VRSI
> >> -       test    %VRSI, %VRSI
> >> -       jnz     L(page_cross_return)
> >> -       jmp     L(page_cross_continue)
> >> -       /* 1-byte from cache line.  */
> >>  END(STRRCHR)
> >>  #endif
> >> --
> >> 2.34.1
> >>
> > ping
>
> Sunil, could you review this?
>

Sure, I'm looking into it.


>
> Thanks,
> Florian
>
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-01 22:16 x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S Noah Goldstein
  2023-11-07 17:51 ` Noah Goldstein
@ 2023-11-08 19:04 ` Sunil Pandey
  2023-11-08 19:22   ` Noah Goldstein
  2023-11-08 21:33   ` Noah Goldstein
  2023-11-08 21:33 ` [PATCH v2] " Noah Goldstein
                   ` (3 subsequent siblings)
  5 siblings, 2 replies; 21+ messages in thread
From: Sunil Pandey @ 2023-11-08 19:04 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, hjl.tools, carlos

On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> was missing the CPU_FEATURE checks for VBMI2 in the
> ifunc/ifunc-impl-list.
>
> The fix is either to add those checks or change the logic to not use
> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> implementation is usable on SKX.
>
> New implementation is a bit slower, but this is in a cold path so its
> probably okay.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
>  1 file changed, 43 insertions(+), 21 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> index cd6a0a870a..b174d43208 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         jg      L(cross_page_boundary)
> -
> +L(page_cross_continue):
>         VMOVU   (%rdi), %VMM(1)
>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>         VPTESTN %VMM(1), %VMM(1), %k0
> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         test    %VGPR(rsi), %VGPR(rsi)
>         jz      L(aligned_more)
>         /* fallthrough: zero CHAR in first VEC.  */
> -L(page_cross_return):
> +
>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>         VPCMPEQ %VMATCH, %VMM(1), %k1
>         KMOV    %k1, %VGPR(rax)
> +L(page_cross_return):
>         /* Build mask up until first zero CHAR (used to mask of
>            potential search CHAR matches past the end of the string).  */
>         blsmsk  %VGPR(rsi), %VGPR(rsi)
> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
>
>         .p2align 4,, 12
>  L(aligned_more):
> -L(page_cross_continue):
>         /* Need to keep original pointer incase VEC(1) has last match.  */
>         movq    %rdi, %rsi
>         andq    $-VEC_SIZE, %rdi
> @@ -340,34 +340,56 @@ L(return_new_match_ret):
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 4
>  L(cross_page_boundary):
> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> +          rax` sets pointer will all page offset bits cleared so
> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> +          before page cross (guaranteed to be safe to read). Doing this
> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> +          a bit of code size.  */
>         xorq    %rdi, %rax
> -       mov     $-1, %VRDX
> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> -       VPTESTN %VMM(6), %VMM(6), %k0
> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
>         KMOV    %k0, %VRSI
>
> +       /* Shift out zero CHAR matches that are before the beginning of
> +          src (rdi).  */
>  # ifdef USE_AS_WCSRCHR
>         movl    %edi, %ecx
> -       and     $(VEC_SIZE - 1), %ecx
> +       andl    $(VEC_SIZE - 1), %ecx
>         shrl    $2, %ecx
>  # endif
> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>
> -# ifdef USE_AS_WCSRCHR
> -       kmovw   %edx, %k1
> +       test    %VRSI, %VRSI
> +       jz      L(page_cross_continue)
> +
> +       /* Found zero CHAR so need to test for search CHAR.  */
> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       /* Shift out search CHAR matches that are before the beginning of
> +          src (rdi).  */
> +       shrx    %SHIFT_REG, %VRAX, %VRAX
> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
> +          so to save code-size just re-use return logic for first VEC.
> +          This is relatively cold code (page cross).  */
> +# if VEC_SIZE == 64
> +       jmp     L(page_cross_return)
>

Can you please remove the unconditional jump? It's simply confusing and doesn't
help across the board.

$ tail -7 /tmp/wcsrchr-evex.out
 301: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
 306: c4 e2 48 f3 d6       blsmsk %esi,%esi
 30b: 21 f0                 and    %esi,%eax
 30d: 74 07                 je     316 <__wcsrchr_evex+0x316>
 30f: 0f bd c0             bsr    %eax,%eax
 312: 48 8d 04 87           lea    (%rdi,%rax,4),%rax
 316: c3                   ret

$ tail -7 /tmp/wcsrchr-evex512.out
 300: c4 e2 73 f7 f6       shrx   %ecx,%esi,%esi
 305: 85 f6                 test   %esi,%esi
 307: 0f 84 0b fd ff ff     je     18 <__wcsrchr_evex512+0x18>
 30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1
 314: c5 fb 93 c1           kmovd  %k1,%eax
 318: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
 31d: e9 18 fd ff ff       jmp    3a <__wcsrchr_evex512+0x3a>

$ tail -7 /tmp/strrchr-evex.out
 2e8: c4 e2 43 f7 c0       shrx   %edi,%eax,%eax
 2ed: c4 e2 48 f3 d6       blsmsk %esi,%esi
 2f2: 21 f0                 and    %esi,%eax
 2f4: 74 06                 je     2fc <__strrchr_evex+0x2fc>
 2f6: 0f bd c0             bsr    %eax,%eax
 2f9: 48 01 f8             add    %rdi,%rax
 2fc: c3                   ret

 $ tail -7 /tmp/strrchr-evex512.out
 315: c4 e2 c3 f7 f6       shrx   %rdi,%rsi,%rsi
 31a: 48 85 f6             test   %rsi,%rsi
 31d: 0f 84 f5 fc ff ff     je     18 <__strrchr_evex512+0x18>
 323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1
 32a: c4 e1 fb 93 c1       kmovq  %k1,%rax
 32f: c4 e2 c3 f7 c0       shrx   %rdi,%rax,%rax
 334: e9 04 fd ff ff       jmp    3d <__strrchr_evex512+0x3d>



> +    /* 6 bytes from cache-line.  */
>  # else
> -       KMOV    %VRDX, %k1
> +       /* Check if any search CHAR match in range.  */
> +       blsmsk  %VRSI, %VRSI
> +       and     %VRSI, %VRAX
> +       jz      L(ret2)
> +       bsr     %VRAX, %VRAX
> +#  ifdef USE_AS_WCSRCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       addq    %rdi, %rax
> +#  endif
> +L(ret2):
> +       ret
> +       /* 3 bytes from cache-line.  */
>  # endif
> -
> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
>

Please remove the VPCOMPRESS definition as it's not used.


> -       /* We could technically just jmp back after the vpcompress but
> -          it doesn't save any 16-byte blocks.  */
> -       shrx    %SHIFT_REG, %VRSI, %VRSI
> -       test    %VRSI, %VRSI
> -       jnz     L(page_cross_return)
> -       jmp     L(page_cross_continue)
> -       /* 1-byte from cache line.  */
>  END(STRRCHR)
>  #endif
> --
> 2.34.1
>
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-08 19:04 ` Sunil Pandey
@ 2023-11-08 19:22   ` Noah Goldstein
  2023-11-08 20:08     ` Sunil Pandey
  2023-11-08 21:33   ` Noah Goldstein
  1 sibling, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2023-11-08 19:22 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: libc-alpha, hjl.tools, carlos

On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
>> was missing the CPU_FEATURE checks for VBMI2 in the
>> ifunc/ifunc-impl-list.
>>
>> The fix is either to add those checks or change the logic to not use
>> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
>> implementation is usable on SKX.
>>
>> New implementation is a bit slower, but this is in a cold path so its
>> probably okay.
>> ---
>>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
>>  1 file changed, 43 insertions(+), 21 deletions(-)
>>
>> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> index cd6a0a870a..b174d43208 100644
>> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         andl    $(PAGE_SIZE - 1), %eax
>>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>>         jg      L(cross_page_boundary)
>> -
>> +L(page_cross_continue):
>>         VMOVU   (%rdi), %VMM(1)
>>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>>         VPTESTN %VMM(1), %VMM(1), %k0
>> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         test    %VGPR(rsi), %VGPR(rsi)
>>         jz      L(aligned_more)
>>         /* fallthrough: zero CHAR in first VEC.  */
>> -L(page_cross_return):
>> +
>>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>>         VPCMPEQ %VMATCH, %VMM(1), %k1
>>         KMOV    %k1, %VGPR(rax)
>> +L(page_cross_return):
>>         /* Build mask up until first zero CHAR (used to mask of
>>            potential search CHAR matches past the end of the string).  */
>>         blsmsk  %VGPR(rsi), %VGPR(rsi)
>> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
>>
>>         .p2align 4,, 12
>>  L(aligned_more):
>> -L(page_cross_continue):
>>         /* Need to keep original pointer incase VEC(1) has last match.  */
>>         movq    %rdi, %rsi
>>         andq    $-VEC_SIZE, %rdi
>> @@ -340,34 +340,56 @@ L(return_new_match_ret):
>>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>>         ret
>>
>> -       .p2align 4,, 4
>>  L(cross_page_boundary):
>> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
>> +          rax` sets pointer will all page offset bits cleared so
>> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
>> +          before page cross (guaranteed to be safe to read). Doing this
>> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
>> +          a bit of code size.  */
>>         xorq    %rdi, %rax
>> -       mov     $-1, %VRDX
>> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
>> -       VPTESTN %VMM(6), %VMM(6), %k0
>> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
>> +       VPTESTN %VMM(1), %VMM(1), %k0
>>         KMOV    %k0, %VRSI
>>
>> +       /* Shift out zero CHAR matches that are before the beginning of
>> +          src (rdi).  */
>>  # ifdef USE_AS_WCSRCHR
>>         movl    %edi, %ecx
>> -       and     $(VEC_SIZE - 1), %ecx
>> +       andl    $(VEC_SIZE - 1), %ecx
>>         shrl    $2, %ecx
>>  # endif
>> -       shlx    %SHIFT_REG, %VRDX, %VRDX
>> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>>
>> -# ifdef USE_AS_WCSRCHR
>> -       kmovw   %edx, %k1
>> +       test    %VRSI, %VRSI
>> +       jz      L(page_cross_continue)
>> +
>> +       /* Found zero CHAR so need to test for search CHAR.  */
>> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
>> +       KMOV    %k1, %VRAX
>> +       /* Shift out search CHAR matches that are before the beginning of
>> +          src (rdi).  */
>> +       shrx    %SHIFT_REG, %VRAX, %VRAX
>> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
>> +          so to save code-size just re-use return logic for first VEC.
>> +          This is relatively cold code (page cross).  */
>> +# if VEC_SIZE == 64
>> +       jmp     L(page_cross_return)
>
>
> Can you please remove unconditional jump. It's simply confusing and doesn't help across the board.
>
> $ tail -7 /tmp/wcsrchr-evex.out
>  301: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
>  306: c4 e2 48 f3 d6       blsmsk %esi,%esi
>  30b: 21 f0                 and    %esi,%eax
>  30d: 74 07                 je     316 <__wcsrchr_evex+0x316>
>  30f: 0f bd c0             bsr    %eax,%eax
>  312: 48 8d 04 87           lea    (%rdi,%rax,4),%rax
>  316: c3                   ret
>
> $ tail -7 /tmp/wcsrchr-evex512.out
>  300: c4 e2 73 f7 f6       shrx   %ecx,%esi,%esi
>  305: 85 f6                 test   %esi,%esi
>  307: 0f 84 0b fd ff ff     je     18 <__wcsrchr_evex512+0x18>
>  30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1
>  314: c5 fb 93 c1           kmovd  %k1,%eax
>  318: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
>  31d: e9 18 fd ff ff       jmp    3a <__wcsrchr_evex512+0x3a>
>
> $ tail -7 /tmp/strrchr-evex.out
>  2e8: c4 e2 43 f7 c0       shrx   %edi,%eax,%eax
>  2ed: c4 e2 48 f3 d6       blsmsk %esi,%esi
>  2f2: 21 f0                 and    %esi,%eax
>  2f4: 74 06                 je     2fc <__strrchr_evex+0x2fc>
>  2f6: 0f bd c0             bsr    %eax,%eax
>  2f9: 48 01 f8             add    %rdi,%rax
>  2fc: c3                   ret
>
>  $ tail -7 /tmp/strrchr-evex512.out
>  315: c4 e2 c3 f7 f6       shrx   %rdi,%rsi,%rsi
>  31a: 48 85 f6             test   %rsi,%rsi
>  31d: 0f 84 f5 fc ff ff     je     18 <__strrchr_evex512+0x18>
>  323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1
>  32a: c4 e1 fb 93 c1       kmovq  %k1,%rax
>  32f: c4 e2 c3 f7 c0       shrx   %rdi,%rax,%rax
>  334: e9 04 fd ff ff       jmp    3d <__strrchr_evex512+0x3d>
It helps here (or at least saves a cache line). But not super invested
either way.
>
>
>>
>> +    /* 6 bytes from cache-line.  */
>>  # else
>> -       KMOV    %VRDX, %k1
>> +       /* Check if any search CHAR match in range.  */
>> +       blsmsk  %VRSI, %VRSI
>> +       and     %VRSI, %VRAX
>> +       jz      L(ret2)
>> +       bsr     %VRAX, %VRAX
>> +#  ifdef USE_AS_WCSRCHR
>> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>> +#  else
>> +       addq    %rdi, %rax
>> +#  endif
>> +L(ret2):
>> +       ret
>> +       /* 3 bytes from cache-line.  */
>>  # endif
>> -
>> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
>
>
> Please remove the VCOMPRESS definition as it's not used.
>
>>
>> -       /* We could technically just jmp back after the vpcompress but
>> -          it doesn't save any 16-byte blocks.  */
>> -       shrx    %SHIFT_REG, %VRSI, %VRSI
>> -       test    %VRSI, %VRSI
>> -       jnz     L(page_cross_return)
>> -       jmp     L(page_cross_continue)
>> -       /* 1-byte from cache line.  */
>>  END(STRRCHR)
>>  #endif
>> --
>> 2.34.1
>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-08 19:22   ` Noah Goldstein
@ 2023-11-08 20:08     ` Sunil Pandey
  2023-11-08 21:21       ` Noah Goldstein
  0 siblings, 1 reply; 21+ messages in thread
From: Sunil Pandey @ 2023-11-08 20:08 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, hjl.tools, carlos

On Wed, Nov 8, 2023 at 11:22 AM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> >
> >
> > On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >>
> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> >> was missing the CPU_FEATURE checks for VBMI2 in the
> >> ifunc/ifunc-impl-list.
> >>
> >> The fix is either to add those checks or change the logic to not use
> >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> >> implementation is usable on SKX.
> >>
> >> New implementation is a bit slower, but this is in a cold path so its
> >> probably okay.
> >> ---
> >>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
> >>  1 file changed, 43 insertions(+), 21 deletions(-)
> >>
> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> index cd6a0a870a..b174d43208 100644
> >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >>         andl    $(PAGE_SIZE - 1), %eax
> >>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> >>         jg      L(cross_page_boundary)
> >> -
> >> +L(page_cross_continue):
> >>         VMOVU   (%rdi), %VMM(1)
> >>         /* k0 has a 1 for each zero CHAR in YMM1.  */
> >>         VPTESTN %VMM(1), %VMM(1), %k0
> >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >>         test    %VGPR(rsi), %VGPR(rsi)
> >>         jz      L(aligned_more)
> >>         /* fallthrough: zero CHAR in first VEC.  */
> >> -L(page_cross_return):
> >> +
> >>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
> >>         VPCMPEQ %VMATCH, %VMM(1), %k1
> >>         KMOV    %k1, %VGPR(rax)
> >> +L(page_cross_return):
> >>         /* Build mask up until first zero CHAR (used to mask of
> >>            potential search CHAR matches past the end of the string).
> */
> >>         blsmsk  %VGPR(rsi), %VGPR(rsi)
> >> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
> >>
> >>         .p2align 4,, 12
> >>  L(aligned_more):
> >> -L(page_cross_continue):
> >>         /* Need to keep original pointer incase VEC(1) has last match.
> */
> >>         movq    %rdi, %rsi
> >>         andq    $-VEC_SIZE, %rdi
> >> @@ -340,34 +340,56 @@ L(return_new_match_ret):
> >>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >>         ret
> >>
> >> -       .p2align 4,, 4
> >>  L(cross_page_boundary):
> >> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> >> +          rax` sets pointer will all page offset bits cleared so
> >> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> >> +          before page cross (guaranteed to be safe to read). Doing this
> >> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> >> +          a bit of code size.  */
> >>         xorq    %rdi, %rax
> >> -       mov     $-1, %VRDX
> >> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> >> -       VPTESTN %VMM(6), %VMM(6), %k0
> >> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> >> +       VPTESTN %VMM(1), %VMM(1), %k0
> >>         KMOV    %k0, %VRSI
> >>
> >> +       /* Shift out zero CHAR matches that are before the beginning of
> >> +          src (rdi).  */
> >>  # ifdef USE_AS_WCSRCHR
> >>         movl    %edi, %ecx
> >> -       and     $(VEC_SIZE - 1), %ecx
> >> +       andl    $(VEC_SIZE - 1), %ecx
> >>         shrl    $2, %ecx
> >>  # endif
> >> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> >> +       shrx    %SHIFT_REG, %VRSI, %VRSI
> >>
> >> -# ifdef USE_AS_WCSRCHR
> >> -       kmovw   %edx, %k1
> >> +       test    %VRSI, %VRSI
> >> +       jz      L(page_cross_continue)
> >> +
> >> +       /* Found zero CHAR so need to test for search CHAR.  */
> >> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
> >> +       KMOV    %k1, %VRAX
> >> +       /* Shift out search CHAR matches that are before the beginning
> of
> >> +          src (rdi).  */
> >> +       shrx    %SHIFT_REG, %VRAX, %VRAX
> >> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
> >> +          so to save code-size just re-use return logic for first VEC.
> >> +          This is relatively cold code (page cross).  */
> >> +# if VEC_SIZE == 64
> >> +       jmp     L(page_cross_return)
> >
> >
> > Can you please remove unconditional jump. It's simply confusing and
> doesn't help across the board.
> >
> > $ tail -7 /tmp/wcsrchr-evex.out
> >  301: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
> >  306: c4 e2 48 f3 d6       blsmsk %esi,%esi
> >  30b: 21 f0                 and    %esi,%eax
> >  30d: 74 07                 je     316 <__wcsrchr_evex+0x316>
> >  30f: 0f bd c0             bsr    %eax,%eax
> >  312: 48 8d 04 87           lea    (%rdi,%rax,4),%rax
> >  316: c3                   ret
> >
> > $ tail -7 /tmp/wcsrchr-evex512.out
> >  300: c4 e2 73 f7 f6       shrx   %ecx,%esi,%esi
> >  305: 85 f6                 test   %esi,%esi
> >  307: 0f 84 0b fd ff ff     je     18 <__wcsrchr_evex512+0x18>
> >  30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1
> >  314: c5 fb 93 c1           kmovd  %k1,%eax
> >  318: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
> >  31d: e9 18 fd ff ff       jmp    3a <__wcsrchr_evex512+0x3a>
> >
>

Unnecessary unconditional jump here, as the entire code will fit in the
same cache line. Also, it will slow down execution.

In general, I'm not in favor of an unconditional jump unless it can provide
a significant perf boost.

> $ tail -7 /tmp/strrchr-evex.out
> >  2e8: c4 e2 43 f7 c0       shrx   %edi,%eax,%eax
> >  2ed: c4 e2 48 f3 d6       blsmsk %esi,%esi
> >  2f2: 21 f0                 and    %esi,%eax
> >  2f4: 74 06                 je     2fc <__strrchr_evex+0x2fc>
> >  2f6: 0f bd c0             bsr    %eax,%eax
> >  2f9: 48 01 f8             add    %rdi,%rax
> >  2fc: c3                   ret
> >
> >  $ tail -7 /tmp/strrchr-evex512.out
> >  315: c4 e2 c3 f7 f6       shrx   %rdi,%rsi,%rsi
> >  31a: 48 85 f6             test   %rsi,%rsi
> >  31d: 0f 84 f5 fc ff ff     je     18 <__strrchr_evex512+0x18>
> >  323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1
> >  32a: c4 e1 fb 93 c1       kmovq  %k1,%rax
> >  32f: c4 e2 c3 f7 c0       shrx   %rdi,%rax,%rax
> >  334: e9 04 fd ff ff       jmp    3d <__strrchr_evex512+0x3d>
> It helps here (or at least saves a cache line). But not super invested
> either way.
> >
> >
> >>
> >> +    /* 6 bytes from cache-line.  */
> >>  # else
> >> -       KMOV    %VRDX, %k1
> >> +       /* Check if any search CHAR match in range.  */
> >> +       blsmsk  %VRSI, %VRSI
> >> +       and     %VRSI, %VRAX
> >> +       jz      L(ret2)
> >> +       bsr     %VRAX, %VRAX
> >> +#  ifdef USE_AS_WCSRCHR
> >> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >> +#  else
> >> +       addq    %rdi, %rax
> >> +#  endif
> >> +L(ret2):
> >> +       ret
> >> +       /* 3 bytes from cache-line.  */
> >>  # endif
> >> -
> >> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> >
> >
> > Please remove the VCOMPRESS definition as it's not used.
> >
> >>
> >> -       /* We could technically just jmp back after the vpcompress but
> >> -          it doesn't save any 16-byte blocks.  */
> >> -       shrx    %SHIFT_REG, %VRSI, %VRSI
> >> -       test    %VRSI, %VRSI
> >> -       jnz     L(page_cross_return)
> >> -       jmp     L(page_cross_continue)
> >> -       /* 1-byte from cache line.  */
> >>  END(STRRCHR)
> >>  #endif
> >> --
> >> 2.34.1
> >>
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-08 20:08     ` Sunil Pandey
@ 2023-11-08 21:21       ` Noah Goldstein
  2023-11-08 21:22         ` Noah Goldstein
  0 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2023-11-08 21:21 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: libc-alpha, hjl.tools, carlos

On Wed, Nov 8, 2023 at 2:08 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Wed, Nov 8, 2023 at 11:22 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>> >
>> >
>> >
>> > On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >>
>> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
>> >> was missing the CPU_FEATURE checks for VBMI2 in the
>> >> ifunc/ifunc-impl-list.
>> >>
>> >> The fix is either to add those checks or change the logic to not use
>> >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
>> >> implementation is usable on SKX.
>> >>
>> >> New implementation is a bit slower, but this is in a cold path so its
>> >> probably okay.
>> >> ---
>> >>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
>> >>  1 file changed, 43 insertions(+), 21 deletions(-)
>> >>
>> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> >> index cd6a0a870a..b174d43208 100644
>> >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>> >>         andl    $(PAGE_SIZE - 1), %eax
>> >>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>> >>         jg      L(cross_page_boundary)
>> >> -
>> >> +L(page_cross_continue):
>> >>         VMOVU   (%rdi), %VMM(1)
>> >>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>> >>         VPTESTN %VMM(1), %VMM(1), %k0
>> >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>> >>         test    %VGPR(rsi), %VGPR(rsi)
>> >>         jz      L(aligned_more)
>> >>         /* fallthrough: zero CHAR in first VEC.  */
>> >> -L(page_cross_return):
>> >> +
>> >>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>> >>         VPCMPEQ %VMATCH, %VMM(1), %k1
>> >>         KMOV    %k1, %VGPR(rax)
>> >> +L(page_cross_return):
>> >>         /* Build mask up until first zero CHAR (used to mask of
>> >>            potential search CHAR matches past the end of the string).  */
>> >>         blsmsk  %VGPR(rsi), %VGPR(rsi)
>> >> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
>> >>
>> >>         .p2align 4,, 12
>> >>  L(aligned_more):
>> >> -L(page_cross_continue):
>> >>         /* Need to keep original pointer incase VEC(1) has last match.  */
>> >>         movq    %rdi, %rsi
>> >>         andq    $-VEC_SIZE, %rdi
>> >> @@ -340,34 +340,56 @@ L(return_new_match_ret):
>> >>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>> >>         ret
>> >>
>> >> -       .p2align 4,, 4
>> >>  L(cross_page_boundary):
>> >> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
>> >> +          rax` sets pointer will all page offset bits cleared so
>> >> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
>> >> +          before page cross (guaranteed to be safe to read). Doing this
>> >> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
>> >> +          a bit of code size.  */
>> >>         xorq    %rdi, %rax
>> >> -       mov     $-1, %VRDX
>> >> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
>> >> -       VPTESTN %VMM(6), %VMM(6), %k0
>> >> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
>> >> +       VPTESTN %VMM(1), %VMM(1), %k0
>> >>         KMOV    %k0, %VRSI
>> >>
>> >> +       /* Shift out zero CHAR matches that are before the beginning of
>> >> +          src (rdi).  */
>> >>  # ifdef USE_AS_WCSRCHR
>> >>         movl    %edi, %ecx
>> >> -       and     $(VEC_SIZE - 1), %ecx
>> >> +       andl    $(VEC_SIZE - 1), %ecx
>> >>         shrl    $2, %ecx
>> >>  # endif
>> >> -       shlx    %SHIFT_REG, %VRDX, %VRDX
>> >> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>> >>
>> >> -# ifdef USE_AS_WCSRCHR
>> >> -       kmovw   %edx, %k1
>> >> +       test    %VRSI, %VRSI
>> >> +       jz      L(page_cross_continue)
>> >> +
>> >> +       /* Found zero CHAR so need to test for search CHAR.  */
>> >> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
>> >> +       KMOV    %k1, %VRAX
>> >> +       /* Shift out search CHAR matches that are before the beginning of
>> >> +          src (rdi).  */
>> >> +       shrx    %SHIFT_REG, %VRAX, %VRAX
>> >> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
>> >> +          so to save code-size just re-use return logic for first VEC.
>> >> +          This is relatively cold code (page cross).  */
>> >> +# if VEC_SIZE == 64
>> >> +       jmp     L(page_cross_return)
>> >
>> >
>> > Can you please remove unconditional jump. It's simply confusing and doesn't help across the board.
>> >
>> > $ tail -7 /tmp/wcsrchr-evex.out
>> >  301: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
>> >  306: c4 e2 48 f3 d6       blsmsk %esi,%esi
>> >  30b: 21 f0                 and    %esi,%eax
>> >  30d: 74 07                 je     316 <__wcsrchr_evex+0x316>
>> >  30f: 0f bd c0             bsr    %eax,%eax
>> >  312: 48 8d 04 87           lea    (%rdi,%rax,4),%rax
>> >  316: c3                   ret
>> >
>> > $ tail -7 /tmp/wcsrchr-evex512.out
>> >  300: c4 e2 73 f7 f6       shrx   %ecx,%esi,%esi
>> >  305: 85 f6                 test   %esi,%esi
>> >  307: 0f 84 0b fd ff ff     je     18 <__wcsrchr_evex512+0x18>
>> >  30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1
>> >  314: c5 fb 93 c1           kmovd  %k1,%eax
>> >  318: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
>> >  31d: e9 18 fd ff ff       jmp    3a <__wcsrchr_evex512+0x3a>
>> >
>
>
> Unnecessary unconditional jump here, as the entire code will fit in the same cache line.
> Also it will slow down execution.
>
> In general, I'm not in favor of unconditional jump unless it can provide significant perf boost.
>
The replacement code:
```
blsmsk %rsi,%rsi   : +5
and    %rsi,%rax   : +3
je_imm8            : +2
bsr    %rax,%rax   : +4
add    %rdi,%rax   : +3
ret                : +1
                     = 18
```
Starting at 0x334, that will spill into the next cache line.
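
A quick sanity check of that spill claim, assuming 64-byte cache lines and
the 0x334 start offset from the objdump listing above:

```c
#include <stdio.h>

int
main (void)
{
  unsigned int start = 0x334;                 /* offset of the first byte */
  unsigned int len = 5 + 3 + 2 + 4 + 3 + 1;   /* 18 bytes of replacement code */
  unsigned int first = start / 64;            /* 0x300..0x33f is line 12 */
  unsigned int last = (start + len - 1) / 64; /* last byte at 0x345, line 13 */
  printf ("spills into next cache line: %s\n", first != last ? "yes" : "no");
  return 0;
}
```

The 18 bytes end at 0x345, past the 0x340 line boundary, so the
VEC_SIZE == 64 variant would indeed take an extra cache line without the jmp.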

>> > $ tail -7 /tmp/strrchr-evex.out
>> >  2e8: c4 e2 43 f7 c0       shrx   %edi,%eax,%eax
>> >  2ed: c4 e2 48 f3 d6       blsmsk %esi,%esi
>> >  2f2: 21 f0                 and    %esi,%eax
>> >  2f4: 74 06                 je     2fc <__strrchr_evex+0x2fc>
>> >  2f6: 0f bd c0             bsr    %eax,%eax
>> >  2f9: 48 01 f8             add    %rdi,%rax
>> >  2fc: c3                   ret
>> >
>> >  $ tail -7 /tmp/strrchr-evex512.out
>> >  315: c4 e2 c3 f7 f6       shrx   %rdi,%rsi,%rsi
>> >  31a: 48 85 f6             test   %rsi,%rsi
>> >  31d: 0f 84 f5 fc ff ff     je     18 <__strrchr_evex512+0x18>
>> >  323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1
>> >  32a: c4 e1 fb 93 c1       kmovq  %k1,%rax
>> >  32f: c4 e2 c3 f7 c0       shrx   %rdi,%rax,%rax
>> >  334: e9 04 fd ff ff       jmp    3d <__strrchr_evex512+0x3d>
>> It helps here (or at least saves a cache line). But not super invested
>> either way.
>> >
>> >
>> >>
>> >> +    /* 6 bytes from cache-line.  */
>> >>  # else
>> >> -       KMOV    %VRDX, %k1
>> >> +       /* Check if any search CHAR match in range.  */
>> >> +       blsmsk  %VRSI, %VRSI
>> >> +       and     %VRSI, %VRAX
>> >> +       jz      L(ret2)
>> >> +       bsr     %VRAX, %VRAX
>> >> +#  ifdef USE_AS_WCSRCHR
>> >> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>> >> +#  else
>> >> +       addq    %rdi, %rax
>> >> +#  endif
>> >> +L(ret2):
>> >> +       ret
>> >> +       /* 3 bytes from cache-line.  */
>> >>  # endif
>> >> -
>> >> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
>> >
>> >
>> > Please remove the VCOMPRESS definition as it's not used.
>> >
>> >>
>> >> -       /* We could technically just jmp back after the vpcompress but
>> >> -          it doesn't save any 16-byte blocks.  */
>> >> -       shrx    %SHIFT_REG, %VRSI, %VRSI
>> >> -       test    %VRSI, %VRSI
>> >> -       jnz     L(page_cross_return)
>> >> -       jmp     L(page_cross_continue)
>> >> -       /* 1-byte from cache line.  */
>> >>  END(STRRCHR)
>> >>  #endif
>> >> --
>> >> 2.34.1
>> >>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-08 21:21       ` Noah Goldstein
@ 2023-11-08 21:22         ` Noah Goldstein
  0 siblings, 0 replies; 21+ messages in thread
From: Noah Goldstein @ 2023-11-08 21:22 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: libc-alpha, hjl.tools, carlos

On Wed, Nov 8, 2023 at 3:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Nov 8, 2023 at 2:08 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> >
> >
> > On Wed, Nov 8, 2023 at 11:22 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >>
> >> On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >> >
> >> >
> >> >
> >> > On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >> >>
> >> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> >> >> was missing the CPU_FEATURE checks for VBMI2 in the
> >> >> ifunc/ifunc-impl-list.
> >> >>
> >> >> The fix is either to add those checks or change the logic to not use
> >> >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> >> >> implementation is usable on SKX.
> >> >>
> >> >> New implementation is a bit slower, but this is in a cold path so its
> >> >> probably okay.
> >> >> ---
> >> >>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
> >> >>  1 file changed, 43 insertions(+), 21 deletions(-)
> >> >>
> >> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> >> index cd6a0a870a..b174d43208 100644
> >> >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >> >>         andl    $(PAGE_SIZE - 1), %eax
> >> >>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> >> >>         jg      L(cross_page_boundary)
> >> >> -
> >> >> +L(page_cross_continue):
> >> >>         VMOVU   (%rdi), %VMM(1)
> >> >>         /* k0 has a 1 for each zero CHAR in YMM1.  */
> >> >>         VPTESTN %VMM(1), %VMM(1), %k0
> >> >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >> >>         test    %VGPR(rsi), %VGPR(rsi)
> >> >>         jz      L(aligned_more)
> >> >>         /* fallthrough: zero CHAR in first VEC.  */
> >> >> -L(page_cross_return):
> >> >> +
> >> >>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
> >> >>         VPCMPEQ %VMATCH, %VMM(1), %k1
> >> >>         KMOV    %k1, %VGPR(rax)
> >> >> +L(page_cross_return):
> >> >>         /* Build mask up until first zero CHAR (used to mask of
> >> >>            potential search CHAR matches past the end of the string).  */
> >> >>         blsmsk  %VGPR(rsi), %VGPR(rsi)
> >> >> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
> >> >>
> >> >>         .p2align 4,, 12
> >> >>  L(aligned_more):
> >> >> -L(page_cross_continue):
> >> >>         /* Need to keep original pointer incase VEC(1) has last match.  */
> >> >>         movq    %rdi, %rsi
> >> >>         andq    $-VEC_SIZE, %rdi
> >> >> @@ -340,34 +340,56 @@ L(return_new_match_ret):
> >> >>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >> >>         ret
> >> >>
> >> >> -       .p2align 4,, 4
> >> >>  L(cross_page_boundary):
> >> >> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> >> >> +          rax` sets pointer will all page offset bits cleared so
> >> >> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> >> >> +          before page cross (guaranteed to be safe to read). Doing this
> >> >> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> >> >> +          a bit of code size.  */
> >> >>         xorq    %rdi, %rax
> >> >> -       mov     $-1, %VRDX
> >> >> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> >> >> -       VPTESTN %VMM(6), %VMM(6), %k0
> >> >> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> >> >> +       VPTESTN %VMM(1), %VMM(1), %k0
> >> >>         KMOV    %k0, %VRSI
> >> >>
> >> >> +       /* Shift out zero CHAR matches that are before the beginning of
> >> >> +          src (rdi).  */
> >> >>  # ifdef USE_AS_WCSRCHR
> >> >>         movl    %edi, %ecx
> >> >> -       and     $(VEC_SIZE - 1), %ecx
> >> >> +       andl    $(VEC_SIZE - 1), %ecx
> >> >>         shrl    $2, %ecx
> >> >>  # endif
> >> >> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> >> >> +       shrx    %SHIFT_REG, %VRSI, %VRSI
> >> >>
> >> >> -# ifdef USE_AS_WCSRCHR
> >> >> -       kmovw   %edx, %k1
> >> >> +       test    %VRSI, %VRSI
> >> >> +       jz      L(page_cross_continue)
> >> >> +
> >> >> +       /* Found zero CHAR so need to test for search CHAR.  */
> >> >> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
> >> >> +       KMOV    %k1, %VRAX
> >> >> +       /* Shift out search CHAR matches that are before the beginning of
> >> >> +          src (rdi).  */
> >> >> +       shrx    %SHIFT_REG, %VRAX, %VRAX
> >> >> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
> >> >> +          so to save code-size just re-use return logic for first VEC.
> >> >> +          This is relatively cold code (page cross).  */
> >> >> +# if VEC_SIZE == 64
> >> >> +       jmp     L(page_cross_return)
> >> >
> >> >
> >> > Can you please remove the unconditional jump? It's simply confusing and doesn't help across the board.
> >> >
> >> > $ tail -7 /tmp/wcsrchr-evex.out
> >> >  301: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
> >> >  306: c4 e2 48 f3 d6       blsmsk %esi,%esi
> >> >  30b: 21 f0                 and    %esi,%eax
> >> >  30d: 74 07                 je     316 <__wcsrchr_evex+0x316>
> >> >  30f: 0f bd c0             bsr    %eax,%eax
> >> >  312: 48 8d 04 87           lea    (%rdi,%rax,4),%rax
> >> >  316: c3                   ret
> >> >
> >> > $ tail -7 /tmp/wcsrchr-evex512.out
> >> >  300: c4 e2 73 f7 f6       shrx   %ecx,%esi,%esi
> >> >  305: 85 f6                 test   %esi,%esi
> >> >  307: 0f 84 0b fd ff ff     je     18 <__wcsrchr_evex512+0x18>
> >> >  30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1
> >> >  314: c5 fb 93 c1           kmovd  %k1,%eax
> >> >  318: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
> >> >  31d: e9 18 fd ff ff       jmp    3a <__wcsrchr_evex512+0x3a>
> >> >
> >
> >
> > An unconditional jump is unnecessary here, as the entire code will fit in the same cache line.
> > It will also slow down execution.
> >
> > In general, I'm not in favor of unconditional jumps unless they provide a significant perf boost.

Oh, you mean make the condition cover only strrchr-evex512 and exclude
wcsrchr-evex512.
Sure.
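
For concreteness, a minimal sketch of that guard (it is what v2 further
down the thread ends up using; the instruction bodies are elided and
shown only as comments):

```
# if VEC_SIZE == 64 && !(defined USE_AS_WCSRCHR)
	/* strrchr-evex512 only: jmp L(page_cross_return) to reuse the
	   first-VEC return logic and save code size.  */
# else
	/* evex (VEC_SIZE == 32) and wcsrchr-evex512: inline return
	   sequence (blsmsk/and/bsr/ret) instead.  */
# endif
```
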
> >
> The replacement code:
> ```
> blsmsk %rsi,%rsi   : +5
> and    %rsi,%rax   : +3
> je_imm8            : +2
> bsr    %rax,%rax   : +4
> add    %rdi,%rax   : +3
> ret                : +1
>                    : +18
> ```
> Starting at 0x334, 0x334 + 18 = 0x346, which spills past the 0x340
> boundary into the next cache line.
>
> >> > $ tail -7 /tmp/strrchr-evex.out
> >> >  2e8: c4 e2 43 f7 c0       shrx   %edi,%eax,%eax
> >> >  2ed: c4 e2 48 f3 d6       blsmsk %esi,%esi
> >> >  2f2: 21 f0                 and    %esi,%eax
> >> >  2f4: 74 06                 je     2fc <__strrchr_evex+0x2fc>
> >> >  2f6: 0f bd c0             bsr    %eax,%eax
> >> >  2f9: 48 01 f8             add    %rdi,%rax
> >> >  2fc: c3                   ret
> >> >
> >> >  $ tail -7 /tmp/strrchr-evex512.out
> >> >  315: c4 e2 c3 f7 f6       shrx   %rdi,%rsi,%rsi
> >> >  31a: 48 85 f6             test   %rsi,%rsi
> >> >  31d: 0f 84 f5 fc ff ff     je     18 <__strrchr_evex512+0x18>
> >> >  323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1
> >> >  32a: c4 e1 fb 93 c1       kmovq  %k1,%rax
> >> >  32f: c4 e2 c3 f7 c0       shrx   %rdi,%rax,%rax
> >> >  334: e9 04 fd ff ff       jmp    3d <__strrchr_evex512+0x3d>
> >> It helps here (or at least saves a cache line). But not super invested
> >> either way.
> >> >
> >> >
> >> >>
> >> >> +    /* 6 bytes from cache-line.  */
> >> >>  # else
> >> >> -       KMOV    %VRDX, %k1
> >> >> +       /* Check if any search CHAR match in range.  */
> >> >> +       blsmsk  %VRSI, %VRSI
> >> >> +       and     %VRSI, %VRAX
> >> >> +       jz      L(ret2)
> >> >> +       bsr     %VRAX, %VRAX
> >> >> +#  ifdef USE_AS_WCSRCHR
> >> >> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >> >> +#  else
> >> >> +       addq    %rdi, %rax
> >> >> +#  endif
> >> >> +L(ret2):
> >> >> +       ret
> >> >> +       /* 3 bytes from cache-line.  */
> >> >>  # endif
> >> >> -
> >> >> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> >> >
> >> >
> >> > Please remove the VCOMPRESS definition as it's not used.
> >> >
> >> >>
> >> >> -       /* We could technically just jmp back after the vpcompress but
> >> >> -          it doesn't save any 16-byte blocks.  */
> >> >> -       shrx    %SHIFT_REG, %VRSI, %VRSI
> >> >> -       test    %VRSI, %VRSI
> >> >> -       jnz     L(page_cross_return)
> >> >> -       jmp     L(page_cross_continue)
> >> >> -       /* 1-byte from cache line.  */
> >> >>  END(STRRCHR)
> >> >>  #endif
> >> >> --
> >> >> 2.34.1
> >> >>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-08 19:04 ` Sunil Pandey
  2023-11-08 19:22   ` Noah Goldstein
@ 2023-11-08 21:33   ` Noah Goldstein
  1 sibling, 0 replies; 21+ messages in thread
From: Noah Goldstein @ 2023-11-08 21:33 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: libc-alpha, hjl.tools, carlos

On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
>> was missing the CPU_FEATURE checks for VBMI2 in the
>> ifunc/ifunc-impl-list.
>>
>> The fix is either to add those checks or change the logic to not use
>> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
>> implementation is usable on SKX.
>>
>> New implementation is a bit slower, but this is in a cold path so its
>> probably okay.
>> ---
>>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
>>  1 file changed, 43 insertions(+), 21 deletions(-)
>>
>> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> index cd6a0a870a..b174d43208 100644
>> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         andl    $(PAGE_SIZE - 1), %eax
>>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>>         jg      L(cross_page_boundary)
>> -
>> +L(page_cross_continue):
>>         VMOVU   (%rdi), %VMM(1)
>>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>>         VPTESTN %VMM(1), %VMM(1), %k0
>> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         test    %VGPR(rsi), %VGPR(rsi)
>>         jz      L(aligned_more)
>>         /* fallthrough: zero CHAR in first VEC.  */
>> -L(page_cross_return):
>> +
>>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>>         VPCMPEQ %VMATCH, %VMM(1), %k1
>>         KMOV    %k1, %VGPR(rax)
>> +L(page_cross_return):
>>         /* Build mask up until first zero CHAR (used to mask of
>>            potential search CHAR matches past the end of the string).  */
>>         blsmsk  %VGPR(rsi), %VGPR(rsi)
>> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
>>
>>         .p2align 4,, 12
>>  L(aligned_more):
>> -L(page_cross_continue):
>>         /* Need to keep original pointer incase VEC(1) has last match.  */
>>         movq    %rdi, %rsi
>>         andq    $-VEC_SIZE, %rdi
>> @@ -340,34 +340,56 @@ L(return_new_match_ret):
>>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>>         ret
>>
>> -       .p2align 4,, 4
>>  L(cross_page_boundary):
>> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
>> +          rax` sets pointer will all page offset bits cleared so
>> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
>> +          before page cross (guaranteed to be safe to read). Doing this
>> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
>> +          a bit of code size.  */
>>         xorq    %rdi, %rax
>> -       mov     $-1, %VRDX
>> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
>> -       VPTESTN %VMM(6), %VMM(6), %k0
>> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
>> +       VPTESTN %VMM(1), %VMM(1), %k0
>>         KMOV    %k0, %VRSI
>>
>> +       /* Shift out zero CHAR matches that are before the beginning of
>> +          src (rdi).  */
>>  # ifdef USE_AS_WCSRCHR
>>         movl    %edi, %ecx
>> -       and     $(VEC_SIZE - 1), %ecx
>> +       andl    $(VEC_SIZE - 1), %ecx
>>         shrl    $2, %ecx
>>  # endif
>> -       shlx    %SHIFT_REG, %VRDX, %VRDX
>> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>>
>> -# ifdef USE_AS_WCSRCHR
>> -       kmovw   %edx, %k1
>> +       test    %VRSI, %VRSI
>> +       jz      L(page_cross_continue)
>> +
>> +       /* Found zero CHAR so need to test for search CHAR.  */
>> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
>> +       KMOV    %k1, %VRAX
>> +       /* Shift out search CHAR matches that are before the beginning of
>> +          src (rdi).  */
>> +       shrx    %SHIFT_REG, %VRAX, %VRAX
>> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
>> +          so to save code-size just re-use return logic for first VEC.
>> +          This is relatively cold code (page cross).  */
>> +# if VEC_SIZE == 64
>> +       jmp     L(page_cross_return)
>
>
> Can you please remove the unconditional jump? It's simply confusing and doesn't help across the board.
done for wcsrchr-evex512
>
> $ tail -7 /tmp/wcsrchr-evex.out
>  301: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
>  306: c4 e2 48 f3 d6       blsmsk %esi,%esi
>  30b: 21 f0                 and    %esi,%eax
>  30d: 74 07                 je     316 <__wcsrchr_evex+0x316>
>  30f: 0f bd c0             bsr    %eax,%eax
>  312: 48 8d 04 87           lea    (%rdi,%rax,4),%rax
>  316: c3                   ret
>
> $ tail -7 /tmp/wcsrchr-evex512.out
>  300: c4 e2 73 f7 f6       shrx   %ecx,%esi,%esi
>  305: 85 f6                 test   %esi,%esi
>  307: 0f 84 0b fd ff ff     je     18 <__wcsrchr_evex512+0x18>
>  30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1
>  314: c5 fb 93 c1           kmovd  %k1,%eax
>  318: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
>  31d: e9 18 fd ff ff       jmp    3a <__wcsrchr_evex512+0x3a>
>
> $ tail -7 /tmp/strrchr-evex.out
>  2e8: c4 e2 43 f7 c0       shrx   %edi,%eax,%eax
>  2ed: c4 e2 48 f3 d6       blsmsk %esi,%esi
>  2f2: 21 f0                 and    %esi,%eax
>  2f4: 74 06                 je     2fc <__strrchr_evex+0x2fc>
>  2f6: 0f bd c0             bsr    %eax,%eax
>  2f9: 48 01 f8             add    %rdi,%rax
>  2fc: c3                   ret
>
>  $ tail -7 /tmp/strrchr-evex512.out
>  315: c4 e2 c3 f7 f6       shrx   %rdi,%rsi,%rsi
>  31a: 48 85 f6             test   %rsi,%rsi
>  31d: 0f 84 f5 fc ff ff     je     18 <__strrchr_evex512+0x18>
>  323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1
>  32a: c4 e1 fb 93 c1       kmovq  %k1,%rax
>  32f: c4 e2 c3 f7 c0       shrx   %rdi,%rax,%rax
>  334: e9 04 fd ff ff       jmp    3d <__strrchr_evex512+0x3d>
>
>
>>
>> +    /* 6 bytes from cache-line.  */
>>  # else
>> -       KMOV    %VRDX, %k1
>> +       /* Check if any search CHAR match in range.  */
>> +       blsmsk  %VRSI, %VRSI
>> +       and     %VRSI, %VRAX
>> +       jz      L(ret2)
>> +       bsr     %VRAX, %VRAX
>> +#  ifdef USE_AS_WCSRCHR
>> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>> +#  else
>> +       addq    %rdi, %rax
>> +#  endif
>> +L(ret2):
>> +       ret
>> +       /* 3 bytes from cache-line.  */
>>  # endif
>> -
>> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
>
>
> Please remove the VCOMPRESS definition as it's not used.
done.
>
>>
>> -       /* We could technically just jmp back after the vpcompress but
>> -          it doesn't save any 16-byte blocks.  */
>> -       shrx    %SHIFT_REG, %VRSI, %VRSI
>> -       test    %VRSI, %VRSI
>> -       jnz     L(page_cross_return)
>> -       jmp     L(page_cross_continue)
>> -       /* 1-byte from cache line.  */
>>  END(STRRCHR)
>>  #endif
>> --
>> 2.34.1
>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2] x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-01 22:16 x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S Noah Goldstein
  2023-11-07 17:51 ` Noah Goldstein
  2023-11-08 19:04 ` Sunil Pandey
@ 2023-11-08 21:33 ` Noah Goldstein
  2023-11-09  0:10   ` Sunil Pandey
  2023-11-09 22:49 ` [PATCH v3] " Noah Goldstein
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2023-11-08 21:33 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
was missing the CPU_FEATURE checks for VBMI2 in the
ifunc/ifunc-impl-list.

The fix is either to add those checks or change the logic to not use
`vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
implementation is usable on SKX.

New implementation is a bit slower, but this is in a cold path so its
probably okay.
---
 sysdeps/x86_64/multiarch/strrchr-evex-base.S | 66 +++++++++++++-------
 1 file changed, 43 insertions(+), 23 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
index cd6a0a870a..7fb0a6a543 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
@@ -35,7 +35,6 @@
 #  define CHAR_SIZE	4
 #  define VPCMP		vpcmpd
 #  define VPMIN		vpminud
-#  define VPCOMPRESS	vpcompressd
 #  define VPTESTN	vptestnmd
 #  define VPTEST	vptestmd
 #  define VPBROADCAST	vpbroadcastd
@@ -46,7 +45,6 @@
 #  define CHAR_SIZE	1
 #  define VPCMP		vpcmpb
 #  define VPMIN		vpminub
-#  define VPCOMPRESS	vpcompressb
 #  define VPTESTN	vptestnmb
 #  define VPTEST	vptestmb
 #  define VPBROADCAST	vpbroadcastb
@@ -71,7 +69,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jg	L(cross_page_boundary)
-
+L(page_cross_continue):
 	VMOVU	(%rdi), %VMM(1)
 	/* k0 has a 1 for each zero CHAR in YMM1.  */
 	VPTESTN	%VMM(1), %VMM(1), %k0
@@ -79,10 +77,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	test	%VGPR(rsi), %VGPR(rsi)
 	jz	L(aligned_more)
 	/* fallthrough: zero CHAR in first VEC.  */
-L(page_cross_return):
+
 	/* K1 has a 1 for each search CHAR match in VEC(1).  */
 	VPCMPEQ	%VMATCH, %VMM(1), %k1
 	KMOV	%k1, %VGPR(rax)
+L(page_cross_return):
 	/* Build mask up until first zero CHAR (used to mask of
 	   potential search CHAR matches past the end of the string).  */
 	blsmsk	%VGPR(rsi), %VGPR(rsi)
@@ -167,7 +166,6 @@ L(first_vec_x1_return):
 
 	.p2align 4,, 12
 L(aligned_more):
-L(page_cross_continue):
 	/* Need to keep original pointer incase VEC(1) has last match.  */
 	movq	%rdi, %rsi
 	andq	$-VEC_SIZE, %rdi
@@ -340,34 +338,56 @@ L(return_new_match_ret):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 4
 L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer will all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guaranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
 	xorq	%rdi, %rax
-	mov	$-1, %VRDX
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
-	VPTESTN	%VMM(6), %VMM(6), %k0
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
 	KMOV	%k0, %VRSI
 
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
 	movl	%edi, %ecx
-	and	$(VEC_SIZE - 1), %ecx
+	andl	$(VEC_SIZE - 1), %ecx
 	shrl	$2, %ecx
 # endif
-	shlx	%SHIFT_REG, %VRDX, %VRDX
+	shrx	%SHIFT_REG, %VRSI, %VRSI
 
-# ifdef USE_AS_WCSRCHR
-	kmovw	%edx, %k1
+	test	%VRSI, %VRSI
+	jz	L(page_cross_continue)
+
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrx	%SHIFT_REG, %VRAX, %VRAX
+	/* For strrchr VEC_SIZE == 64 we are just at the end of a cacheline
+	   here so to save code-size just re-use return logic for first
+	   VEC. This is relatively cold code (page cross).  */
+# if VEC_SIZE == 64 && !(defined USE_AS_WCSRCHR)
+	jmp	L(page_cross_return)
+    /* 6 bytes from cache-line.  */
 # else
-	KMOV	%VRDX, %k1
+	/* Check if any search CHAR match in range.  */
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
+	jz	L(ret2)
+	bsr	%VRAX, %VRAX
+#  ifdef USE_AS_WCSRCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	addq	%rdi, %rax
+#  endif
+L(ret2):
+	ret
+	/* 3 bytes from cache-line.  */
 # endif
-
-	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
-	/* We could technically just jmp back after the vpcompress but
-	   it doesn't save any 16-byte blocks.  */
-	shrx	%SHIFT_REG, %VRSI, %VRSI
-	test	%VRSI, %VRSI
-	jnz	L(page_cross_return)
-	jmp	L(page_cross_continue)
-	/* 1-byte from cache line.  */
 END(STRRCHR)
 #endif
-- 
2.34.1
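
For readers less used to the idiom, here is a rough C model of the
pointer arithmetic that the new L(cross_page_boundary) comment above
describes.  It is illustrative only: the function name is invented for
the sketch, PAGE_SIZE/VEC_SIZE are fixed values here, and the real code
of course stays in assembly and works on mask registers.

```
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  64	/* 32 for the non-512 evex variant.  */

/* Model of `andl $(PAGE_SIZE - 1), %eax`, `xorq %rdi, %rax` and the
   (PAGE_SIZE - VEC_SIZE) displacement: clear the page-offset bits of
   src to get the page base, then add the offset of the last aligned
   vector in that page.  A VEC_SIZE load from the result can never
   touch the next page, and it always covers src because this path is
   only taken when src is within VEC_SIZE of the page end.  */
static inline const char *
last_vec_before_page_cross (const char *src)
{
  uintptr_t page_off = (uintptr_t) src & (PAGE_SIZE - 1);	/* eax */
  uintptr_t page_base = (uintptr_t) src ^ page_off;		/* rax */
  return (const char *) (page_base + PAGE_SIZE - VEC_SIZE);
}
```

The `jg L(cross_page_boundary)` check at the top of the function is
what guarantees this path only runs when src is within VEC_SIZE of the
page end.
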


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2] x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-08 21:33 ` [PATCH v2] " Noah Goldstein
@ 2023-11-09  0:10   ` Sunil Pandey
  2023-11-09  1:51     ` Noah Goldstein
  0 siblings, 1 reply; 21+ messages in thread
From: Sunil Pandey @ 2023-11-09  0:10 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, hjl.tools, carlos

[-- Attachment #1: Type: text/plain, Size: 5691 bytes --]

On Wed, Nov 8, 2023 at 2:36 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> was missing the CPU_FEATURE checks for VBMI2 in the
> ifunc/ifunc-impl-list.
>
> The fix is either to add those checks or change the logic to not use
> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> implementation is usable on SKX.
>
> New implementation is a bit slower, but this is in a cold path so its
> probably okay.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 66 +++++++++++++-------
>  1 file changed, 43 insertions(+), 23 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> index cd6a0a870a..7fb0a6a543 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> @@ -35,7 +35,6 @@
>  #  define CHAR_SIZE    4
>  #  define VPCMP                vpcmpd
>  #  define VPMIN                vpminud
> -#  define VPCOMPRESS   vpcompressd
>  #  define VPTESTN      vptestnmd
>  #  define VPTEST       vptestmd
>  #  define VPBROADCAST  vpbroadcastd
> @@ -46,7 +45,6 @@
>  #  define CHAR_SIZE    1
>  #  define VPCMP                vpcmpb
>  #  define VPMIN                vpminub
> -#  define VPCOMPRESS   vpcompressb
>  #  define VPTESTN      vptestnmb
>  #  define VPTEST       vptestmb
>  #  define VPBROADCAST  vpbroadcastb
> @@ -71,7 +69,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         jg      L(cross_page_boundary)
> -
> +L(page_cross_continue):
>         VMOVU   (%rdi), %VMM(1)
>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>         VPTESTN %VMM(1), %VMM(1), %k0
> @@ -79,10 +77,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         test    %VGPR(rsi), %VGPR(rsi)
>         jz      L(aligned_more)
>         /* fallthrough: zero CHAR in first VEC.  */
> -L(page_cross_return):
> +
>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>         VPCMPEQ %VMATCH, %VMM(1), %k1
>         KMOV    %k1, %VGPR(rax)
> +L(page_cross_return):
>         /* Build mask up until first zero CHAR (used to mask of
>            potential search CHAR matches past the end of the string).  */
>         blsmsk  %VGPR(rsi), %VGPR(rsi)
> @@ -167,7 +166,6 @@ L(first_vec_x1_return):
>
>         .p2align 4,, 12
>  L(aligned_more):
> -L(page_cross_continue):
>         /* Need to keep original pointer incase VEC(1) has last match.  */
>         movq    %rdi, %rsi
>         andq    $-VEC_SIZE, %rdi
> @@ -340,34 +338,56 @@ L(return_new_match_ret):
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 4
>  L(cross_page_boundary):
> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> +          rax` sets pointer will all page offset bits cleared so
> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> +          before page cross (guaranteed to be safe to read). Doing this
> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> +          a bit of code size.  */
>         xorq    %rdi, %rax
> -       mov     $-1, %VRDX
> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> -       VPTESTN %VMM(6), %VMM(6), %k0
> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
>         KMOV    %k0, %VRSI
>
> +       /* Shift out zero CHAR matches that are before the beginning of
> +          src (rdi).  */
>  # ifdef USE_AS_WCSRCHR
>         movl    %edi, %ecx
> -       and     $(VEC_SIZE - 1), %ecx
> +       andl    $(VEC_SIZE - 1), %ecx
>         shrl    $2, %ecx
>  # endif
> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>
> -# ifdef USE_AS_WCSRCHR
> -       kmovw   %edx, %k1
> +       test    %VRSI, %VRSI
> +       jz      L(page_cross_continue)
> +
> +       /* Found zero CHAR so need to test for search CHAR.  */
> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       /* Shift out search CHAR matches that are before the beginning of
> +          src (rdi).  */
> +       shrx    %SHIFT_REG, %VRAX, %VRAX
> +       /* For strrchr VEC_SIZE == 64 we are just at the end of a cacheline
> +          here so to save code-size just re-use return logic for first
> +          VEC. This is relatively cold code (page cross).  */
> +# if VEC_SIZE == 64 && !(defined USE_AS_WCSRCHR)
> +       jmp     L(page_cross_return)
>

Still, an unconditional jump is hard to read and understand unless
someone builds it and analyzes the memory layout.


> +    /* 6 bytes from cache-line.  */
>  # else
> -       KMOV    %VRDX, %k1
> +       /* Check if any search CHAR match in range.  */
> +       blsmsk  %VRSI, %VRSI
> +       and     %VRSI, %VRAX
> +       jz      L(ret2)
> +       bsr     %VRAX, %VRAX
> +#  ifdef USE_AS_WCSRCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       addq    %rdi, %rax
> +#  endif
> +L(ret2):
> +       ret
> +       /* 3 bytes from cache-line.  */
>  # endif
> -
> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> -       /* We could technically just jmp back after the vpcompress but
> -          it doesn't save any 16-byte blocks.  */
> -       shrx    %SHIFT_REG, %VRSI, %VRSI
> -       test    %VRSI, %VRSI
> -       jnz     L(page_cross_return)
> -       jmp     L(page_cross_continue)
> -       /* 1-byte from cache line.  */
>  END(STRRCHR)
>  #endif
> --
> 2.34.1
>
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2] x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-09  0:10   ` Sunil Pandey
@ 2023-11-09  1:51     ` Noah Goldstein
  0 siblings, 0 replies; 21+ messages in thread
From: Noah Goldstein @ 2023-11-09  1:51 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: libc-alpha, hjl.tools, carlos

On Wed, Nov 8, 2023 at 6:10 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Wed, Nov 8, 2023 at 2:36 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
>> was missing the CPU_FEATURE checks for VBMI2 in the
>> ifunc/ifunc-impl-list.
>>
>> The fix is either to add those checks or change the logic to not use
>> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
>> implementation is usable on SKX.
>>
>> New implementation is a bit slower, but this is in a cold path so its
>> probably okay.
>> ---
>>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 66 +++++++++++++-------
>>  1 file changed, 43 insertions(+), 23 deletions(-)
>>
>> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> index cd6a0a870a..7fb0a6a543 100644
>> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> @@ -35,7 +35,6 @@
>>  #  define CHAR_SIZE    4
>>  #  define VPCMP                vpcmpd
>>  #  define VPMIN                vpminud
>> -#  define VPCOMPRESS   vpcompressd
>>  #  define VPTESTN      vptestnmd
>>  #  define VPTEST       vptestmd
>>  #  define VPBROADCAST  vpbroadcastd
>> @@ -46,7 +45,6 @@
>>  #  define CHAR_SIZE    1
>>  #  define VPCMP                vpcmpb
>>  #  define VPMIN                vpminub
>> -#  define VPCOMPRESS   vpcompressb
>>  #  define VPTESTN      vptestnmb
>>  #  define VPTEST       vptestmb
>>  #  define VPBROADCAST  vpbroadcastb
>> @@ -71,7 +69,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         andl    $(PAGE_SIZE - 1), %eax
>>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>>         jg      L(cross_page_boundary)
>> -
>> +L(page_cross_continue):
>>         VMOVU   (%rdi), %VMM(1)
>>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>>         VPTESTN %VMM(1), %VMM(1), %k0
>> @@ -79,10 +77,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         test    %VGPR(rsi), %VGPR(rsi)
>>         jz      L(aligned_more)
>>         /* fallthrough: zero CHAR in first VEC.  */
>> -L(page_cross_return):
>> +
>>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>>         VPCMPEQ %VMATCH, %VMM(1), %k1
>>         KMOV    %k1, %VGPR(rax)
>> +L(page_cross_return):
>>         /* Build mask up until first zero CHAR (used to mask of
>>            potential search CHAR matches past the end of the string).  */
>>         blsmsk  %VGPR(rsi), %VGPR(rsi)
>> @@ -167,7 +166,6 @@ L(first_vec_x1_return):
>>
>>         .p2align 4,, 12
>>  L(aligned_more):
>> -L(page_cross_continue):
>>         /* Need to keep original pointer incase VEC(1) has last match.  */
>>         movq    %rdi, %rsi
>>         andq    $-VEC_SIZE, %rdi
>> @@ -340,34 +338,56 @@ L(return_new_match_ret):
>>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>>         ret
>>
>> -       .p2align 4,, 4
>>  L(cross_page_boundary):
>> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
>> +          rax` sets pointer will all page offset bits cleared so
>> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
>> +          before page cross (guaranteed to be safe to read). Doing this
>> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
>> +          a bit of code size.  */
>>         xorq    %rdi, %rax
>> -       mov     $-1, %VRDX
>> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
>> -       VPTESTN %VMM(6), %VMM(6), %k0
>> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
>> +       VPTESTN %VMM(1), %VMM(1), %k0
>>         KMOV    %k0, %VRSI
>>
>> +       /* Shift out zero CHAR matches that are before the beginning of
>> +          src (rdi).  */
>>  # ifdef USE_AS_WCSRCHR
>>         movl    %edi, %ecx
>> -       and     $(VEC_SIZE - 1), %ecx
>> +       andl    $(VEC_SIZE - 1), %ecx
>>         shrl    $2, %ecx
>>  # endif
>> -       shlx    %SHIFT_REG, %VRDX, %VRDX
>> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>>
>> -# ifdef USE_AS_WCSRCHR
>> -       kmovw   %edx, %k1
>> +       test    %VRSI, %VRSI
>> +       jz      L(page_cross_continue)
>> +
>> +       /* Found zero CHAR so need to test for search CHAR.  */
>> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
>> +       KMOV    %k1, %VRAX
>> +       /* Shift out search CHAR matches that are before the beginning of
>> +          src (rdi).  */
>> +       shrx    %SHIFT_REG, %VRAX, %VRAX
>> +       /* For strrchr VEC_SIZE == 64 we are just at the end of a cacheline
>> +          here so to save code-size just re-use return logic for first
>> +          VEC. This is relatively cold code (page cross).  */
>> +# if VEC_SIZE == 64 && !(defined USE_AS_WCSRCHR)
>> +       jmp     L(page_cross_return)
>
>
> Still, an unconditional jump is hard to read and understand unless someone builds it and analyzes the memory layout.
>
That's why we have the comment.
>>
>> +    /* 6 bytes from cache-line.  */
>>  # else
>> -       KMOV    %VRDX, %k1
>> +       /* Check if any search CHAR match in range.  */
>> +       blsmsk  %VRSI, %VRSI
>> +       and     %VRSI, %VRAX
>> +       jz      L(ret2)
>> +       bsr     %VRAX, %VRAX
>> +#  ifdef USE_AS_WCSRCHR
>> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>> +#  else
>> +       addq    %rdi, %rax
>> +#  endif
>> +L(ret2):
>> +       ret
>> +       /* 3 bytes from cache-line.  */
>>  # endif
>> -
>> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
>> -       /* We could technically just jmp back after the vpcompress but
>> -          it doesn't save any 16-byte blocks.  */
>> -       shrx    %SHIFT_REG, %VRSI, %VRSI
>> -       test    %VRSI, %VRSI
>> -       jnz     L(page_cross_return)
>> -       jmp     L(page_cross_continue)
>> -       /* 1-byte from cache line.  */
>>  END(STRRCHR)
>>  #endif
>> --
>> 2.34.1
>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v3] x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-01 22:16 x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S Noah Goldstein
                   ` (2 preceding siblings ...)
  2023-11-08 21:33 ` [PATCH v2] " Noah Goldstein
@ 2023-11-09 22:49 ` Noah Goldstein
  2023-11-10  3:10   ` Sunil Pandey
  2023-11-11  3:20 ` [PATCH v4] " Noah Goldstein
  2023-11-12 20:51 ` Noah Goldstein
  5 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2023-11-09 22:49 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
was missing the CPU_FEATURE checks for VBMI2 in the
ifunc/ifunc-impl-list.

The fix is either to add those checks or change the logic to not use
`vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
implementation is usable on SKX.

New implementation is a bit slower, but this is in a cold path so its
probably okay.
---
 sysdeps/x86_64/multiarch/strrchr-evex-base.S | 61 ++++++++++++--------
 1 file changed, 37 insertions(+), 24 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
index cd6a0a870a..eb5f1f855a 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
@@ -35,7 +35,6 @@
 #  define CHAR_SIZE	4
 #  define VPCMP		vpcmpd
 #  define VPMIN		vpminud
-#  define VPCOMPRESS	vpcompressd
 #  define VPTESTN	vptestnmd
 #  define VPTEST	vptestmd
 #  define VPBROADCAST	vpbroadcastd
@@ -46,7 +45,6 @@
 #  define CHAR_SIZE	1
 #  define VPCMP		vpcmpb
 #  define VPMIN		vpminub
-#  define VPCOMPRESS	vpcompressb
 #  define VPTESTN	vptestnmb
 #  define VPTEST	vptestmb
 #  define VPBROADCAST	vpbroadcastb
@@ -71,7 +69,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jg	L(cross_page_boundary)
-
+L(page_cross_continue):
 	VMOVU	(%rdi), %VMM(1)
 	/* k0 has a 1 for each zero CHAR in YMM1.  */
 	VPTESTN	%VMM(1), %VMM(1), %k0
@@ -79,10 +77,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	test	%VGPR(rsi), %VGPR(rsi)
 	jz	L(aligned_more)
 	/* fallthrough: zero CHAR in first VEC.  */
-L(page_cross_return):
+
 	/* K1 has a 1 for each search CHAR match in VEC(1).  */
 	VPCMPEQ	%VMATCH, %VMM(1), %k1
 	KMOV	%k1, %VGPR(rax)
+L(page_cross_return):
 	/* Build mask up until first zero CHAR (used to mask of
 	   potential search CHAR matches past the end of the string).  */
 	blsmsk	%VGPR(rsi), %VGPR(rsi)
@@ -167,7 +166,6 @@ L(first_vec_x1_return):
 
 	.p2align 4,, 12
 L(aligned_more):
-L(page_cross_continue):
 	/* Need to keep original pointer incase VEC(1) has last match.  */
 	movq	%rdi, %rsi
 	andq	$-VEC_SIZE, %rdi
@@ -340,34 +338,49 @@ L(return_new_match_ret):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 4
 L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer will all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guaranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
 	xorq	%rdi, %rax
-	mov	$-1, %VRDX
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
-	VPTESTN	%VMM(6), %VMM(6), %k0
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
 	KMOV	%k0, %VRSI
 
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
 	movl	%edi, %ecx
-	and	$(VEC_SIZE - 1), %ecx
+	andl	$(VEC_SIZE - 1), %ecx
 	shrl	$2, %ecx
 # endif
-	shlx	%SHIFT_REG, %VRDX, %VRDX
-
-# ifdef USE_AS_WCSRCHR
-	kmovw	%edx, %k1
-# else
-	KMOV	%VRDX, %k1
-# endif
-
-	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
-	/* We could technically just jmp back after the vpcompress but
-	   it doesn't save any 16-byte blocks.  */
 	shrx	%SHIFT_REG, %VRSI, %VRSI
+
 	test	%VRSI, %VRSI
-	jnz	L(page_cross_return)
-	jmp	L(page_cross_continue)
-	/* 1-byte from cache line.  */
+	jz	L(page_cross_continue)
+
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrx	%SHIFT_REG, %VRAX, %VRAX
+	/* Check if any search CHAR match in range.  */
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
+	jz	L(ret2)
+	bsr	%VRAX, %VRAX
+#  ifdef USE_AS_WCSRCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	addq	%rdi, %rax
+#  endif
+L(ret2):
+	ret
+	/* 3 bytes from cache-line for evex.
+	   Crosses cache line for evex512.  */
 END(STRRCHR)
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-09 22:49 ` [PATCH v3] " Noah Goldstein
@ 2023-11-10  3:10   ` Sunil Pandey
  2023-11-11  3:21     ` Noah Goldstein
  0 siblings, 1 reply; 21+ messages in thread
From: Sunil Pandey @ 2023-11-10  3:10 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, hjl.tools, carlos

[-- Attachment #1: Type: text/plain, Size: 6106 bytes --]

On Thu, Nov 9, 2023 at 2:49 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> was missing the CPU_FEATURE checks for VBMI2 in the
> ifunc/ifunc-impl-list.
>
> The fix is either to add those checks or change the logic to not use
> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> implementation is usable on SKX.
>
> New implementation is a bit slower, but this is in a cold path so its
> probably okay.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 61 ++++++++++++--------
>  1 file changed, 37 insertions(+), 24 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> index cd6a0a870a..eb5f1f855a 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> @@ -35,7 +35,6 @@
>  #  define CHAR_SIZE    4
>  #  define VPCMP                vpcmpd
>  #  define VPMIN                vpminud
> -#  define VPCOMPRESS   vpcompressd
>  #  define VPTESTN      vptestnmd
>  #  define VPTEST       vptestmd
>  #  define VPBROADCAST  vpbroadcastd
> @@ -46,7 +45,6 @@
>  #  define CHAR_SIZE    1
>  #  define VPCMP                vpcmpb
>  #  define VPMIN                vpminub
> -#  define VPCOMPRESS   vpcompressb
>  #  define VPTESTN      vptestnmb
>  #  define VPTEST       vptestmb
>  #  define VPBROADCAST  vpbroadcastb
> @@ -71,7 +69,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         jg      L(cross_page_boundary)
> -
> +L(page_cross_continue):
>         VMOVU   (%rdi), %VMM(1)
>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>         VPTESTN %VMM(1), %VMM(1), %k0
> @@ -79,10 +77,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         test    %VGPR(rsi), %VGPR(rsi)
>         jz      L(aligned_more)
>         /* fallthrough: zero CHAR in first VEC.  */
> -L(page_cross_return):
> +
>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>         VPCMPEQ %VMATCH, %VMM(1), %k1
>         KMOV    %k1, %VGPR(rax)
> +L(page_cross_return):
>

Unused label.


>         /* Build mask up until first zero CHAR (used to mask of
>            potential search CHAR matches past the end of the string).  */
>         blsmsk  %VGPR(rsi), %VGPR(rsi)
> @@ -167,7 +166,6 @@ L(first_vec_x1_return):
>
>         .p2align 4,, 12
>  L(aligned_more):
> -L(page_cross_continue):
>         /* Need to keep original pointer incase VEC(1) has last match.  */
>         movq    %rdi, %rsi
>         andq    $-VEC_SIZE, %rdi
> @@ -340,34 +338,49 @@ L(return_new_match_ret):
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 4
>  L(cross_page_boundary):
> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> +          rax` sets pointer will all page offset bits cleared so
> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> +          before page cross (guaranteed to be safe to read). Doing this
> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> +          a bit of code size.  */
>         xorq    %rdi, %rax
> -       mov     $-1, %VRDX
> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> -       VPTESTN %VMM(6), %VMM(6), %k0
> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
>         KMOV    %k0, %VRSI
>
> +       /* Shift out zero CHAR matches that are before the beginning of
> +          src (rdi).  */
>  # ifdef USE_AS_WCSRCHR
>         movl    %edi, %ecx
> -       and     $(VEC_SIZE - 1), %ecx
> +       andl    $(VEC_SIZE - 1), %ecx
>         shrl    $2, %ecx
>  # endif
> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> -
> -# ifdef USE_AS_WCSRCHR
> -       kmovw   %edx, %k1
> -# else
> -       KMOV    %VRDX, %k1
> -# endif
> -
> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> -       /* We could technically just jmp back after the vpcompress but
> -          it doesn't save any 16-byte blocks.  */
>         shrx    %SHIFT_REG, %VRSI, %VRSI
> +
>         test    %VRSI, %VRSI
> -       jnz     L(page_cross_return)
> -       jmp     L(page_cross_continue)
> -       /* 1-byte from cache line.  */
> +       jz      L(page_cross_continue)
> +
> +       /* Found zero CHAR so need to test for search CHAR.  */
> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       /* Shift out search CHAR matches that are before the beginning of
> +          src (rdi).  */
> +       shrx    %SHIFT_REG, %VRAX, %VRAX
> +       /* Check if any search CHAR match in range.  */
> +       blsmsk  %VRSI, %VRSI
> +       and     %VRSI, %VRAX
> +       jz      L(ret2)
> +       bsr     %VRAX, %VRAX
> +#  ifdef USE_AS_WCSRCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       addq    %rdi, %rax
> +#  endif
> +L(ret2):
> +       ret
>

I modified the instructions so that the page-crossing logic fits in a
cache line for strrchr-evex512:

 300: 48 31 f8             xor    %rdi,%rax
 303: 62 e1 fe 48 6f 48 3f vmovdqu64 0xfc0(%rax),%zmm17
 30a: 62 b2 76 40 26 c1     vptestnmb %zmm17,%zmm17,%k0
 310: c4 e1 fb 93 f0       kmovq  %k0,%rsi
 315: 89 f9                 mov    %edi,%ecx
 317: 48 d3 ee             shr    %cl,%rsi
 31a: 0f 84 f8 fc ff ff     je     18 <__strrchr_evex512+0x18>
 320: 62 b1 75 40 74 c8     vpcmpeqb %zmm16,%zmm17,%k1
 326: c4 e1 fb 93 c1       kmovq  %k1,%rax
 32b: 48 d3 e8             shr    %cl,%rax
 32e: c4 e2 c8 f3 d6       blsmsk %rsi,%rsi
 333: 48 21 f0             and    %rsi,%rax
 336: 74 07                 je     33f <__strrchr_evex512+0x33f>
 338: 48 0f bd c0           bsr    %rax,%rax
 33c: 48 01 f8             add    %rdi,%rax
 33f: c3                   ret


> +       /* 3 bytes from cache-line for evex.
> +          Crosses cache line for evex512.  */
>  END(STRRCHR)
>  #endif
> --
> 2.34.1
>
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v4] x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-01 22:16 x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S Noah Goldstein
                   ` (3 preceding siblings ...)
  2023-11-09 22:49 ` [PATCH v3] " Noah Goldstein
@ 2023-11-11  3:20 ` Noah Goldstein
  2023-11-12 20:51 ` Noah Goldstein
  5 siblings, 0 replies; 21+ messages in thread
From: Noah Goldstein @ 2023-11-11  3:20 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
was missing the CPU_FEATURE checks for VBMI2 in the
ifunc/ifunc-impl-list.

The fix is either to add those checks or change the logic to not use
`vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
implementation is usable on SKX.

New implementation is a bit slower, but this is in a cold path so its
probably okay.
---
 sysdeps/x86_64/multiarch/strrchr-evex-base.S | 60 ++++++++++++--------
 1 file changed, 36 insertions(+), 24 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
index cd6a0a870a..9cab21c09c 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
@@ -35,7 +35,6 @@
 #  define CHAR_SIZE	4
 #  define VPCMP		vpcmpd
 #  define VPMIN		vpminud
-#  define VPCOMPRESS	vpcompressd
 #  define VPTESTN	vptestnmd
 #  define VPTEST	vptestmd
 #  define VPBROADCAST	vpbroadcastd
@@ -46,7 +45,6 @@
 #  define CHAR_SIZE	1
 #  define VPCMP		vpcmpb
 #  define VPMIN		vpminub
-#  define VPCOMPRESS	vpcompressb
 #  define VPTESTN	vptestnmb
 #  define VPTEST	vptestmb
 #  define VPBROADCAST	vpbroadcastb
@@ -71,7 +69,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jg	L(cross_page_boundary)
-
+L(page_cross_continue):
 	VMOVU	(%rdi), %VMM(1)
 	/* k0 has a 1 for each zero CHAR in YMM1.  */
 	VPTESTN	%VMM(1), %VMM(1), %k0
@@ -79,7 +77,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	test	%VGPR(rsi), %VGPR(rsi)
 	jz	L(aligned_more)
 	/* fallthrough: zero CHAR in first VEC.  */
-L(page_cross_return):
+
 	/* K1 has a 1 for each search CHAR match in VEC(1).  */
 	VPCMPEQ	%VMATCH, %VMM(1), %k1
 	KMOV	%k1, %VGPR(rax)
@@ -167,7 +165,6 @@ L(first_vec_x1_return):
 
 	.p2align 4,, 12
 L(aligned_more):
-L(page_cross_continue):
 	/* Need to keep original pointer incase VEC(1) has last match.  */
 	movq	%rdi, %rsi
 	andq	$-VEC_SIZE, %rdi
@@ -340,34 +337,49 @@ L(return_new_match_ret):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 4
 L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer will all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guaranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
 	xorq	%rdi, %rax
-	mov	$-1, %VRDX
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
-	VPTESTN	%VMM(6), %VMM(6), %k0
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
 	KMOV	%k0, %VRSI
 
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
 	movl	%edi, %ecx
-	and	$(VEC_SIZE - 1), %ecx
+	andl	$(VEC_SIZE - 1), %ecx
 	shrl	$2, %ecx
 # endif
-	shlx	%SHIFT_REG, %VRDX, %VRDX
-
-# ifdef USE_AS_WCSRCHR
-	kmovw	%edx, %k1
-# else
-	KMOV	%VRDX, %k1
-# endif
-
-	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
-	/* We could technically just jmp back after the vpcompress but
-	   it doesn't save any 16-byte blocks.  */
 	shrx	%SHIFT_REG, %VRSI, %VRSI
+
 	test	%VRSI, %VRSI
-	jnz	L(page_cross_return)
-	jmp	L(page_cross_continue)
-	/* 1-byte from cache line.  */
+	jz	L(page_cross_continue)
+
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrx	%SHIFT_REG, %VRAX, %VRAX
+	/* Check if any search CHAR match in range.  */
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
+	jz	L(ret2)
+	bsr	%VRAX, %VRAX
+#  ifdef USE_AS_WCSRCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	addq	%rdi, %rax
+#  endif
+L(ret2):
+	ret
+	/* 3 bytes from cache-line for evex.
+	   Crosses cache line for evex512.  */
 END(STRRCHR)
 #endif
-- 
2.34.1
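
Similarly, a rough C model of the mask handling in the cross-page tail
above: the shrx that drops CHARs before src and the blsmsk that masks
off matches past the first NUL.  Again this is illustrative only; the
names are invented, it models the byte (strrchr) flavour where the
shift count is just the low bits of src, and it uses a GCC builtin in
place of bsr.

```
#include <stddef.h>
#include <stdint.h>

#define VEC_SIZE 64	/* 32 for the non-512 evex variant.  */

/* zero_mask / match_mask have one bit per byte of the last aligned
   vector before the page end: bit i set means vec[i] is NUL / equals
   the search CHAR.  Returns the last match at or before the first NUL,
   or NULL where the asm falls through to L(page_cross_continue) (no
   NUL in this vector) or L(ret2) (NUL but no match in range).  */
static const char *
page_cross_tail (const char *src, uint64_t zero_mask, uint64_t match_mask)
{
  unsigned int shift = (uintptr_t) src & (VEC_SIZE - 1);

  /* shrx %SHIFT_REG, ...: drop CHARs that precede src.  */
  zero_mask >>= shift;
  if (zero_mask == 0)
    return NULL;	/* No NUL before the page cross: keep scanning.  */

  match_mask >>= shift;

  /* blsmsk: set every bit up to and including the lowest set bit,
     i.e. a mask covering the string up to its terminating NUL.  */
  uint64_t up_to_nul = zero_mask ^ (zero_mask - 1);

  match_mask &= up_to_nul;	/* Drop matches past the end of the string.  */
  if (match_mask == 0)
    return NULL;		/* L(ret2).  */

  /* bsr + addq %rdi, %rax: highest remaining match bit, as a pointer.  */
  return src + (63 - __builtin_clzll (match_mask));
}
```

For wcsrchr the only differences are that the shift count is divided by
4 (CHAR_SIZE) and the final add becomes a leaq scaled by CHAR_SIZE, as
in the patch.
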


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-10  3:10   ` Sunil Pandey
@ 2023-11-11  3:21     ` Noah Goldstein
  2023-11-12 17:56       ` Sunil Pandey
  0 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2023-11-11  3:21 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: libc-alpha, hjl.tools, carlos

On Thu, Nov 9, 2023 at 9:11 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Thu, Nov 9, 2023 at 2:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
>> was missing the CPU_FEATURE checks for VBMI2 in the
>> ifunc/ifunc-impl-list.
>>
>> The fix is either to add those checks or change the logic to not use
>> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
>> implementation is usable on SKX.
>>
>> New implementation is a bit slower, but this is in a cold path so its
>> probably okay.
>> ---
>>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 61 ++++++++++++--------
>>  1 file changed, 37 insertions(+), 24 deletions(-)
>>
>> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> index cd6a0a870a..eb5f1f855a 100644
>> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> @@ -35,7 +35,6 @@
>>  #  define CHAR_SIZE    4
>>  #  define VPCMP                vpcmpd
>>  #  define VPMIN                vpminud
>> -#  define VPCOMPRESS   vpcompressd
>>  #  define VPTESTN      vptestnmd
>>  #  define VPTEST       vptestmd
>>  #  define VPBROADCAST  vpbroadcastd
>> @@ -46,7 +45,6 @@
>>  #  define CHAR_SIZE    1
>>  #  define VPCMP                vpcmpb
>>  #  define VPMIN                vpminub
>> -#  define VPCOMPRESS   vpcompressb
>>  #  define VPTESTN      vptestnmb
>>  #  define VPTEST       vptestmb
>>  #  define VPBROADCAST  vpbroadcastb
>> @@ -71,7 +69,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         andl    $(PAGE_SIZE - 1), %eax
>>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>>         jg      L(cross_page_boundary)
>> -
>> +L(page_cross_continue):
>>         VMOVU   (%rdi), %VMM(1)
>>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>>         VPTESTN %VMM(1), %VMM(1), %k0
>> @@ -79,10 +77,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         test    %VGPR(rsi), %VGPR(rsi)
>>         jz      L(aligned_more)
>>         /* fallthrough: zero CHAR in first VEC.  */
>> -L(page_cross_return):
>> +
>>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>>         VPCMPEQ %VMATCH, %VMM(1), %k1
>>         KMOV    %k1, %VGPR(rax)
>> +L(page_cross_return):
>
>
> Unused label.
>
>>
>>         /* Build mask up until first zero CHAR (used to mask of
>>            potential search CHAR matches past the end of the string).  */
>>         blsmsk  %VGPR(rsi), %VGPR(rsi)
>> @@ -167,7 +166,6 @@ L(first_vec_x1_return):
>>
>>         .p2align 4,, 12
>>  L(aligned_more):
>> -L(page_cross_continue):
>>         /* Need to keep original pointer incase VEC(1) has last match.  */
>>         movq    %rdi, %rsi
>>         andq    $-VEC_SIZE, %rdi
>> @@ -340,34 +338,49 @@ L(return_new_match_ret):
>>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>>         ret
>>
>> -       .p2align 4,, 4
>>  L(cross_page_boundary):
>> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
>> +          rax` sets pointer will all page offset bits cleared so
>> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
>> +          before page cross (guaranteed to be safe to read). Doing this
>> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
>> +          a bit of code size.  */
>>         xorq    %rdi, %rax
>> -       mov     $-1, %VRDX
>> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
>> -       VPTESTN %VMM(6), %VMM(6), %k0
>> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
>> +       VPTESTN %VMM(1), %VMM(1), %k0
>>         KMOV    %k0, %VRSI
>>
>> +       /* Shift out zero CHAR matches that are before the beginning of
>> +          src (rdi).  */
>>  # ifdef USE_AS_WCSRCHR
>>         movl    %edi, %ecx
>> -       and     $(VEC_SIZE - 1), %ecx
>> +       andl    $(VEC_SIZE - 1), %ecx
>>         shrl    $2, %ecx
>>  # endif
>> -       shlx    %SHIFT_REG, %VRDX, %VRDX
>> -
>> -# ifdef USE_AS_WCSRCHR
>> -       kmovw   %edx, %k1
>> -# else
>> -       KMOV    %VRDX, %k1
>> -# endif
>> -
>> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
>> -       /* We could technically just jmp back after the vpcompress but
>> -          it doesn't save any 16-byte blocks.  */
>>         shrx    %SHIFT_REG, %VRSI, %VRSI
>> +
>>         test    %VRSI, %VRSI
>> -       jnz     L(page_cross_return)
>> -       jmp     L(page_cross_continue)
>> -       /* 1-byte from cache line.  */
>> +       jz      L(page_cross_continue)
>> +
>> +       /* Found zero CHAR so need to test for search CHAR.  */
>> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
>> +       KMOV    %k1, %VRAX
>> +       /* Shift out search CHAR matches that are before the beginning of
>> +          src (rdi).  */
>> +       shrx    %SHIFT_REG, %VRAX, %VRAX
>> +       /* Check if any search CHAR match in range.  */
>> +       blsmsk  %VRSI, %VRSI
>> +       and     %VRSI, %VRAX
>> +       jz      L(ret2)
>> +       bsr     %VRAX, %VRAX
>> +#  ifdef USE_AS_WCSRCHR
>> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>> +#  else
>> +       addq    %rdi, %rax
>> +#  endif
>> +L(ret2):
>> +       ret
>
>
> I modified the instructions so that the page-crossing logic fits in a cache line for strrchr-evex512:
>
>  300: 48 31 f8             xor    %rdi,%rax
>  303: 62 e1 fe 48 6f 48 3f vmovdqu64 0xfc0(%rax),%zmm17
>  30a: 62 b2 76 40 26 c1     vptestnmb %zmm17,%zmm17,%k0
>  310: c4 e1 fb 93 f0       kmovq  %k0,%rsi
>  315: 89 f9                 mov    %edi,%ecx
>  317: 48 d3 ee             shr    %cl,%rsi
>  31a: 0f 84 f8 fc ff ff     je     18 <__strrchr_evex512+0x18>
>  320: 62 b1 75 40 74 c8     vpcmpeqb %zmm16,%zmm17,%k1
>  326: c4 e1 fb 93 c1       kmovq  %k1,%rax
>  32b: 48 d3 e8             shr    %cl,%rax
>  32e: c4 e2 c8 f3 d6       blsmsk %rsi,%rsi
>  333: 48 21 f0             and    %rsi,%rax
>  336: 74 07                 je     33f <__strrchr_evex512+0x33f>
>  338: 48 0f bd c0           bsr    %rax,%rax
>  33c: 48 01 f8             add    %rdi,%rax
>  33f: c3                   ret
>
Can update if you want, but it's a lot of ifdefs, LMK.
>>
>> +       /* 3 bytes from cache-line for evex.
>> +          Crosses cache line for evex512.  */
>>  END(STRRCHR)
>>  #endif
>> --
>> 2.34.1
>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-11  3:21     ` Noah Goldstein
@ 2023-11-12 17:56       ` Sunil Pandey
  2023-11-12 20:01         ` Noah Goldstein
  0 siblings, 1 reply; 21+ messages in thread
From: Sunil Pandey @ 2023-11-12 17:56 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, hjl.tools, carlos

[-- Attachment #1: Type: text/plain, Size: 8186 bytes --]

On Fri, Nov 10, 2023 at 7:21 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> On Thu, Nov 9, 2023 at 9:11 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> >
> >
> > On Thu, Nov 9, 2023 at 2:49 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >>
> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> >> was missing the CPU_FEATURE checks for VBMI2 in the
> >> ifunc/ifunc-impl-list.
> >>
> >> The fix is either to add those checks or change the logic to not use
> >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> >> implementation is usable on SKX.
> >>
> >> New implementation is a bit slower, but this is in a cold path so its
> >> probably okay.
> >> ---
> >>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 61 ++++++++++++--------
> >>  1 file changed, 37 insertions(+), 24 deletions(-)
> >>
> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> index cd6a0a870a..eb5f1f855a 100644
> >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> @@ -35,7 +35,6 @@
> >>  #  define CHAR_SIZE    4
> >>  #  define VPCMP                vpcmpd
> >>  #  define VPMIN                vpminud
> >> -#  define VPCOMPRESS   vpcompressd
> >>  #  define VPTESTN      vptestnmd
> >>  #  define VPTEST       vptestmd
> >>  #  define VPBROADCAST  vpbroadcastd
> >> @@ -46,7 +45,6 @@
> >>  #  define CHAR_SIZE    1
> >>  #  define VPCMP                vpcmpb
> >>  #  define VPMIN                vpminub
> >> -#  define VPCOMPRESS   vpcompressb
> >>  #  define VPTESTN      vptestnmb
> >>  #  define VPTEST       vptestmb
> >>  #  define VPBROADCAST  vpbroadcastb
> >> @@ -71,7 +69,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >>         andl    $(PAGE_SIZE - 1), %eax
> >>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> >>         jg      L(cross_page_boundary)
> >> -
> >> +L(page_cross_continue):
> >>         VMOVU   (%rdi), %VMM(1)
> >>         /* k0 has a 1 for each zero CHAR in YMM1.  */
> >>         VPTESTN %VMM(1), %VMM(1), %k0
> >> @@ -79,10 +77,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >>         test    %VGPR(rsi), %VGPR(rsi)
> >>         jz      L(aligned_more)
> >>         /* fallthrough: zero CHAR in first VEC.  */
> >> -L(page_cross_return):
> >> +
> >>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
> >>         VPCMPEQ %VMATCH, %VMM(1), %k1
> >>         KMOV    %k1, %VGPR(rax)
> >> +L(page_cross_return):
> >
> >
> > Unused label.
> >
> >>
> >>         /* Build mask up until first zero CHAR (used to mask of
> >>            potential search CHAR matches past the end of the string).
> */
> >>         blsmsk  %VGPR(rsi), %VGPR(rsi)
> >> @@ -167,7 +166,6 @@ L(first_vec_x1_return):
> >>
> >>         .p2align 4,, 12
> >>  L(aligned_more):
> >> -L(page_cross_continue):
> >>         /* Need to keep original pointer incase VEC(1) has last match.
> */
> >>         movq    %rdi, %rsi
> >>         andq    $-VEC_SIZE, %rdi
> >> @@ -340,34 +338,49 @@ L(return_new_match_ret):
> >>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >>         ret
> >>
> >> -       .p2align 4,, 4
> >>  L(cross_page_boundary):
> >> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> >> +          rax` sets pointer will all page offset bits cleared so
> >> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> >> +          before page cross (guaranteed to be safe to read). Doing this
> >> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> >> +          a bit of code size.  */
> >>         xorq    %rdi, %rax
> >> -       mov     $-1, %VRDX
> >> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> >> -       VPTESTN %VMM(6), %VMM(6), %k0
> >> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> >> +       VPTESTN %VMM(1), %VMM(1), %k0
> >>         KMOV    %k0, %VRSI
> >>
> >> +       /* Shift out zero CHAR matches that are before the beginning of
> >> +          src (rdi).  */
> >>  # ifdef USE_AS_WCSRCHR
> >>         movl    %edi, %ecx
> >> -       and     $(VEC_SIZE - 1), %ecx
> >> +       andl    $(VEC_SIZE - 1), %ecx
> >>         shrl    $2, %ecx
> >>  # endif
> >> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> >> -
> >> -# ifdef USE_AS_WCSRCHR
> >> -       kmovw   %edx, %k1
> >> -# else
> >> -       KMOV    %VRDX, %k1
> >> -# endif
> >> -
> >> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> >> -       /* We could technically just jmp back after the vpcompress but
> >> -          it doesn't save any 16-byte blocks.  */
> >>         shrx    %SHIFT_REG, %VRSI, %VRSI
> >> +
> >>         test    %VRSI, %VRSI
> >> -       jnz     L(page_cross_return)
> >> -       jmp     L(page_cross_continue)
> >> -       /* 1-byte from cache line.  */
> >> +       jz      L(page_cross_continue)
> >> +
> >> +       /* Found zero CHAR so need to test for search CHAR.  */
> >> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
> >> +       KMOV    %k1, %VRAX
> >> +       /* Shift out search CHAR matches that are before the beginning
> of
> >> +          src (rdi).  */
> >> +       shrx    %SHIFT_REG, %VRAX, %VRAX
> >> +       /* Check if any search CHAR match in range.  */
> >> +       blsmsk  %VRSI, %VRSI
> >> +       and     %VRSI, %VRAX
> >> +       jz      L(ret2)
> >> +       bsr     %VRAX, %VRAX
> >> +#  ifdef USE_AS_WCSRCHR
> >> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >> +#  else
> >> +       addq    %rdi, %rax
> >> +#  endif
> >> +L(ret2):
> >> +       ret
> >
> >
> > Modified instructions to fit page crossing logic in cache line for
> > strrchr-evex512.
> >
> >  300: 48 31 f8             xor    %rdi,%rax
> >  303: 62 e1 fe 48 6f 48 3f vmovdqu64 0xfc0(%rax),%zmm17
> >  30a: 62 b2 76 40 26 c1     vptestnmb %zmm17,%zmm17,%k0
> >  310: c4 e1 fb 93 f0       kmovq  %k0,%rsi
> >  315: 89 f9                 mov    %edi,%ecx
> >  317: 48 d3 ee             shr    %cl,%rsi
> >  31a: 0f 84 f8 fc ff ff     je     18 <__strrchr_evex512+0x18>
> >  320: 62 b1 75 40 74 c8     vpcmpeqb %zmm16,%zmm17,%k1
> >  326: c4 e1 fb 93 c1       kmovq  %k1,%rax
> >  32b: 48 d3 e8             shr    %cl,%rax
> >  32e: c4 e2 c8 f3 d6       blsmsk %rsi,%rsi
> >  333: 48 21 f0             and    %rsi,%rax
> >  336: 74 07                 je     33f <__strrchr_evex512+0x33f>
> >  338: 48 0f bd c0           bsr    %rax,%rax
> >  33c: 48 01 f8             add    %rdi,%rax
> >  33f: c3                   ret
> >
> Can update if you want, but it's a lot of ifdefs, LMK.
>

I'm not sure why you need to use lots of ifdefs. I just modified 4 lines on
top of your latest patch.

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
index 9cab21c09c..3f4540a041 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
@@ -351,22 +351,20 @@ L(cross_page_boundary):

        /* Shift out zero CHAR matches that are before the beginning of
           src (rdi).  */
-# ifdef USE_AS_WCSRCHR
        movl    %edi, %ecx
+# ifdef USE_AS_WCSRCHR
        andl    $(VEC_SIZE - 1), %ecx
        shrl    $2, %ecx
 # endif
-       shrx    %SHIFT_REG, %VRSI, %VRSI
-
-       test    %VRSI, %VRSI
+       shr     %ecx, %VRSI
        jz      L(page_cross_continue)

        /* Found zero CHAR so need to test for search CHAR.  */
-       VPCMP   $0, %VMATCH, %VMM(1), %k1
+       VPCMPEQ %VMATCH, %VMM(1), %k1
        KMOV    %k1, %VRAX
        /* Shift out search CHAR matches that are before the beginning of
           src (rdi).  */
-       shrx    %SHIFT_REG, %VRAX, %VRAX
+       shr     %ecx, %VRAX
        /* Check if any search CHAR match in range.  */
        blsmsk  %VRSI, %VRSI
        and     %VRSI, %VRAX

At least all string/wcsmbs tests passed.
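
Rough sketch of why the separate `test` can be dropped with this change
(register and label names below are only illustrative, not taken from the
patch): a legacy `shr` by %cl updates ZF from its result whenever the
count is non-zero, while the BMI2 `shrx` leaves EFLAGS untouched, so the
shrx form needs an explicit test before the conditional branch.

	/* shrx does not write EFLAGS, so a separate test is required
	   before the jz.  */
	shrx	%ecx, %esi, %esi
	test	%esi, %esi
	jz	L(no_match_in_range)

	/* shr by %cl sets ZF from the shifted result (for a non-zero
	   count), so the jz can follow it directly.  */
	shr	%cl, %esi
	jz	L(no_match_in_range)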


> >>
> >> +       /* 3 bytes from cache-line for evex.
> >> +          Crosses cache line for evex512.  */
> >>  END(STRRCHR)
> >>  #endif
> >> --
> >> 2.34.1
> >>
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-12 17:56       ` Sunil Pandey
@ 2023-11-12 20:01         ` Noah Goldstein
  0 siblings, 0 replies; 21+ messages in thread
From: Noah Goldstein @ 2023-11-12 20:01 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: libc-alpha, hjl.tools, carlos

On Sun, Nov 12, 2023 at 11:57 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Fri, Nov 10, 2023 at 7:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> On Thu, Nov 9, 2023 at 9:11 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>> >
>> >
>> >
>> > On Thu, Nov 9, 2023 at 2:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >>
>> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
>> >> was missing the CPU_FEATURE checks for VBMI2 in the
>> >> ifunc/ifunc-impl-list.
>> >>
>> >> The fix is either to add those checks or change the logic to not use
>> >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
>> >> implementation is usable on SKX.
>> >>
>> >> New implementation is a bit slower, but this is in a cold path so its
>> >> probably okay.
>> >> ---
>> >>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 61 ++++++++++++--------
>> >>  1 file changed, 37 insertions(+), 24 deletions(-)
>> >>
>> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> >> index cd6a0a870a..eb5f1f855a 100644
>> >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> >> @@ -35,7 +35,6 @@
>> >>  #  define CHAR_SIZE    4
>> >>  #  define VPCMP                vpcmpd
>> >>  #  define VPMIN                vpminud
>> >> -#  define VPCOMPRESS   vpcompressd
>> >>  #  define VPTESTN      vptestnmd
>> >>  #  define VPTEST       vptestmd
>> >>  #  define VPBROADCAST  vpbroadcastd
>> >> @@ -46,7 +45,6 @@
>> >>  #  define CHAR_SIZE    1
>> >>  #  define VPCMP                vpcmpb
>> >>  #  define VPMIN                vpminub
>> >> -#  define VPCOMPRESS   vpcompressb
>> >>  #  define VPTESTN      vptestnmb
>> >>  #  define VPTEST       vptestmb
>> >>  #  define VPBROADCAST  vpbroadcastb
>> >> @@ -71,7 +69,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>> >>         andl    $(PAGE_SIZE - 1), %eax
>> >>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>> >>         jg      L(cross_page_boundary)
>> >> -
>> >> +L(page_cross_continue):
>> >>         VMOVU   (%rdi), %VMM(1)
>> >>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>> >>         VPTESTN %VMM(1), %VMM(1), %k0
>> >> @@ -79,10 +77,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>> >>         test    %VGPR(rsi), %VGPR(rsi)
>> >>         jz      L(aligned_more)
>> >>         /* fallthrough: zero CHAR in first VEC.  */
>> >> -L(page_cross_return):
>> >> +
>> >>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>> >>         VPCMPEQ %VMATCH, %VMM(1), %k1
>> >>         KMOV    %k1, %VGPR(rax)
>> >> +L(page_cross_return):
>> >
>> >
>> > Unused label.
>> >
>> >>
>> >>         /* Build mask up until first zero CHAR (used to mask of
>> >>            potential search CHAR matches past the end of the string).  */
>> >>         blsmsk  %VGPR(rsi), %VGPR(rsi)
>> >> @@ -167,7 +166,6 @@ L(first_vec_x1_return):
>> >>
>> >>         .p2align 4,, 12
>> >>  L(aligned_more):
>> >> -L(page_cross_continue):
>> >>         /* Need to keep original pointer incase VEC(1) has last match.  */
>> >>         movq    %rdi, %rsi
>> >>         andq    $-VEC_SIZE, %rdi
>> >> @@ -340,34 +338,49 @@ L(return_new_match_ret):
>> >>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>> >>         ret
>> >>
>> >> -       .p2align 4,, 4
>> >>  L(cross_page_boundary):
>> >> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
>> >> +          rax` sets pointer will all page offset bits cleared so
>> >> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
>> >> +          before page cross (guaranteed to be safe to read). Doing this
>> >> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
>> >> +          a bit of code size.  */
>> >>         xorq    %rdi, %rax
>> >> -       mov     $-1, %VRDX
>> >> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
>> >> -       VPTESTN %VMM(6), %VMM(6), %k0
>> >> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
>> >> +       VPTESTN %VMM(1), %VMM(1), %k0
>> >>         KMOV    %k0, %VRSI
>> >>
>> >> +       /* Shift out zero CHAR matches that are before the beginning of
>> >> +          src (rdi).  */
>> >>  # ifdef USE_AS_WCSRCHR
>> >>         movl    %edi, %ecx
>> >> -       and     $(VEC_SIZE - 1), %ecx
>> >> +       andl    $(VEC_SIZE - 1), %ecx
>> >>         shrl    $2, %ecx
>> >>  # endif
>> >> -       shlx    %SHIFT_REG, %VRDX, %VRDX
>> >> -
>> >> -# ifdef USE_AS_WCSRCHR
>> >> -       kmovw   %edx, %k1
>> >> -# else
>> >> -       KMOV    %VRDX, %k1
>> >> -# endif
>> >> -
>> >> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
>> >> -       /* We could technically just jmp back after the vpcompress but
>> >> -          it doesn't save any 16-byte blocks.  */
>> >>         shrx    %SHIFT_REG, %VRSI, %VRSI
>> >> +
>> >>         test    %VRSI, %VRSI
>> >> -       jnz     L(page_cross_return)
>> >> -       jmp     L(page_cross_continue)
>> >> -       /* 1-byte from cache line.  */
>> >> +       jz      L(page_cross_continue)
>> >> +
>> >> +       /* Found zero CHAR so need to test for search CHAR.  */
>> >> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
>> >> +       KMOV    %k1, %VRAX
>> >> +       /* Shift out search CHAR matches that are before the beginning of
>> >> +          src (rdi).  */
>> >> +       shrx    %SHIFT_REG, %VRAX, %VRAX
>> >> +       /* Check if any search CHAR match in range.  */
>> >> +       blsmsk  %VRSI, %VRSI
>> >> +       and     %VRSI, %VRAX
>> >> +       jz      L(ret2)
>> >> +       bsr     %VRAX, %VRAX
>> >> +#  ifdef USE_AS_WCSRCHR
>> >> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>> >> +#  else
>> >> +       addq    %rdi, %rax
>> >> +#  endif
>> >> +L(ret2):
>> >> +       ret
>> >
>> >
>> > Modified instructions to fit page crossing logic in cache line for strrchr-evex512.
>> >
>> >  300: 48 31 f8             xor    %rdi,%rax
>> >  303: 62 e1 fe 48 6f 48 3f vmovdqu64 0xfc0(%rax),%zmm17
>> >  30a: 62 b2 76 40 26 c1     vptestnmb %zmm17,%zmm17,%k0
>> >  310: c4 e1 fb 93 f0       kmovq  %k0,%rsi
>> >  315: 89 f9                 mov    %edi,%ecx
>> >  317: 48 d3 ee             shr    %cl,%rsi
>> >  31a: 0f 84 f8 fc ff ff     je     18 <__strrchr_evex512+0x18>
>> >  320: 62 b1 75 40 74 c8     vpcmpeqb %zmm16,%zmm17,%k1
>> >  326: c4 e1 fb 93 c1       kmovq  %k1,%rax
>> >  32b: 48 d3 e8             shr    %cl,%rax
>> >  32e: c4 e2 c8 f3 d6       blsmsk %rsi,%rsi
>> >  333: 48 21 f0             and    %rsi,%rax
>> >  336: 74 07                 je     33f <__strrchr_evex512+0x33f>
>> >  338: 48 0f bd c0           bsr    %rax,%rax
>> >  33c: 48 01 f8             add    %rdi,%rax
>> >  33f: c3                   ret
>> >
>> Can update if you want, but it's a lot of ifdefs, LMK.
>
>
> I'm not sure why you need to use lots of ifdefs. I just modified 4 lines on top of your latest patch.
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> index 9cab21c09c..3f4540a041 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> @@ -351,22 +351,20 @@ L(cross_page_boundary):
>
>         /* Shift out zero CHAR matches that are before the beginning of
>            src (rdi).  */
> -# ifdef USE_AS_WCSRCHR
>         movl    %edi, %ecx
> +# ifdef USE_AS_WCSRCHR
>         andl    $(VEC_SIZE - 1), %ecx
>         shrl    $2, %ecx
>  # endif
> -       shrx    %SHIFT_REG, %VRSI, %VRSI
> -
> -       test    %VRSI, %VRSI
> +       shr     %ecx, %VRSI
>         jz      L(page_cross_continue)
>
>         /* Found zero CHAR so need to test for search CHAR.  */
> -       VPCMP   $0, %VMATCH, %VMM(1), %k1
> +       VPCMPEQ %VMATCH, %VMM(1), %k1
>         KMOV    %k1, %VRAX
>         /* Shift out search CHAR matches that are before the beginning of
>            src (rdi).  */
> -       shrx    %SHIFT_REG, %VRAX, %VRAX
> +       shr     %ecx, %VRAX
>         /* Check if any search CHAR match in range.  */
>         blsmsk  %VRSI, %VRSI
>         and     %VRSI, %VRAX
>
> At least all string/wcsmbs tests passed.
>

There is a cost for every variant except the avx512 version.
I'll update, though.
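
To spell the tradeoff out (only a sketch, not code from either patch):
for the byte variants the shrx form can take the shift count straight
from the low bits of the source pointer, so an unconditional `shr %cl`
adds a `mov %edi, %ecx` on paths that did not need one before (the mov
already exists for the wide-char versions), and `shr` by %cl typically
also decodes to an extra uop compared with `shrx` on recent Intel cores.
The code-size win only pays off for the evex512 variant, which is the
one whose page-cross logic was spilling out of its cache line.

	/* Byte variant with shrx: the count comes straight from the
	   source pointer's low bits, no staging register needed.  */
	shrx	%edi, %esi, %esi
	test	%esi, %esi

	/* Same step with the legacy shift: the count has to be staged
	   in %cl first, which is an extra mov on this path.  */
	movl	%edi, %ecx
	shr	%cl, %esi
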
>>
>> >>
>> >> +       /* 3 bytes from cache-line for evex.
>> >> +          Crosses cache line for evex512.  */
>> >>  END(STRRCHR)
>> >>  #endif
>> >> --
>> >> 2.34.1
>> >>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-01 22:16 x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S Noah Goldstein
                   ` (4 preceding siblings ...)
  2023-11-11  3:20 ` [PATCH v4] " Noah Goldstein
@ 2023-11-12 20:51 ` Noah Goldstein
  2023-11-13 14:07   ` Sunil Pandey
  5 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2023-11-12 20:51 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
was missing the CPU_FEATURE checks for VBMI2 in the
ifunc/ifunc-impl-list.

The fix is either to add those checks or change the logic to not use
`vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
implementation is usable on SKX.

New implementation is a bit slower, but this is in a cold path so its
probably okay.
---
 sysdeps/x86_64/multiarch/strrchr-evex-base.S | 75 +++++++++++++-------
 1 file changed, 51 insertions(+), 24 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
index cd6a0a870a..da5bf1237a 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
@@ -35,18 +35,20 @@
 #  define CHAR_SIZE	4
 #  define VPCMP		vpcmpd
 #  define VPMIN		vpminud
-#  define VPCOMPRESS	vpcompressd
 #  define VPTESTN	vptestnmd
 #  define VPTEST	vptestmd
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
 
 # else
-#  define SHIFT_REG	VRDI
+#  if VEC_SIZE == 64
+#   define SHIFT_REG	VRCX
+#  else
+#   define SHIFT_REG	VRDI
+#  endif
 #  define CHAR_SIZE	1
 #  define VPCMP		vpcmpb
 #  define VPMIN		vpminub
-#  define VPCOMPRESS	vpcompressb
 #  define VPTESTN	vptestnmb
 #  define VPTEST	vptestmb
 #  define VPBROADCAST	vpbroadcastb
@@ -56,6 +58,12 @@
 #  define KORTEST_M	KORTEST
 # endif
 
+# if VEC_SIZE == 32 || (defined USE_AS_WCSRCHR)
+#  define SHIFT_R(cnt, val)	shrx cnt, val, val
+# else
+#  define SHIFT_R(cnt, val)	shr %cl, val
+# endif
+
 # define VMATCH		VMM(0)
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 # define PAGE_SIZE	4096
@@ -71,7 +79,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jg	L(cross_page_boundary)
-
+L(page_cross_continue):
 	VMOVU	(%rdi), %VMM(1)
 	/* k0 has a 1 for each zero CHAR in YMM1.  */
 	VPTESTN	%VMM(1), %VMM(1), %k0
@@ -79,7 +87,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	test	%VGPR(rsi), %VGPR(rsi)
 	jz	L(aligned_more)
 	/* fallthrough: zero CHAR in first VEC.  */
-L(page_cross_return):
+
 	/* K1 has a 1 for each search CHAR match in VEC(1).  */
 	VPCMPEQ	%VMATCH, %VMM(1), %k1
 	KMOV	%k1, %VGPR(rax)
@@ -167,7 +175,6 @@ L(first_vec_x1_return):
 
 	.p2align 4,, 12
 L(aligned_more):
-L(page_cross_continue):
 	/* Need to keep original pointer incase VEC(1) has last match.  */
 	movq	%rdi, %rsi
 	andq	$-VEC_SIZE, %rdi
@@ -340,34 +347,54 @@ L(return_new_match_ret):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 4
 L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer will all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guaranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
 	xorq	%rdi, %rax
-	mov	$-1, %VRDX
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
-	VPTESTN	%VMM(6), %VMM(6), %k0
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
 	KMOV	%k0, %VRSI
 
-# ifdef USE_AS_WCSRCHR
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
+# if VEC_SIZE == 64 || (defined USE_AS_WCSRCHR)
 	movl	%edi, %ecx
-	and	$(VEC_SIZE - 1), %ecx
+# endif
+# ifdef USE_AS_WCSRCHR
+	andl	$(VEC_SIZE - 1), %ecx
 	shrl	$2, %ecx
 # endif
-	shlx	%SHIFT_REG, %VRDX, %VRDX
+	SHIFT_R	(%SHIFT_REG, %VRSI)
+# if VEC_SIZE == 32 || (defined USE_AS_WCSRCHR)
+	/* For strrchr-evex512 we use SHIFT_R as shr which will set zero
+	   flag.  */
+	test	%VRSI, %VRSI
+# endif
+	jz	L(page_cross_continue)
 
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMPEQ	%VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	SHIFT_R	(%SHIFT_REG, %VRAX)
+	/* Check if any search CHAR match in range.  */
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
+	jz	L(ret2)
+	bsr	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
-	kmovw	%edx, %k1
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	KMOV	%VRDX, %k1
+	addq	%rdi, %rax
 # endif
-
-	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
-	/* We could technically just jmp back after the vpcompress but
-	   it doesn't save any 16-byte blocks.  */
-	shrx	%SHIFT_REG, %VRSI, %VRSI
-	test	%VRSI, %VRSI
-	jnz	L(page_cross_return)
-	jmp	L(page_cross_continue)
-	/* 1-byte from cache line.  */
+L(ret2):
+	ret
+	/* 3 bytes from cache-line for evex.  */
+	/* 0 bytes from cache-line for evex512.  */
 END(STRRCHR)
 #endif
-- 
2.34.1
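
For anyone untangling the preprocessor conditionals above, this is
roughly how the page-cross shift sequence ends up expanding in the two
byte-variant builds (reconstructed by hand from the SHIFT_R / SHIFT_REG
definitions in this patch, so treat it as an illustration rather than
compiler output); the wide-char builds keep building the count in %ecx
and take the shrx path as before.

	/* strrchr-evex512 (VEC_SIZE == 64, byte): SHIFT_R is a plain
	   shr, the count is staged in %cl, and shr itself sets ZF, so
	   no separate test is emitted.  */
	movl	%edi, %ecx
	shr	%cl, %rsi
	jz	L(page_cross_continue)

	/* strrchr-evex (VEC_SIZE == 32, byte): SHIFT_R is shrx with
	   %edi as the count, which leaves EFLAGS alone, so the test
	   is kept.  */
	shrx	%edi, %esi, %esi
	test	%esi, %esi
	jz	L(page_cross_continue)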


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
  2023-11-12 20:51 ` Noah Goldstein
@ 2023-11-13 14:07   ` Sunil Pandey
  0 siblings, 0 replies; 21+ messages in thread
From: Sunil Pandey @ 2023-11-13 14:07 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, hjl.tools, carlos

On Sun, Nov 12, 2023 at 12:51 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> was missing the CPU_FEATURE checks for VBMI2 in the
> ifunc/ifunc-impl-list.
>
> The fix is either to add those checks or change the logic to not use
> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> implementation is usable on SKX.
>
> New implementation is a bit slower, but this is in a cold path so its
> probably okay.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 75 +++++++++++++-------
>  1 file changed, 51 insertions(+), 24 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> index cd6a0a870a..da5bf1237a 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> @@ -35,18 +35,20 @@
>  #  define CHAR_SIZE    4
>  #  define VPCMP                vpcmpd
>  #  define VPMIN                vpminud
> -#  define VPCOMPRESS   vpcompressd
>  #  define VPTESTN      vptestnmd
>  #  define VPTEST       vptestmd
>  #  define VPBROADCAST  vpbroadcastd
>  #  define VPCMPEQ      vpcmpeqd
>
>  # else
> -#  define SHIFT_REG    VRDI
> +#  if VEC_SIZE == 64
> +#   define SHIFT_REG   VRCX
> +#  else
> +#   define SHIFT_REG   VRDI
> +#  endif
>  #  define CHAR_SIZE    1
>  #  define VPCMP                vpcmpb
>  #  define VPMIN                vpminub
> -#  define VPCOMPRESS   vpcompressb
>  #  define VPTESTN      vptestnmb
>  #  define VPTEST       vptestmb
>  #  define VPBROADCAST  vpbroadcastb
> @@ -56,6 +58,12 @@
>  #  define KORTEST_M    KORTEST
>  # endif
>
> +# if VEC_SIZE == 32 || (defined USE_AS_WCSRCHR)
> +#  define SHIFT_R(cnt, val)    shrx cnt, val, val
> +# else
> +#  define SHIFT_R(cnt, val)    shr %cl, val
> +# endif
> +
>  # define VMATCH                VMM(0)
>  # define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>  # define PAGE_SIZE     4096
> @@ -71,7 +79,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         jg      L(cross_page_boundary)
> -
> +L(page_cross_continue):
>         VMOVU   (%rdi), %VMM(1)
>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>         VPTESTN %VMM(1), %VMM(1), %k0
> @@ -79,7 +87,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         test    %VGPR(rsi), %VGPR(rsi)
>         jz      L(aligned_more)
>         /* fallthrough: zero CHAR in first VEC.  */
> -L(page_cross_return):
> +
>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>         VPCMPEQ %VMATCH, %VMM(1), %k1
>         KMOV    %k1, %VGPR(rax)
> @@ -167,7 +175,6 @@ L(first_vec_x1_return):
>
>         .p2align 4,, 12
>  L(aligned_more):
> -L(page_cross_continue):
>         /* Need to keep original pointer incase VEC(1) has last match.  */
>         movq    %rdi, %rsi
>         andq    $-VEC_SIZE, %rdi
> @@ -340,34 +347,54 @@ L(return_new_match_ret):
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 4
>  L(cross_page_boundary):
> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> +          rax` sets pointer will all page offset bits cleared so
> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> +          before page cross (guaranteed to be safe to read). Doing this
> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> +          a bit of code size.  */
>         xorq    %rdi, %rax
> -       mov     $-1, %VRDX
> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> -       VPTESTN %VMM(6), %VMM(6), %k0
> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
>         KMOV    %k0, %VRSI
>
> -# ifdef USE_AS_WCSRCHR
> +       /* Shift out zero CHAR matches that are before the beginning of
> +          src (rdi).  */
> +# if VEC_SIZE == 64 || (defined USE_AS_WCSRCHR)
>         movl    %edi, %ecx
> -       and     $(VEC_SIZE - 1), %ecx
> +# endif
> +# ifdef USE_AS_WCSRCHR
> +       andl    $(VEC_SIZE - 1), %ecx
>         shrl    $2, %ecx
>  # endif
> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> +       SHIFT_R (%SHIFT_REG, %VRSI)
> +# if VEC_SIZE == 32 || (defined USE_AS_WCSRCHR)
> +       /* For strrchr-evex512 we use SHIFT_R as shr which will set zero
> +          flag.  */
> +       test    %VRSI, %VRSI
> +# endif
> +       jz      L(page_cross_continue)
>
> +       /* Found zero CHAR so need to test for search CHAR.  */
> +       VPCMPEQ %VMATCH, %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       /* Shift out search CHAR matches that are before the beginning of
> +          src (rdi).  */
> +       SHIFT_R (%SHIFT_REG, %VRAX)
> +       /* Check if any search CHAR match in range.  */
> +       blsmsk  %VRSI, %VRSI
> +       and     %VRSI, %VRAX
> +       jz      L(ret2)
> +       bsr     %VRAX, %VRAX
>  # ifdef USE_AS_WCSRCHR
> -       kmovw   %edx, %k1
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>  # else
> -       KMOV    %VRDX, %k1
> +       addq    %rdi, %rax
>  # endif
> -
> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> -       /* We could technically just jmp back after the vpcompress but
> -          it doesn't save any 16-byte blocks.  */
> -       shrx    %SHIFT_REG, %VRSI, %VRSI
> -       test    %VRSI, %VRSI
> -       jnz     L(page_cross_return)
> -       jmp     L(page_cross_continue)
> -       /* 1-byte from cache line.  */
> +L(ret2):
> +       ret
> +       /* 3 bytes from cache-line for evex.  */
> +       /* 0 bytes from cache-line for evex512.  */
>  END(STRRCHR)
>  #endif
> --
> 2.34.1


LGTM
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2023-11-13 14:07 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-01 22:16 x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S Noah Goldstein
2023-11-07 17:51 ` Noah Goldstein
2023-11-07 18:25   ` Florian Weimer
2023-11-08  5:22     ` Sunil Pandey
2023-11-08 19:04 ` Sunil Pandey
2023-11-08 19:22   ` Noah Goldstein
2023-11-08 20:08     ` Sunil Pandey
2023-11-08 21:21       ` Noah Goldstein
2023-11-08 21:22         ` Noah Goldstein
2023-11-08 21:33   ` Noah Goldstein
2023-11-08 21:33 ` [PATCH v2] " Noah Goldstein
2023-11-09  0:10   ` Sunil Pandey
2023-11-09  1:51     ` Noah Goldstein
2023-11-09 22:49 ` [PATCH v3] " Noah Goldstein
2023-11-10  3:10   ` Sunil Pandey
2023-11-11  3:21     ` Noah Goldstein
2023-11-12 17:56       ` Sunil Pandey
2023-11-12 20:01         ` Noah Goldstein
2023-11-11  3:20 ` [PATCH v4] " Noah Goldstein
2023-11-12 20:51 ` Noah Goldstein
2023-11-13 14:07   ` Sunil Pandey
