* [PATCH v5 1/2] x86: Optimize strlen-evex.S @ 2021-04-19 23:36 Noah Goldstein 2021-04-19 23:36 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Noah Goldstein 2021-04-20 1:01 ` [PATCH v5 1/2] x86: Optimize strlen-evex.S H.J. Lu 0 siblings, 2 replies; 24+ messages in thread From: Noah Goldstein @ 2021-04-19 23:36 UTC (permalink / raw) To: libc-alpha No bug. This commit optimizes strlen-evex.S. The optimizations are mostly small things but they add up to roughly 10-30% performance improvement for strlen. The results for strnlen are bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen are all passing. Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> --- sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++----------- 1 file changed, 317 insertions(+), 264 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S index 0583819078..4bf6874b82 100644 --- a/sysdeps/x86_64/multiarch/strlen-evex.S +++ b/sysdeps/x86_64/multiarch/strlen-evex.S @@ -29,11 +29,13 @@ # ifdef USE_AS_WCSLEN # define VPCMP vpcmpd # define VPMINU vpminud -# define SHIFT_REG r9d +# define SHIFT_REG ecx +# define CHAR_SIZE 4 # else # define VPCMP vpcmpb # define VPMINU vpminub -# define SHIFT_REG ecx +# define SHIFT_REG edx +# define CHAR_SIZE 1 # endif # define XMMZERO xmm16 @@ -46,132 +48,165 @@ # define YMM6 ymm22 # define VEC_SIZE 32 +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section .text.evex,"ax",@progbits ENTRY (STRLEN) # ifdef USE_AS_STRNLEN - /* Check for zero length. */ + /* Check zero length. */ test %RSI_LP, %RSI_LP jz L(zero) -# ifdef USE_AS_WCSLEN - shl $2, %RSI_LP -# elif defined __ILP32__ +# ifdef __ILP32__ /* Clear the upper 32 bits. */ movl %esi, %esi # endif mov %RSI_LP, %R8_LP # endif - movl %edi, %ecx - movq %rdi, %rdx + movl %edi, %eax vpxorq %XMMZERO, %XMMZERO, %XMMZERO - + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. 
*/ + andl $(PAGE_SIZE - 1), %eax /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. Each bit in K0 represents a null byte. */ VPCMP $0, (%rdi), %YMMZERO, %k0 kmovd %k0, %eax - testl %eax, %eax - # ifdef USE_AS_STRNLEN - jnz L(first_vec_x0_check) - /* Adjust length and check the end of data. */ - subq $VEC_SIZE, %rsi - jbe L(max) -# else - jnz L(first_vec_x0) + /* If length < CHAR_PER_VEC handle special. */ + cmpq $CHAR_PER_VEC, %rsi + jbe L(first_vec_x0) # endif - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax + ret # ifdef USE_AS_STRNLEN - /* Adjust length. */ - addq %rcx, %rsi +L(zero): + xorl %eax, %eax + ret - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + .p2align 4 +L(first_vec_x0): + /* Set bit for max len so that tzcnt will return min of max len + and position of first match. */ + btsq %rsi, %rax + tzcntl %eax, %eax + ret # endif - jmp L(more_4x_vec) .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - -# ifdef USE_AS_WCSLEN - /* NB: Divide shift count by 4 since each bit in K0 represent 4 - bytes. */ - movl %ecx, %SHIFT_REG - sarl $2, %SHIFT_REG +L(first_vec_x1): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ +# ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal CHAR_PER_VEC(%rdi, %rax), %eax # endif - VPCMP $0, (%rdi), %YMMZERO, %k0 - kmovd %k0, %eax + ret - /* Remove the leading bytes. 
*/ - sarxl %SHIFT_REG, %eax, %eax - testl %eax, %eax - jz L(aligned_more) + .p2align 4 +L(first_vec_x2): tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) -# endif - addq %rdi, %rax - addq %rcx, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax # endif ret .p2align 4 -L(aligned_more): +L(first_vec_x3): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" - to void possible addition overflow. */ - negq %rcx - addq $VEC_SIZE, %rcx - - /* Check the end of data. */ - subq %rcx, %rsi - jbe L(max) + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax # endif + ret - addq $VEC_SIZE, %rdi - + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + /* Use ecx which was computed earlier to compute correct value. 
+ */ + leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax # endif + ret -L(more_4x_vec): + .p2align 5 +L(aligned_more): + movq %rdi, %rdx + /* Align data to VEC_SIZE. */ + andq $-(VEC_SIZE), %rdi +L(cross_page_continue): /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time since data is only aligned to VEC_SIZE. */ - VPCMP $0, (%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x0) - +# ifdef USE_AS_STRNLEN + /* + CHAR_SIZE because it simplies the logic in + last_4x_vec_or_less. */ + leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx + subq %rdx, %rcx +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif +# endif + /* Load first VEC regardless. */ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 +# ifdef USE_AS_STRNLEN + /* Adjust length. If near end handle specially. */ + subq %rcx, %rsi + jb L(last_4x_vec_or_less) +# endif kmovd %k0, %eax testl %eax, %eax jnz L(first_vec_x1) VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 kmovd %k0, %eax - testl %eax, %eax + test %eax, %eax jnz L(first_vec_x2) VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 @@ -179,258 +214,276 @@ L(more_4x_vec): testl %eax, %eax jnz L(first_vec_x3) - addq $(VEC_SIZE * 4), %rdi - -# ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) -# endif - - /* Align data to 4 * VEC_SIZE. */ - movq %rdi, %rcx - andl $(4 * VEC_SIZE - 1), %ecx - andq $-(4 * VEC_SIZE), %rdi + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x4) + addq $VEC_SIZE, %rdi # ifdef USE_AS_STRNLEN - /* Adjust length. */ + /* Check if at last VEC_SIZE * 4 length. 
*/ + cmpq $(CHAR_PER_VEC * 4 - 1), %rsi + jbe L(last_4x_vec_or_less_load) + movl %edi, %ecx + andl $(VEC_SIZE * 4 - 1), %ecx +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif + /* Readjust length. */ addq %rcx, %rsi # endif + /* Align data to VEC_SIZE * 4. */ + andq $-(VEC_SIZE * 4), %rdi + /* Compare 4 * VEC at a time forward. */ .p2align 4 L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - VMOVA (%rdi), %YMM1 - VMOVA VEC_SIZE(%rdi), %YMM2 - VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 - VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 - - VPMINU %YMM1, %YMM2, %YMM5 - VPMINU %YMM3, %YMM4, %YMM6 + /* Load first VEC regardless. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 +# ifdef USE_AS_STRNLEN + /* Break if at end of length. */ + subq $(CHAR_PER_VEC * 4), %rsi + jb L(last_4x_vec_or_less_cmpeq) +# endif + /* Save some code size by microfusing VPMINU with the load. Since + the matches in ymm2/ymm4 can only be returned if there where no + matches in ymm1/ymm3 respectively there is no issue with overlap. + */ + VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2 + VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 + VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4 + + VPCMP $0, %YMM2, %YMMZERO, %k0 + VPCMP $0, %YMM4, %YMMZERO, %k1 + subq $-(VEC_SIZE * 4), %rdi + kortestd %k0, %k1 + jz L(loop_4x_vec) + + /* Check if end was in first half. */ + kmovd %k0, %eax + subq %rdx, %rdi +# ifdef USE_AS_WCSLEN + shrq $2, %rdi +# endif + testl %eax, %eax + jz L(second_vec_return) - VPMINU %YMM5, %YMM6, %YMM5 - VPCMP $0, %YMM5, %YMMZERO, %k0 - ktestd %k0, %k0 - jnz L(4x_vec_end) + VPCMP $0, %YMM1, %YMMZERO, %k2 + kmovd %k2, %edx + /* Combine VEC1 matches (edx) with VEC2 matches (eax). 
*/ +# ifdef USE_AS_WCSLEN + sall $CHAR_PER_VEC, %eax + orl %edx, %eax + tzcntl %eax, %eax +# else + salq $CHAR_PER_VEC, %rax + orq %rdx, %rax + tzcntq %rax, %rax +# endif + addq %rdi, %rax + ret - addq $(VEC_SIZE * 4), %rdi -# ifndef USE_AS_STRNLEN - jmp L(loop_4x_vec) -# else - subq $(VEC_SIZE * 4), %rsi - ja L(loop_4x_vec) +# ifdef USE_AS_STRNLEN +L(last_4x_vec_or_less_load): + /* Depending on entry adjust rdi / prepare first VEC in YMM1. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 +L(last_4x_vec_or_less_cmpeq): + VPCMP $0, %YMM1, %YMMZERO, %k0 + addq $(VEC_SIZE * 3), %rdi L(last_4x_vec_or_less): - /* Less than 4 * VEC and aligned to VEC_SIZE. */ - addl $(VEC_SIZE * 2), %esi - jle L(last_2x_vec) - - VPCMP $0, (%rdi), %YMMZERO, %k0 kmovd %k0, %eax + /* If remaining length > VEC_SIZE * 2. This works if esi is off by + VEC_SIZE * 4. */ + testl $(CHAR_PER_VEC * 2), %esi + jnz L(last_4x_vec) + + /* length may have been negative or positive by an offset of + CHAR_PER_VEC * 4 depending on where this was called from. This + fixes that. */ + andl $(CHAR_PER_VEC * 4 - 1), %esi testl %eax, %eax - jnz L(first_vec_x0) + jnz L(last_vec_x1_check) - VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x1) + /* Check the end of data. */ + subl $CHAR_PER_VEC, %esi + jb L(max) VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x2_check) - subl $VEC_SIZE, %esi - jle L(max) + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max) - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x3_check) + subq %rdx, %rdi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi +# endif + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax + ret +L(max): movq %r8, %rax + ret +# endif + + /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less) + in the 4x VEC loop can use 2 byte encoding. 
*/ + .p2align 4 +L(second_vec_return): + VPCMP $0, %YMM3, %YMMZERO, %k0 + /* Combine YMM3 matches (k0) with YMM4 matches (k1). */ +# ifdef USE_AS_WCSLEN + kunpckbw %k0, %k1, %k0 + kmovd %k0, %eax + tzcntl %eax, %eax +# else + kunpckdq %k0, %k1, %k0 + kmovq %k0, %rax + tzcntq %rax, %rax +# endif + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax + ret + + +# ifdef USE_AS_STRNLEN +L(last_vec_x1_check): + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax ret .p2align 4 -L(last_2x_vec): - addl $(VEC_SIZE * 2), %esi +L(last_4x_vec): + /* Test first 2x VEC normally. */ + testl %eax, %eax + jnz L(last_vec_x1) - VPCMP $0, (%rdi), %YMMZERO, %k0 + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 kmovd %k0, %eax testl %eax, %eax - jnz L(first_vec_x0_check) - subl $VEC_SIZE, %esi - jle L(max) + jnz L(last_vec_x2) - VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 + /* Normalize length. */ + andl $(CHAR_PER_VEC * 4 - 1), %esi + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 kmovd %k0, %eax testl %eax, %eax - jnz L(first_vec_x1_check) - movq %r8, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - ret + jnz L(last_vec_x3) - .p2align 4 -L(first_vec_x0_check): + /* Check the end of data. */ + subl $(CHAR_PER_VEC * 3), %esi + jb L(max) + + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq %rdi, %rax - subq %rdx, %rax + cmpl %eax, %esi + jb L(max_end) + + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. 
*/ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax ret .p2align 4 -L(first_vec_x1_check): +L(last_vec_x1): tzcntl %eax, %eax + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $VEC_SIZE, %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax ret .p2align 4 -L(first_vec_x2_check): +L(last_vec_x2): tzcntl %eax, %eax + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 2), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax ret .p2align 4 -L(first_vec_x3_check): +L(last_vec_x3): tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif + subl $(CHAR_PER_VEC * 2), %esi /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 3), %rax - addq %rdi, %rax - subq %rdx, %rax + cmpl %eax, %esi + jb L(max_end) + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax ret - - .p2align 4 -L(max): +L(max_end): movq %r8, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - ret - - .p2align 4 -L(zero): - xorl %eax, %eax ret # endif + /* Cold case for crossing page with first load. */ .p2align 4 -L(first_vec_x0): - tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ - sall $2, %eax -# endif - addq %rdi, %rax - subq %rdx, %rax +L(cross_page_boundary): + movq %rdi, %rdx + /* Align data to VEC_SIZE. */ + andq $-VEC_SIZE, %rdi + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + /* Remove the leading bytes. */ # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide shift count by 4 since each bit in K0 represent 4 + bytes. */ + movl %edx, %ecx + shrl $2, %ecx + andl $(CHAR_PER_VEC - 1), %ecx # endif - ret - - .p2align 4 -L(first_vec_x1): + /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */ + sarxl %SHIFT_REG, %eax, %eax + testl %eax, %eax +# ifndef USE_AS_STRNLEN + jz L(cross_page_continue) tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - addq $VEC_SIZE, %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif ret - - .p2align 4 -L(first_vec_x2): - tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - addq $(VEC_SIZE * 2), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif +# else + jnz L(cross_page_less_vec) +# ifndef USE_AS_WCSLEN + movl %edx, %ecx + andl $(CHAR_PER_VEC - 1), %ecx +# endif + movl $CHAR_PER_VEC, %eax + subl %ecx, %eax + /* Check the end of data. */ + cmpq %rax, %rsi + ja L(cross_page_continue) + movl %esi, %eax ret - - .p2align 4 -L(4x_vec_end): - VPCMP $0, %YMM1, %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x0) - VPCMP $0, %YMM2, %YMMZERO, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(first_vec_x1) - VPCMP $0, %YMM3, %YMMZERO, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(first_vec_x2) - VPCMP $0, %YMM4, %YMMZERO, %k3 - kmovd %k3, %eax -L(first_vec_x3): +L(cross_page_less_vec): tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ - sall $2, %eax -# endif - addq $(VEC_SIZE * 3), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif + /* Select min of length and position of first null. */ + cmpq %rax, %rsi + cmovb %esi, %eax ret +# endif END (STRLEN) #endif -- 2.29.2 ^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2021-04-19 23:36 [PATCH v5 1/2] x86: Optimize strlen-evex.S Noah Goldstein @ 2021-04-19 23:36 ` Noah Goldstein 2021-04-20 1:01 ` H.J. Lu 2022-09-25 8:19 ` Aurelien Jarno 2021-04-20 1:01 ` [PATCH v5 1/2] x86: Optimize strlen-evex.S H.J. Lu 1 sibling, 2 replies; 24+ messages in thread From: Noah Goldstein @ 2021-04-19 23:36 UTC (permalink / raw) To: libc-alpha No bug. This commit optimizes strlen-avx2.S. The optimizations are mostly small things but they add up to roughly 10-30% performance improvement for strlen. The results for strnlen are bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen are all passing. Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> --- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- 2 files changed, 334 insertions(+), 214 deletions(-) diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index c377cab629..651b32908e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strlen.c. */ IFUNC_IMPL (i, name, strlen, IFUNC_IMPL_ADD (array, i, strlen, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __strlen_avx2) IFUNC_IMPL_ADD (array, i, strlen, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (RTM)), __strlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, strlen, @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strnlen.c. 
*/ IFUNC_IMPL (i, name, strnlen, IFUNC_IMPL_ADD (array, i, strnlen, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __strnlen_avx2) IFUNC_IMPL_ADD (array, i, strnlen, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (RTM)), __strnlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, strnlen, @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wcslen.c. */ IFUNC_IMPL (i, name, wcslen, IFUNC_IMPL_ADD (array, i, wcslen, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __wcslen_avx2) IFUNC_IMPL_ADD (array, i, wcslen, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (RTM)), __wcslen_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcslen, @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ IFUNC_IMPL (i, name, wcsnlen, IFUNC_IMPL_ADD (array, i, wcsnlen, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __wcsnlen_avx2) IFUNC_IMPL_ADD (array, i, wcsnlen, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (RTM)), __wcsnlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcsnlen, diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S index 1caae9e6bc..bd2e6ee44a 100644 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S @@ -27,9 +27,11 @@ # ifdef USE_AS_WCSLEN # define VPCMPEQ vpcmpeqd # define VPMINU vpminud +# define CHAR_SIZE 4 # else # define VPCMPEQ vpcmpeqb # define VPMINU vpminub +# define CHAR_SIZE 1 # endif # ifndef VZEROUPPER @@ -41,349 +43,459 @@ # endif # define VEC_SIZE 32 +# define PAGE_SIZE 4096 .section SECTION(.text),"ax",@progbits ENTRY (STRLEN) # ifdef USE_AS_STRNLEN - /* Check for zero length. */ + /* Check zero length. 
*/ test %RSI_LP, %RSI_LP jz L(zero) + /* Store max len in R8_LP before adjusting if using WCSLEN. */ + mov %RSI_LP, %R8_LP # ifdef USE_AS_WCSLEN shl $2, %RSI_LP # elif defined __ILP32__ /* Clear the upper 32 bits. */ movl %esi, %esi # endif - mov %RSI_LP, %R8_LP # endif - movl %edi, %ecx + movl %edi, %eax movq %rdi, %rdx vpxor %xmm0, %xmm0, %xmm0 - + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. */ + andl $(PAGE_SIZE - 1), %eax /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax # ifdef USE_AS_STRNLEN - jnz L(first_vec_x0_check) - /* Adjust length and check the end of data. */ - subq $VEC_SIZE, %rsi - jbe L(max) -# else - jnz L(first_vec_x0) + /* If length < VEC_SIZE handle special. */ + cmpq $VEC_SIZE, %rsi + jbe L(first_vec_x0) # endif - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi + /* If empty continue to aligned_more. Otherwise return bit + position of first match. */ + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN # ifdef USE_AS_STRNLEN - /* Adjust length. */ - addq %rcx, %rsi +L(zero): + xorl %eax, %eax + ret - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + .p2align 4 +L(first_vec_x0): + /* Set bit for max len so that tzcnt will return min of max len + and position of first match. 
*/ + btsq %rsi, %rax + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN # endif - jmp L(more_4x_vec) .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - /* Remove the leading bytes. */ - sarl %cl, %eax - testl %eax, %eax - jz L(aligned_more) +L(first_vec_x1): tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE * 4 + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + incl %edi + addl %edi, %eax # endif - addq %rdi, %rax - addq %rcx, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN - shrq $2, %rax + shrl $2, %eax # endif -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN + VZEROUPPER_RETURN .p2align 4 -L(aligned_more): +L(first_vec_x2): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" - to void possible addition overflow. */ - negq %rcx - addq $VEC_SIZE, %rcx - - /* Check the end of data. */ - subq %rcx, %rsi - jbe L(max) + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE * 3 + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + addl $(VEC_SIZE + 1), %edi + addl %edi, %eax # endif +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN - addq $VEC_SIZE, %rdi + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ +# ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. 
+ */ + subl $(VEC_SIZE * 2 + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + addl $(VEC_SIZE * 2 + 1), %edi + addl %edi, %eax +# endif +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + addl $(VEC_SIZE * 3 + 1), %edi + addl %edi, %eax # endif +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN -L(more_4x_vec): + .p2align 5 +L(aligned_more): + /* Align data to VEC_SIZE - 1. This is the same number of + instructions as using andq with -VEC_SIZE but saves 4 bytes of + code on the x4 check. */ + orq $(VEC_SIZE - 1), %rdi +L(cross_page_continue): /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time since data is only aligned to VEC_SIZE. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax +# ifdef USE_AS_STRNLEN + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because + it simplies the logic in last_4x_vec_or_less. */ + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx + subq %rdx, %rcx +# endif + /* Load first VEC regardless. */ + VPCMPEQ 1(%rdi), %ymm0, %ymm1 +# ifdef USE_AS_STRNLEN + /* Adjust length. If near end handle specially. 
*/ + subq %rcx, %rsi + jb L(last_4x_vec_or_less) +# endif + vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x1) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x2) - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x3) - addq $(VEC_SIZE * 4), %rdi - -# ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) -# endif - - /* Align data to 4 * VEC_SIZE. */ - movq %rdi, %rcx - andl $(4 * VEC_SIZE - 1), %ecx - andq $-(4 * VEC_SIZE), %rdi + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x4) + /* Align data to VEC_SIZE * 4 - 1. */ # ifdef USE_AS_STRNLEN - /* Adjust length. */ + /* Before adjusting length check if at last VEC_SIZE * 4. */ + cmpq $(VEC_SIZE * 4 - 1), %rsi + jbe L(last_4x_vec_or_less_load) + incq %rdi + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx + /* Readjust length. */ addq %rcx, %rsi +# else + incq %rdi + orq $(VEC_SIZE * 4 - 1), %rdi # endif - + /* Compare 4 * VEC at a time forward. */ .p2align 4 L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - vmovdqa (%rdi), %ymm1 - vmovdqa VEC_SIZE(%rdi), %ymm2 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 - VPMINU %ymm1, %ymm2, %ymm5 - VPMINU %ymm3, %ymm4, %ymm6 - VPMINU %ymm5, %ymm6, %ymm5 - - VPCMPEQ %ymm5, %ymm0, %ymm5 - vpmovmskb %ymm5, %eax - testl %eax, %eax - jnz L(4x_vec_end) - - addq $(VEC_SIZE * 4), %rdi - -# ifndef USE_AS_STRNLEN - jmp L(loop_4x_vec) -# else +# ifdef USE_AS_STRNLEN + /* Break if at end of length. */ subq $(VEC_SIZE * 4), %rsi - ja L(loop_4x_vec) - -L(last_4x_vec_or_less): - /* Less than 4 * VEC and aligned to VEC_SIZE. 
*/ - addl $(VEC_SIZE * 2), %esi - jle L(last_2x_vec) + jb L(last_4x_vec_or_less_cmpeq) +# endif + /* Save some code size by microfusing VPMINU with the load. Since + the matches in ymm2/ymm4 can only be returned if there where no + matches in ymm1/ymm3 respectively there is no issue with overlap. + */ + vmovdqa 1(%rdi), %ymm1 + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 + + VPMINU %ymm2, %ymm4, %ymm5 + VPCMPEQ %ymm5, %ymm0, %ymm5 + vpmovmskb %ymm5, %ecx - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x1) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ %ymm1, %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + subq %rdx, %rdi testl %eax, %eax + jnz L(last_vec_return_x0) - jnz L(first_vec_x2_check) - subl $VEC_SIZE, %esi - jle L(max) - - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ %ymm2, %ymm0, %ymm2 + vpmovmskb %ymm2, %eax testl %eax, %eax - - jnz L(first_vec_x3_check) - movq %r8, %rax -# ifdef USE_AS_WCSLEN + jnz L(last_vec_return_x1) + + /* Combine last 2 VEC. */ + VPCMPEQ %ymm3, %ymm0, %ymm3 + vpmovmskb %ymm3, %eax + /* rcx has combined result from all 4 VEC. It will only be used if + the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax + subq $(VEC_SIZE * 2 - 1), %rdi + addq %rdi, %rax +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif +# endif VZEROUPPER_RETURN + +# ifdef USE_AS_STRNLEN .p2align 4 -L(last_2x_vec): - addl $(VEC_SIZE * 2), %esi - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax +L(last_4x_vec_or_less_load): + /* Depending on entry adjust rdi / prepare first VEC in ymm1. 
*/ + subq $-(VEC_SIZE * 4), %rdi +L(last_4x_vec_or_less_cmpeq): + VPCMPEQ 1(%rdi), %ymm0, %ymm1 +L(last_4x_vec_or_less): - jnz L(first_vec_x0_check) - subl $VEC_SIZE, %esi - jle L(max) + vpmovmskb %ymm1, %eax + /* If remaining length > VEC_SIZE * 2. This works if esi is off by + VEC_SIZE * 4. */ + testl $(VEC_SIZE * 2), %esi + jnz L(last_4x_vec) - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + /* length may have been negative or positive by an offset of + VEC_SIZE * 4 depending on where this was called from. This fixes + that. */ + andl $(VEC_SIZE * 4 - 1), %esi testl %eax, %eax - jnz L(first_vec_x1_check) - movq %r8, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - VZEROUPPER_RETURN + jnz L(last_vec_x1_check) - .p2align 4 -L(first_vec_x0_check): + subl $VEC_SIZE, %esi + jb L(max) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax tzcntl %eax, %eax /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + addl $(VEC_SIZE + 1), %eax addq %rdi, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN shrq $2, %rax # endif VZEROUPPER_RETURN +# endif .p2align 4 -L(first_vec_x1_check): +L(last_vec_return_x0): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $VEC_SIZE, %rax + subq $(VEC_SIZE * 4 - 1), %rdi addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif +# endif VZEROUPPER_RETURN .p2align 4 -L(first_vec_x2_check): +L(last_vec_return_x1): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 2), %rax + subq $(VEC_SIZE * 3 - 1), %rdi addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif +# endif VZEROUPPER_RETURN +# ifdef USE_AS_STRNLEN .p2align 4 -L(first_vec_x3_check): +L(last_vec_x1_check): + tzcntl %eax, %eax /* Check the end of data. 
*/ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 3), %rax + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + incl %eax addq %rdi, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN shrq $2, %rax # endif VZEROUPPER_RETURN - .p2align 4 L(max): movq %r8, %rax + VZEROUPPER_RETURN + + .p2align 4 +L(last_4x_vec): + /* Test first 2x VEC normally. */ + testl %eax, %eax + jnz L(last_vec_x1) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + /* Normalize length. */ + andl $(VEC_SIZE * 4 - 1), %esi + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + subl $(VEC_SIZE * 3), %esi + jb L(max) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + addl $(VEC_SIZE * 3 + 1), %eax + addq %rdi, %rax # ifdef USE_AS_WCSLEN shrq $2, %rax # endif VZEROUPPER_RETURN - .p2align 4 -L(zero): - xorl %eax, %eax - ret -# endif .p2align 4 -L(first_vec_x0): +L(last_vec_x1): + /* essentially duplicates of first_vec_x1 but use 64 bit + instructions. */ tzcntl %eax, %eax + subq %rdx, %rdi + incl %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif +# endif VZEROUPPER_RETURN .p2align 4 -L(first_vec_x1): +L(last_vec_x2): + /* essentially duplicates of first_vec_x1 but use 64 bit + instructions. */ tzcntl %eax, %eax - addq $VEC_SIZE, %rax + subq %rdx, %rdi + addl $(VEC_SIZE + 1), %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif +# endif VZEROUPPER_RETURN .p2align 4 -L(first_vec_x2): +L(last_vec_x3): tzcntl %eax, %eax - addq $(VEC_SIZE * 2), %rax + subl $(VEC_SIZE * 2), %esi + /* Check the end of data. 
*/ + cmpl %eax, %esi + jb L(max_end) + subq %rdx, %rdi + addl $(VEC_SIZE * 2 + 1), %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif +# endif + VZEROUPPER_RETURN +L(max_end): + movq %r8, %rax VZEROUPPER_RETURN +# endif + /* Cold case for crossing page with first load. */ .p2align 4 -L(4x_vec_end): - VPCMPEQ %ymm1, %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - VPCMPEQ %ymm2, %ymm0, %ymm2 - vpmovmskb %ymm2, %eax +L(cross_page_boundary): + /* Align data to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod rdx. */ + sarxl %edx, %eax, %eax +# ifdef USE_AS_STRNLEN testl %eax, %eax - jnz L(first_vec_x1) - VPCMPEQ %ymm3, %ymm0, %ymm3 - vpmovmskb %ymm3, %eax + jnz L(cross_page_less_vec) + leaq 1(%rdi), %rcx + subq %rdx, %rcx + /* Check length. */ + cmpq %rsi, %rcx + jb L(cross_page_continue) + movq %r8, %rax +# else testl %eax, %eax - jnz L(first_vec_x2) - VPCMPEQ %ymm4, %ymm0, %ymm4 - vpmovmskb %ymm4, %eax -L(first_vec_x3): + jz L(cross_page_continue) tzcntl %eax, %eax - addq $(VEC_SIZE * 3), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif # endif +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + +# ifdef USE_AS_STRNLEN + .p2align 4 +L(cross_page_less_vec): + tzcntl %eax, %eax + cmpq %rax, %rsi + cmovb %esi, %eax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif VZEROUPPER_RETURN +# endif END (STRLEN) #endif -- 2.29.2 ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2021-04-19 23:36 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Noah Goldstein @ 2021-04-20 1:01 ` H.J. Lu 2022-09-25 8:19 ` Aurelien Jarno 1 sibling, 0 replies; 24+ messages in thread From: H.J. Lu @ 2021-04-20 1:01 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, carlos On Mon, Apr 19, 2021 at 4:36 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > No bug. This commit optimizes strlen-avx2.S. The optimizations are > mostly small things but they add up to roughly 10-30% performance > improvement for strlen. The results for strnlen are bit more > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > are all passing. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > --- > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- > 2 files changed, 334 insertions(+), 214 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index c377cab629..651b32908e 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > IFUNC_IMPL (i, name, strlen, > IFUNC_IMPL_ADD (array, i, strlen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __strlen_avx2) > IFUNC_IMPL_ADD (array, i, strlen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __strlen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, strlen, > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/strnlen.c. 
*/ > IFUNC_IMPL (i, name, strnlen, > IFUNC_IMPL_ADD (array, i, strnlen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __strnlen_avx2) > IFUNC_IMPL_ADD (array, i, strnlen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __strnlen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, strnlen, > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/wcslen.c. */ > IFUNC_IMPL (i, name, wcslen, > IFUNC_IMPL_ADD (array, i, wcslen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __wcslen_avx2) > IFUNC_IMPL_ADD (array, i, wcslen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __wcslen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, wcslen, > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/wcsnlen.c. 
*/ > IFUNC_IMPL (i, name, wcsnlen, > IFUNC_IMPL_ADD (array, i, wcsnlen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __wcsnlen_avx2) > IFUNC_IMPL_ADD (array, i, wcsnlen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __wcsnlen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, wcsnlen, > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S > index 1caae9e6bc..bd2e6ee44a 100644 > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S > @@ -27,9 +27,11 @@ > # ifdef USE_AS_WCSLEN > # define VPCMPEQ vpcmpeqd > # define VPMINU vpminud > +# define CHAR_SIZE 4 > # else > # define VPCMPEQ vpcmpeqb > # define VPMINU vpminub > +# define CHAR_SIZE 1 > # endif > > # ifndef VZEROUPPER > @@ -41,349 +43,459 @@ > # endif > > # define VEC_SIZE 32 > +# define PAGE_SIZE 4096 > > .section SECTION(.text),"ax",@progbits > ENTRY (STRLEN) > # ifdef USE_AS_STRNLEN > - /* Check for zero length. */ > + /* Check zero length. */ > test %RSI_LP, %RSI_LP > jz L(zero) > + /* Store max len in R8_LP before adjusting if using WCSLEN. */ > + mov %RSI_LP, %R8_LP > # ifdef USE_AS_WCSLEN > shl $2, %RSI_LP > # elif defined __ILP32__ > /* Clear the upper 32 bits. */ > movl %esi, %esi > # endif > - mov %RSI_LP, %R8_LP > # endif > - movl %edi, %ecx > + movl %edi, %eax > movq %rdi, %rdx > vpxor %xmm0, %xmm0, %xmm0 > - > + /* Clear high bits from edi. Only keeping bits relevant to page > + cross check. */ > + andl $(PAGE_SIZE - 1), %eax > /* Check if we may cross page boundary with one vector load. */ > - andl $(2 * VEC_SIZE - 1), %ecx > - cmpl $VEC_SIZE, %ecx > - ja L(cros_page_boundary) > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(cross_page_boundary) > > /* Check the first VEC_SIZE bytes. 
*/ > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - > + VPCMPEQ (%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > # ifdef USE_AS_STRNLEN > - jnz L(first_vec_x0_check) > - /* Adjust length and check the end of data. */ > - subq $VEC_SIZE, %rsi > - jbe L(max) > -# else > - jnz L(first_vec_x0) > + /* If length < VEC_SIZE handle special. */ > + cmpq $VEC_SIZE, %rsi > + jbe L(first_vec_x0) > # endif > - > - /* Align data for aligned loads in the loop. */ > - addq $VEC_SIZE, %rdi > - andl $(VEC_SIZE - 1), %ecx > - andq $-VEC_SIZE, %rdi > + /* If empty continue to aligned_more. Otherwise return bit > + position of first match. */ > + testl %eax, %eax > + jz L(aligned_more) > + tzcntl %eax, %eax > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > # ifdef USE_AS_STRNLEN > - /* Adjust length. */ > - addq %rcx, %rsi > +L(zero): > + xorl %eax, %eax > + ret > > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > + .p2align 4 > +L(first_vec_x0): > + /* Set bit for max len so that tzcnt will return min of max len > + and position of first match. */ > + btsq %rsi, %rax > + tzcntl %eax, %eax > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > # endif > - jmp L(more_4x_vec) > > .p2align 4 > -L(cros_page_boundary): > - andl $(VEC_SIZE - 1), %ecx > - andq $-VEC_SIZE, %rdi > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - /* Remove the leading bytes. */ > - sarl %cl, %eax > - testl %eax, %eax > - jz L(aligned_more) > +L(first_vec_x1): > tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > + /* Use ecx which was computed earlier to compute correct value. 
> + */ > + subl $(VEC_SIZE * 4 + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + incl %edi > + addl %edi, %eax > # endif > - addq %rdi, %rax > - addq %rcx, %rax > - subq %rdx, %rax > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + shrl $2, %eax > # endif > -L(return_vzeroupper): > - ZERO_UPPER_VEC_REGISTERS_RETURN > + VZEROUPPER_RETURN > > .p2align 4 > -L(aligned_more): > +L(first_vec_x2): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > - to void possible addition overflow. */ > - negq %rcx > - addq $VEC_SIZE, %rcx > - > - /* Check the end of data. */ > - subq %rcx, %rsi > - jbe L(max) > + /* Use ecx which was computed earlier to compute correct value. > + */ > + subl $(VEC_SIZE * 3 + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + addl $(VEC_SIZE + 1), %edi > + addl %edi, %eax > # endif > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > - addq $VEC_SIZE, %rdi > + .p2align 4 > +L(first_vec_x3): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > +# ifdef USE_AS_STRNLEN > + /* Use ecx which was computed earlier to compute correct value. > + */ > + subl $(VEC_SIZE * 2 + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + addl $(VEC_SIZE * 2 + 1), %edi > + addl %edi, %eax > +# endif > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > + .p2align 4 > +L(first_vec_x4): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > + /* Use ecx which was computed earlier to compute correct value. 
> + */ > + subl $(VEC_SIZE + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + addl $(VEC_SIZE * 3 + 1), %edi > + addl %edi, %eax > # endif > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > -L(more_4x_vec): > + .p2align 5 > +L(aligned_more): > + /* Align data to VEC_SIZE - 1. This is the same number of > + instructions as using andq with -VEC_SIZE but saves 4 bytes of > + code on the x4 check. */ > + orq $(VEC_SIZE - 1), %rdi > +L(cross_page_continue): > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > since data is only aligned to VEC_SIZE. */ > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > +# ifdef USE_AS_STRNLEN > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because > + it simplies the logic in last_4x_vec_or_less. */ > + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx > + subq %rdx, %rcx > +# endif > + /* Load first VEC regardless. */ > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > +# ifdef USE_AS_STRNLEN > + /* Adjust length. If near end handle specially. */ > + subq %rcx, %rsi > + jb L(last_4x_vec_or_less) > +# endif > + vpmovmskb %ymm1, %eax > testl %eax, %eax > jnz L(first_vec_x1) > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > testl %eax, %eax > jnz L(first_vec_x2) > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > testl %eax, %eax > jnz L(first_vec_x3) > > - addq $(VEC_SIZE * 4), %rdi > - > -# ifdef USE_AS_STRNLEN > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > -# endif > - > - /* Align data to 4 * VEC_SIZE. 
*/ > - movq %rdi, %rcx > - andl $(4 * VEC_SIZE - 1), %ecx > - andq $-(4 * VEC_SIZE), %rdi > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + testl %eax, %eax > + jnz L(first_vec_x4) > > + /* Align data to VEC_SIZE * 4 - 1. */ > # ifdef USE_AS_STRNLEN > - /* Adjust length. */ > + /* Before adjusting length check if at last VEC_SIZE * 4. */ > + cmpq $(VEC_SIZE * 4 - 1), %rsi > + jbe L(last_4x_vec_or_less_load) > + incq %rdi > + movl %edi, %ecx > + orq $(VEC_SIZE * 4 - 1), %rdi > + andl $(VEC_SIZE * 4 - 1), %ecx > + /* Readjust length. */ > addq %rcx, %rsi > +# else > + incq %rdi > + orq $(VEC_SIZE * 4 - 1), %rdi > # endif > - > + /* Compare 4 * VEC at a time forward. */ > .p2align 4 > L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > - vmovdqa (%rdi), %ymm1 > - vmovdqa VEC_SIZE(%rdi), %ymm2 > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > - VPMINU %ymm1, %ymm2, %ymm5 > - VPMINU %ymm3, %ymm4, %ymm6 > - VPMINU %ymm5, %ymm6, %ymm5 > - > - VPCMPEQ %ymm5, %ymm0, %ymm5 > - vpmovmskb %ymm5, %eax > - testl %eax, %eax > - jnz L(4x_vec_end) > - > - addq $(VEC_SIZE * 4), %rdi > - > -# ifndef USE_AS_STRNLEN > - jmp L(loop_4x_vec) > -# else > +# ifdef USE_AS_STRNLEN > + /* Break if at end of length. */ > subq $(VEC_SIZE * 4), %rsi > - ja L(loop_4x_vec) > - > -L(last_4x_vec_or_less): > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > - addl $(VEC_SIZE * 2), %esi > - jle L(last_2x_vec) > + jb L(last_4x_vec_or_less_cmpeq) > +# endif > + /* Save some code size by microfusing VPMINU with the load. Since > + the matches in ymm2/ymm4 can only be returned if there where no > + matches in ymm1/ymm3 respectively there is no issue with overlap. 
> + */ > + vmovdqa 1(%rdi), %ymm1 > + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 > + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 > + > + VPMINU %ymm2, %ymm4, %ymm5 > + VPCMPEQ %ymm5, %ymm0, %ymm5 > + vpmovmskb %ymm5, %ecx > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > + subq $-(VEC_SIZE * 4), %rdi > + testl %ecx, %ecx > + jz L(loop_4x_vec) > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x1) > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ %ymm1, %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + subq %rdx, %rdi > testl %eax, %eax > + jnz L(last_vec_return_x0) > > - jnz L(first_vec_x2_check) > - subl $VEC_SIZE, %esi > - jle L(max) > - > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ %ymm2, %ymm0, %ymm2 > + vpmovmskb %ymm2, %eax > testl %eax, %eax > - > - jnz L(first_vec_x3_check) > - movq %r8, %rax > -# ifdef USE_AS_WCSLEN > + jnz L(last_vec_return_x1) > + > + /* Combine last 2 VEC. */ > + VPCMPEQ %ymm3, %ymm0, %ymm3 > + vpmovmskb %ymm3, %eax > + /* rcx has combined result from all 4 VEC. It will only be used if > + the first 3 other VEC all did not contain a match. */ > + salq $32, %rcx > + orq %rcx, %rax > + tzcntq %rax, %rax > + subq $(VEC_SIZE * 2 - 1), %rdi > + addq %rdi, %rax > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > + > +# ifdef USE_AS_STRNLEN > .p2align 4 > -L(last_2x_vec): > - addl $(VEC_SIZE * 2), %esi > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > +L(last_4x_vec_or_less_load): > + /* Depending on entry adjust rdi / prepare first VEC in ymm1. 
*/ > + subq $-(VEC_SIZE * 4), %rdi > +L(last_4x_vec_or_less_cmpeq): > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > +L(last_4x_vec_or_less): > > - jnz L(first_vec_x0_check) > - subl $VEC_SIZE, %esi > - jle L(max) > + vpmovmskb %ymm1, %eax > + /* If remaining length > VEC_SIZE * 2. This works if esi is off by > + VEC_SIZE * 4. */ > + testl $(VEC_SIZE * 2), %esi > + jnz L(last_4x_vec) > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + /* length may have been negative or positive by an offset of > + VEC_SIZE * 4 depending on where this was called from. This fixes > + that. */ > + andl $(VEC_SIZE * 4 - 1), %esi > testl %eax, %eax > - jnz L(first_vec_x1_check) > - movq %r8, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > - VZEROUPPER_RETURN > + jnz L(last_vec_x1_check) > > - .p2align 4 > -L(first_vec_x0_check): > + subl $VEC_SIZE, %esi > + jb L(max) > + > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > tzcntl %eax, %eax > /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > + addl $(VEC_SIZE + 1), %eax > addq %rdi, %rax > - subq %rdx, %rax > # ifdef USE_AS_WCSLEN > shrq $2, %rax > # endif > VZEROUPPER_RETURN > +# endif > > .p2align 4 > -L(first_vec_x1_check): > +L(last_vec_return_x0): > tzcntl %eax, %eax > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $VEC_SIZE, %rax > + subq $(VEC_SIZE * 4 - 1), %rdi > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > .p2align 4 > -L(first_vec_x2_check): > +L(last_vec_return_x1): > tzcntl %eax, %eax > - /* Check the end of data. 
*/ > - cmpq %rax, %rsi > - jbe L(max) > - addq $(VEC_SIZE * 2), %rax > + subq $(VEC_SIZE * 3 - 1), %rdi > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > +# ifdef USE_AS_STRNLEN > .p2align 4 > -L(first_vec_x3_check): > +L(last_vec_x1_check): > + > tzcntl %eax, %eax > /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $(VEC_SIZE * 3), %rax > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > + incl %eax > addq %rdi, %rax > - subq %rdx, %rax > # ifdef USE_AS_WCSLEN > shrq $2, %rax > # endif > VZEROUPPER_RETURN > > - .p2align 4 > L(max): > movq %r8, %rax > + VZEROUPPER_RETURN > + > + .p2align 4 > +L(last_4x_vec): > + /* Test first 2x VEC normally. */ > + testl %eax, %eax > + jnz L(last_vec_x1) > + > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + testl %eax, %eax > + jnz L(last_vec_x2) > + > + /* Normalize length. */ > + andl $(VEC_SIZE * 4 - 1), %esi > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + testl %eax, %eax > + jnz L(last_vec_x3) > + > + subl $(VEC_SIZE * 3), %esi > + jb L(max) > + > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + tzcntl %eax, %eax > + /* Check the end of data. */ > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > + addl $(VEC_SIZE * 3 + 1), %eax > + addq %rdi, %rax > # ifdef USE_AS_WCSLEN > shrq $2, %rax > # endif > VZEROUPPER_RETURN > > - .p2align 4 > -L(zero): > - xorl %eax, %eax > - ret > -# endif > > .p2align 4 > -L(first_vec_x0): > +L(last_vec_x1): > + /* essentially duplicates of first_vec_x1 but use 64 bit > + instructions. 
*/ > tzcntl %eax, %eax > + subq %rdx, %rdi > + incl %eax > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > .p2align 4 > -L(first_vec_x1): > +L(last_vec_x2): > + /* essentially duplicates of first_vec_x1 but use 64 bit > + instructions. */ > tzcntl %eax, %eax > - addq $VEC_SIZE, %rax > + subq %rdx, %rdi > + addl $(VEC_SIZE + 1), %eax > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > .p2align 4 > -L(first_vec_x2): > +L(last_vec_x3): > tzcntl %eax, %eax > - addq $(VEC_SIZE * 2), %rax > + subl $(VEC_SIZE * 2), %esi > + /* Check the end of data. */ > + cmpl %eax, %esi > + jb L(max_end) > + subq %rdx, %rdi > + addl $(VEC_SIZE * 2 + 1), %eax > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > + VZEROUPPER_RETURN > +L(max_end): > + movq %r8, %rax > VZEROUPPER_RETURN > +# endif > > + /* Cold case for crossing page with first load. */ > .p2align 4 > -L(4x_vec_end): > - VPCMPEQ %ymm1, %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - VPCMPEQ %ymm2, %ymm0, %ymm2 > - vpmovmskb %ymm2, %eax > +L(cross_page_boundary): > + /* Align data to VEC_SIZE - 1. */ > + orq $(VEC_SIZE - 1), %rdi > + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > + so no need to manually mod rdx. */ > + sarxl %edx, %eax, %eax > +# ifdef USE_AS_STRNLEN > testl %eax, %eax > - jnz L(first_vec_x1) > - VPCMPEQ %ymm3, %ymm0, %ymm3 > - vpmovmskb %ymm3, %eax > + jnz L(cross_page_less_vec) > + leaq 1(%rdi), %rcx > + subq %rdx, %rcx > + /* Check length. 
*/ > + cmpq %rsi, %rcx > + jb L(cross_page_continue) > + movq %r8, %rax > +# else > testl %eax, %eax > - jnz L(first_vec_x2) > - VPCMPEQ %ymm4, %ymm0, %ymm4 > - vpmovmskb %ymm4, %eax > -L(first_vec_x3): > + jz L(cross_page_continue) > tzcntl %eax, %eax > - addq $(VEC_SIZE * 3), %rax > - addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > # endif > +L(return_vzeroupper): > + ZERO_UPPER_VEC_REGISTERS_RETURN > + > +# ifdef USE_AS_STRNLEN > + .p2align 4 > +L(cross_page_less_vec): > + tzcntl %eax, %eax > + cmpq %rax, %rsi > + cmovb %esi, %eax > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > VZEROUPPER_RETURN > +# endif > > END (STRLEN) > #endif > -- > 2.29.2 > LGTM. I am checking it in for you. Thanks. -- H.J. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2021-04-19 23:36 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Noah Goldstein 2021-04-20 1:01 ` H.J. Lu @ 2022-09-25 8:19 ` Aurelien Jarno 2022-09-25 14:00 ` Noah Goldstein 1 sibling, 1 reply; 24+ messages in thread From: Aurelien Jarno @ 2022-09-25 8:19 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote: > No bug. This commit optimizes strlen-avx2.S. The optimizations are > mostly small things but they add up to roughly 10-30% performance > improvement for strlen. The results for strnlen are bit more > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > are all passing. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > --- > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- > 2 files changed, 334 insertions(+), 214 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index c377cab629..651b32908e 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > IFUNC_IMPL (i, name, strlen, > IFUNC_IMPL_ADD (array, i, strlen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __strlen_avx2) > IFUNC_IMPL_ADD (array, i, strlen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __strlen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, strlen, > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/strnlen.c. 
*/ > IFUNC_IMPL (i, name, strnlen, > IFUNC_IMPL_ADD (array, i, strnlen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __strnlen_avx2) > IFUNC_IMPL_ADD (array, i, strnlen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __strnlen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, strnlen, > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/wcslen.c. */ > IFUNC_IMPL (i, name, wcslen, > IFUNC_IMPL_ADD (array, i, wcslen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __wcslen_avx2) > IFUNC_IMPL_ADD (array, i, wcslen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __wcslen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, wcslen, > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/wcsnlen.c. 
*/ > IFUNC_IMPL (i, name, wcsnlen, > IFUNC_IMPL_ADD (array, i, wcsnlen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __wcsnlen_avx2) > IFUNC_IMPL_ADD (array, i, wcsnlen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __wcsnlen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, wcsnlen, > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S > index 1caae9e6bc..bd2e6ee44a 100644 > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S > @@ -27,9 +27,11 @@ > # ifdef USE_AS_WCSLEN > # define VPCMPEQ vpcmpeqd > # define VPMINU vpminud > +# define CHAR_SIZE 4 > # else > # define VPCMPEQ vpcmpeqb > # define VPMINU vpminub > +# define CHAR_SIZE 1 > # endif > > # ifndef VZEROUPPER > @@ -41,349 +43,459 @@ > # endif > > # define VEC_SIZE 32 > +# define PAGE_SIZE 4096 > > .section SECTION(.text),"ax",@progbits > ENTRY (STRLEN) > # ifdef USE_AS_STRNLEN > - /* Check for zero length. */ > + /* Check zero length. */ > test %RSI_LP, %RSI_LP > jz L(zero) > + /* Store max len in R8_LP before adjusting if using WCSLEN. */ > + mov %RSI_LP, %R8_LP > # ifdef USE_AS_WCSLEN > shl $2, %RSI_LP > # elif defined __ILP32__ > /* Clear the upper 32 bits. */ > movl %esi, %esi > # endif > - mov %RSI_LP, %R8_LP > # endif > - movl %edi, %ecx > + movl %edi, %eax > movq %rdi, %rdx > vpxor %xmm0, %xmm0, %xmm0 > - > + /* Clear high bits from edi. Only keeping bits relevant to page > + cross check. */ > + andl $(PAGE_SIZE - 1), %eax > /* Check if we may cross page boundary with one vector load. */ > - andl $(2 * VEC_SIZE - 1), %ecx > - cmpl $VEC_SIZE, %ecx > - ja L(cros_page_boundary) > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(cross_page_boundary) > > /* Check the first VEC_SIZE bytes. 
*/ > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - > + VPCMPEQ (%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > # ifdef USE_AS_STRNLEN > - jnz L(first_vec_x0_check) > - /* Adjust length and check the end of data. */ > - subq $VEC_SIZE, %rsi > - jbe L(max) > -# else > - jnz L(first_vec_x0) > + /* If length < VEC_SIZE handle special. */ > + cmpq $VEC_SIZE, %rsi > + jbe L(first_vec_x0) > # endif > - > - /* Align data for aligned loads in the loop. */ > - addq $VEC_SIZE, %rdi > - andl $(VEC_SIZE - 1), %ecx > - andq $-VEC_SIZE, %rdi > + /* If empty continue to aligned_more. Otherwise return bit > + position of first match. */ > + testl %eax, %eax > + jz L(aligned_more) > + tzcntl %eax, %eax > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > # ifdef USE_AS_STRNLEN > - /* Adjust length. */ > - addq %rcx, %rsi > +L(zero): > + xorl %eax, %eax > + ret > > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > + .p2align 4 > +L(first_vec_x0): > + /* Set bit for max len so that tzcnt will return min of max len > + and position of first match. */ > + btsq %rsi, %rax > + tzcntl %eax, %eax > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > # endif > - jmp L(more_4x_vec) > > .p2align 4 > -L(cros_page_boundary): > - andl $(VEC_SIZE - 1), %ecx > - andq $-VEC_SIZE, %rdi > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - /* Remove the leading bytes. */ > - sarl %cl, %eax > - testl %eax, %eax > - jz L(aligned_more) > +L(first_vec_x1): > tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > + /* Use ecx which was computed earlier to compute correct value. 
> + */ > + subl $(VEC_SIZE * 4 + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + incl %edi > + addl %edi, %eax > # endif > - addq %rdi, %rax > - addq %rcx, %rax > - subq %rdx, %rax > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + shrl $2, %eax > # endif > -L(return_vzeroupper): > - ZERO_UPPER_VEC_REGISTERS_RETURN > + VZEROUPPER_RETURN > > .p2align 4 > -L(aligned_more): > +L(first_vec_x2): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > - to void possible addition overflow. */ > - negq %rcx > - addq $VEC_SIZE, %rcx > - > - /* Check the end of data. */ > - subq %rcx, %rsi > - jbe L(max) > + /* Use ecx which was computed earlier to compute correct value. > + */ > + subl $(VEC_SIZE * 3 + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + addl $(VEC_SIZE + 1), %edi > + addl %edi, %eax > # endif > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > - addq $VEC_SIZE, %rdi > + .p2align 4 > +L(first_vec_x3): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > +# ifdef USE_AS_STRNLEN > + /* Use ecx which was computed earlier to compute correct value. > + */ > + subl $(VEC_SIZE * 2 + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + addl $(VEC_SIZE * 2 + 1), %edi > + addl %edi, %eax > +# endif > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > + .p2align 4 > +L(first_vec_x4): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > + /* Use ecx which was computed earlier to compute correct value. 
> + */ > + subl $(VEC_SIZE + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + addl $(VEC_SIZE * 3 + 1), %edi > + addl %edi, %eax > # endif > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > -L(more_4x_vec): > + .p2align 5 > +L(aligned_more): > + /* Align data to VEC_SIZE - 1. This is the same number of > + instructions as using andq with -VEC_SIZE but saves 4 bytes of > + code on the x4 check. */ > + orq $(VEC_SIZE - 1), %rdi > +L(cross_page_continue): > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > since data is only aligned to VEC_SIZE. */ > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > +# ifdef USE_AS_STRNLEN > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because > + it simplies the logic in last_4x_vec_or_less. */ > + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx > + subq %rdx, %rcx > +# endif > + /* Load first VEC regardless. */ > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > +# ifdef USE_AS_STRNLEN > + /* Adjust length. If near end handle specially. */ > + subq %rcx, %rsi > + jb L(last_4x_vec_or_less) > +# endif > + vpmovmskb %ymm1, %eax > testl %eax, %eax > jnz L(first_vec_x1) > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > testl %eax, %eax > jnz L(first_vec_x2) > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > testl %eax, %eax > jnz L(first_vec_x3) > > - addq $(VEC_SIZE * 4), %rdi > - > -# ifdef USE_AS_STRNLEN > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > -# endif > - > - /* Align data to 4 * VEC_SIZE. 
*/ > - movq %rdi, %rcx > - andl $(4 * VEC_SIZE - 1), %ecx > - andq $-(4 * VEC_SIZE), %rdi > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + testl %eax, %eax > + jnz L(first_vec_x4) > > + /* Align data to VEC_SIZE * 4 - 1. */ > # ifdef USE_AS_STRNLEN > - /* Adjust length. */ > + /* Before adjusting length check if at last VEC_SIZE * 4. */ > + cmpq $(VEC_SIZE * 4 - 1), %rsi > + jbe L(last_4x_vec_or_less_load) > + incq %rdi > + movl %edi, %ecx > + orq $(VEC_SIZE * 4 - 1), %rdi > + andl $(VEC_SIZE * 4 - 1), %ecx > + /* Readjust length. */ > addq %rcx, %rsi > +# else > + incq %rdi > + orq $(VEC_SIZE * 4 - 1), %rdi > # endif > - > + /* Compare 4 * VEC at a time forward. */ > .p2align 4 > L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > - vmovdqa (%rdi), %ymm1 > - vmovdqa VEC_SIZE(%rdi), %ymm2 > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > - VPMINU %ymm1, %ymm2, %ymm5 > - VPMINU %ymm3, %ymm4, %ymm6 > - VPMINU %ymm5, %ymm6, %ymm5 > - > - VPCMPEQ %ymm5, %ymm0, %ymm5 > - vpmovmskb %ymm5, %eax > - testl %eax, %eax > - jnz L(4x_vec_end) > - > - addq $(VEC_SIZE * 4), %rdi > - > -# ifndef USE_AS_STRNLEN > - jmp L(loop_4x_vec) > -# else > +# ifdef USE_AS_STRNLEN > + /* Break if at end of length. */ > subq $(VEC_SIZE * 4), %rsi > - ja L(loop_4x_vec) > - > -L(last_4x_vec_or_less): > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > - addl $(VEC_SIZE * 2), %esi > - jle L(last_2x_vec) > + jb L(last_4x_vec_or_less_cmpeq) > +# endif > + /* Save some code size by microfusing VPMINU with the load. Since > + the matches in ymm2/ymm4 can only be returned if there where no > + matches in ymm1/ymm3 respectively there is no issue with overlap. 
> + */ > + vmovdqa 1(%rdi), %ymm1 > + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 > + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 > + > + VPMINU %ymm2, %ymm4, %ymm5 > + VPCMPEQ %ymm5, %ymm0, %ymm5 > + vpmovmskb %ymm5, %ecx > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > + subq $-(VEC_SIZE * 4), %rdi > + testl %ecx, %ecx > + jz L(loop_4x_vec) > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x1) > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ %ymm1, %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + subq %rdx, %rdi > testl %eax, %eax > + jnz L(last_vec_return_x0) > > - jnz L(first_vec_x2_check) > - subl $VEC_SIZE, %esi > - jle L(max) > - > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ %ymm2, %ymm0, %ymm2 > + vpmovmskb %ymm2, %eax > testl %eax, %eax > - > - jnz L(first_vec_x3_check) > - movq %r8, %rax > -# ifdef USE_AS_WCSLEN > + jnz L(last_vec_return_x1) > + > + /* Combine last 2 VEC. */ > + VPCMPEQ %ymm3, %ymm0, %ymm3 > + vpmovmskb %ymm3, %eax > + /* rcx has combined result from all 4 VEC. It will only be used if > + the first 3 other VEC all did not contain a match. */ > + salq $32, %rcx > + orq %rcx, %rax > + tzcntq %rax, %rax > + subq $(VEC_SIZE * 2 - 1), %rdi > + addq %rdi, %rax > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > + > +# ifdef USE_AS_STRNLEN > .p2align 4 > -L(last_2x_vec): > - addl $(VEC_SIZE * 2), %esi > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > +L(last_4x_vec_or_less_load): > + /* Depending on entry adjust rdi / prepare first VEC in ymm1. 
*/ > + subq $-(VEC_SIZE * 4), %rdi > +L(last_4x_vec_or_less_cmpeq): > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > +L(last_4x_vec_or_less): > > - jnz L(first_vec_x0_check) > - subl $VEC_SIZE, %esi > - jle L(max) > + vpmovmskb %ymm1, %eax > + /* If remaining length > VEC_SIZE * 2. This works if esi is off by > + VEC_SIZE * 4. */ > + testl $(VEC_SIZE * 2), %esi > + jnz L(last_4x_vec) > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + /* length may have been negative or positive by an offset of > + VEC_SIZE * 4 depending on where this was called from. This fixes > + that. */ > + andl $(VEC_SIZE * 4 - 1), %esi > testl %eax, %eax > - jnz L(first_vec_x1_check) > - movq %r8, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > - VZEROUPPER_RETURN > + jnz L(last_vec_x1_check) > > - .p2align 4 > -L(first_vec_x0_check): > + subl $VEC_SIZE, %esi > + jb L(max) > + > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > tzcntl %eax, %eax > /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > + addl $(VEC_SIZE + 1), %eax > addq %rdi, %rax > - subq %rdx, %rax > # ifdef USE_AS_WCSLEN > shrq $2, %rax > # endif > VZEROUPPER_RETURN > +# endif > > .p2align 4 > -L(first_vec_x1_check): > +L(last_vec_return_x0): > tzcntl %eax, %eax > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $VEC_SIZE, %rax > + subq $(VEC_SIZE * 4 - 1), %rdi > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > .p2align 4 > -L(first_vec_x2_check): > +L(last_vec_return_x1): > tzcntl %eax, %eax > - /* Check the end of data. 
*/ > - cmpq %rax, %rsi > - jbe L(max) > - addq $(VEC_SIZE * 2), %rax > + subq $(VEC_SIZE * 3 - 1), %rdi > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > +# ifdef USE_AS_STRNLEN > .p2align 4 > -L(first_vec_x3_check): > +L(last_vec_x1_check): > + > tzcntl %eax, %eax > /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $(VEC_SIZE * 3), %rax > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > + incl %eax > addq %rdi, %rax > - subq %rdx, %rax > # ifdef USE_AS_WCSLEN > shrq $2, %rax > # endif > VZEROUPPER_RETURN > > - .p2align 4 > L(max): > movq %r8, %rax > + VZEROUPPER_RETURN > + > + .p2align 4 > +L(last_4x_vec): > + /* Test first 2x VEC normally. */ > + testl %eax, %eax > + jnz L(last_vec_x1) > + > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + testl %eax, %eax > + jnz L(last_vec_x2) > + > + /* Normalize length. */ > + andl $(VEC_SIZE * 4 - 1), %esi > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + testl %eax, %eax > + jnz L(last_vec_x3) > + > + subl $(VEC_SIZE * 3), %esi > + jb L(max) > + > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + tzcntl %eax, %eax > + /* Check the end of data. */ > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > + addl $(VEC_SIZE * 3 + 1), %eax > + addq %rdi, %rax > # ifdef USE_AS_WCSLEN > shrq $2, %rax > # endif > VZEROUPPER_RETURN > > - .p2align 4 > -L(zero): > - xorl %eax, %eax > - ret > -# endif > > .p2align 4 > -L(first_vec_x0): > +L(last_vec_x1): > + /* essentially duplicates of first_vec_x1 but use 64 bit > + instructions. 
*/ > tzcntl %eax, %eax > + subq %rdx, %rdi > + incl %eax > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > .p2align 4 > -L(first_vec_x1): > +L(last_vec_x2): > + /* essentially duplicates of first_vec_x1 but use 64 bit > + instructions. */ > tzcntl %eax, %eax > - addq $VEC_SIZE, %rax > + subq %rdx, %rdi > + addl $(VEC_SIZE + 1), %eax > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > .p2align 4 > -L(first_vec_x2): > +L(last_vec_x3): > tzcntl %eax, %eax > - addq $(VEC_SIZE * 2), %rax > + subl $(VEC_SIZE * 2), %esi > + /* Check the end of data. */ > + cmpl %eax, %esi > + jb L(max_end) > + subq %rdx, %rdi > + addl $(VEC_SIZE * 2 + 1), %eax > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > + VZEROUPPER_RETURN > +L(max_end): > + movq %r8, %rax > VZEROUPPER_RETURN > +# endif > > + /* Cold case for crossing page with first load. */ > .p2align 4 > -L(4x_vec_end): > - VPCMPEQ %ymm1, %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - VPCMPEQ %ymm2, %ymm0, %ymm2 > - vpmovmskb %ymm2, %eax > +L(cross_page_boundary): > + /* Align data to VEC_SIZE - 1. */ > + orq $(VEC_SIZE - 1), %rdi > + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > + so no need to manually mod rdx. */ > + sarxl %edx, %eax, %eax This is a BMI2 instruction, which is not necessary available when AVX2 is available. This causes SIGILL on some CPU. I have reported that in https://sourceware.org/bugzilla/show_bug.cgi?id=29611 Regards Aurelien -- Aurelien Jarno GPG: 4096R/1DDD8C9B aurelien@aurel32.net http://www.aurel32.net ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-09-25 8:19 ` Aurelien Jarno @ 2022-09-25 14:00 ` Noah Goldstein 2022-09-28 13:54 ` Sunil Pandey 0 siblings, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2022-09-25 14:00 UTC (permalink / raw) To: Noah Goldstein, GNU C Library On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote: > > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote: > > No bug. This commit optimizes strlen-avx2.S. The optimizations are > > mostly small things but they add up to roughly 10-30% performance > > improvement for strlen. The results for strnlen are bit more > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > > are all passing. > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > --- > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > > sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- > > 2 files changed, 334 insertions(+), 214 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index c377cab629..651b32908e 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > > IFUNC_IMPL (i, name, strlen, > > IFUNC_IMPL_ADD (array, i, strlen, > > - CPU_FEATURE_USABLE (AVX2), > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2)), > > __strlen_avx2) > > IFUNC_IMPL_ADD (array, i, strlen, > > (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2) > > && CPU_FEATURE_USABLE (RTM)), > > __strlen_avx2_rtm) > > IFUNC_IMPL_ADD (array, i, strlen, > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/strnlen.c. 
*/ > > IFUNC_IMPL (i, name, strnlen, > > IFUNC_IMPL_ADD (array, i, strnlen, > > - CPU_FEATURE_USABLE (AVX2), > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2)), > > __strnlen_avx2) > > IFUNC_IMPL_ADD (array, i, strnlen, > > (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2) > > && CPU_FEATURE_USABLE (RTM)), > > __strnlen_avx2_rtm) > > IFUNC_IMPL_ADD (array, i, strnlen, > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/wcslen.c. */ > > IFUNC_IMPL (i, name, wcslen, > > IFUNC_IMPL_ADD (array, i, wcslen, > > - CPU_FEATURE_USABLE (AVX2), > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2)), > > __wcslen_avx2) > > IFUNC_IMPL_ADD (array, i, wcslen, > > (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2) > > && CPU_FEATURE_USABLE (RTM)), > > __wcslen_avx2_rtm) > > IFUNC_IMPL_ADD (array, i, wcslen, > > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/wcsnlen.c. 
*/ > > IFUNC_IMPL (i, name, wcsnlen, > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > - CPU_FEATURE_USABLE (AVX2), > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2)), > > __wcsnlen_avx2) > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2) > > && CPU_FEATURE_USABLE (RTM)), > > __wcsnlen_avx2_rtm) > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S > > index 1caae9e6bc..bd2e6ee44a 100644 > > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S > > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S > > @@ -27,9 +27,11 @@ > > # ifdef USE_AS_WCSLEN > > # define VPCMPEQ vpcmpeqd > > # define VPMINU vpminud > > +# define CHAR_SIZE 4 > > # else > > # define VPCMPEQ vpcmpeqb > > # define VPMINU vpminub > > +# define CHAR_SIZE 1 > > # endif > > > > # ifndef VZEROUPPER > > @@ -41,349 +43,459 @@ > > # endif > > > > # define VEC_SIZE 32 > > +# define PAGE_SIZE 4096 > > > > .section SECTION(.text),"ax",@progbits > > ENTRY (STRLEN) > > # ifdef USE_AS_STRNLEN > > - /* Check for zero length. */ > > + /* Check zero length. */ > > test %RSI_LP, %RSI_LP > > jz L(zero) > > + /* Store max len in R8_LP before adjusting if using WCSLEN. */ > > + mov %RSI_LP, %R8_LP > > # ifdef USE_AS_WCSLEN > > shl $2, %RSI_LP > > # elif defined __ILP32__ > > /* Clear the upper 32 bits. */ > > movl %esi, %esi > > # endif > > - mov %RSI_LP, %R8_LP > > # endif > > - movl %edi, %ecx > > + movl %edi, %eax > > movq %rdi, %rdx > > vpxor %xmm0, %xmm0, %xmm0 > > - > > + /* Clear high bits from edi. Only keeping bits relevant to page > > + cross check. */ > > + andl $(PAGE_SIZE - 1), %eax > > /* Check if we may cross page boundary with one vector load. */ > > - andl $(2 * VEC_SIZE - 1), %ecx > > - cmpl $VEC_SIZE, %ecx > > - ja L(cros_page_boundary) > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > + ja L(cross_page_boundary) > > > > /* Check the first VEC_SIZE bytes. 
*/ > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - > > + VPCMPEQ (%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > # ifdef USE_AS_STRNLEN > > - jnz L(first_vec_x0_check) > > - /* Adjust length and check the end of data. */ > > - subq $VEC_SIZE, %rsi > > - jbe L(max) > > -# else > > - jnz L(first_vec_x0) > > + /* If length < VEC_SIZE handle special. */ > > + cmpq $VEC_SIZE, %rsi > > + jbe L(first_vec_x0) > > # endif > > - > > - /* Align data for aligned loads in the loop. */ > > - addq $VEC_SIZE, %rdi > > - andl $(VEC_SIZE - 1), %ecx > > - andq $-VEC_SIZE, %rdi > > + /* If empty continue to aligned_more. Otherwise return bit > > + position of first match. */ > > + testl %eax, %eax > > + jz L(aligned_more) > > + tzcntl %eax, %eax > > +# ifdef USE_AS_WCSLEN > > + shrl $2, %eax > > +# endif > > + VZEROUPPER_RETURN > > > > # ifdef USE_AS_STRNLEN > > - /* Adjust length. */ > > - addq %rcx, %rsi > > +L(zero): > > + xorl %eax, %eax > > + ret > > > > - subq $(VEC_SIZE * 4), %rsi > > - jbe L(last_4x_vec_or_less) > > + .p2align 4 > > +L(first_vec_x0): > > + /* Set bit for max len so that tzcnt will return min of max len > > + and position of first match. */ > > + btsq %rsi, %rax > > + tzcntl %eax, %eax > > +# ifdef USE_AS_WCSLEN > > + shrl $2, %eax > > +# endif > > + VZEROUPPER_RETURN > > # endif > > - jmp L(more_4x_vec) > > > > .p2align 4 > > -L(cros_page_boundary): > > - andl $(VEC_SIZE - 1), %ecx > > - andq $-VEC_SIZE, %rdi > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - /* Remove the leading bytes. */ > > - sarl %cl, %eax > > - testl %eax, %eax > > - jz L(aligned_more) > > +L(first_vec_x1): > > tzcntl %eax, %eax > > + /* Safe to use 32 bit instructions as these are only called for > > + size = [1, 159]. */ > > # ifdef USE_AS_STRNLEN > > - /* Check the end of data. */ > > - cmpq %rax, %rsi > > - jbe L(max) > > + /* Use ecx which was computed earlier to compute correct value. 
> > + */ > > + subl $(VEC_SIZE * 4 + 1), %ecx > > + addl %ecx, %eax > > +# else > > + subl %edx, %edi > > + incl %edi > > + addl %edi, %eax > > # endif > > - addq %rdi, %rax > > - addq %rcx, %rax > > - subq %rdx, %rax > > # ifdef USE_AS_WCSLEN > > - shrq $2, %rax > > + shrl $2, %eax > > # endif > > -L(return_vzeroupper): > > - ZERO_UPPER_VEC_REGISTERS_RETURN > > + VZEROUPPER_RETURN > > > > .p2align 4 > > -L(aligned_more): > > +L(first_vec_x2): > > + tzcntl %eax, %eax > > + /* Safe to use 32 bit instructions as these are only called for > > + size = [1, 159]. */ > > # ifdef USE_AS_STRNLEN > > - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" > > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > > - to void possible addition overflow. */ > > - negq %rcx > > - addq $VEC_SIZE, %rcx > > - > > - /* Check the end of data. */ > > - subq %rcx, %rsi > > - jbe L(max) > > + /* Use ecx which was computed earlier to compute correct value. > > + */ > > + subl $(VEC_SIZE * 3 + 1), %ecx > > + addl %ecx, %eax > > +# else > > + subl %edx, %edi > > + addl $(VEC_SIZE + 1), %edi > > + addl %edi, %eax > > # endif > > +# ifdef USE_AS_WCSLEN > > + shrl $2, %eax > > +# endif > > + VZEROUPPER_RETURN > > > > - addq $VEC_SIZE, %rdi > > + .p2align 4 > > +L(first_vec_x3): > > + tzcntl %eax, %eax > > + /* Safe to use 32 bit instructions as these are only called for > > + size = [1, 159]. */ > > +# ifdef USE_AS_STRNLEN > > + /* Use ecx which was computed earlier to compute correct value. > > + */ > > + subl $(VEC_SIZE * 2 + 1), %ecx > > + addl %ecx, %eax > > +# else > > + subl %edx, %edi > > + addl $(VEC_SIZE * 2 + 1), %edi > > + addl %edi, %eax > > +# endif > > +# ifdef USE_AS_WCSLEN > > + shrl $2, %eax > > +# endif > > + VZEROUPPER_RETURN > > > > + .p2align 4 > > +L(first_vec_x4): > > + tzcntl %eax, %eax > > + /* Safe to use 32 bit instructions as these are only called for > > + size = [1, 159]. 
*/ > > # ifdef USE_AS_STRNLEN > > - subq $(VEC_SIZE * 4), %rsi > > - jbe L(last_4x_vec_or_less) > > + /* Use ecx which was computed earlier to compute correct value. > > + */ > > + subl $(VEC_SIZE + 1), %ecx > > + addl %ecx, %eax > > +# else > > + subl %edx, %edi > > + addl $(VEC_SIZE * 3 + 1), %edi > > + addl %edi, %eax > > # endif > > +# ifdef USE_AS_WCSLEN > > + shrl $2, %eax > > +# endif > > + VZEROUPPER_RETURN > > > > -L(more_4x_vec): > > + .p2align 5 > > +L(aligned_more): > > + /* Align data to VEC_SIZE - 1. This is the same number of > > + instructions as using andq with -VEC_SIZE but saves 4 bytes of > > + code on the x4 check. */ > > + orq $(VEC_SIZE - 1), %rdi > > +L(cross_page_continue): > > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > > since data is only aligned to VEC_SIZE. */ > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x0) > > - > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > +# ifdef USE_AS_STRNLEN > > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because > > + it simplies the logic in last_4x_vec_or_less. */ > > + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx > > + subq %rdx, %rcx > > +# endif > > + /* Load first VEC regardless. */ > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > +# ifdef USE_AS_STRNLEN > > + /* Adjust length. If near end handle specially. 
*/ > > + subq %rcx, %rsi > > + jb L(last_4x_vec_or_less) > > +# endif > > + vpmovmskb %ymm1, %eax > > testl %eax, %eax > > jnz L(first_vec_x1) > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > testl %eax, %eax > > jnz L(first_vec_x2) > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > testl %eax, %eax > > jnz L(first_vec_x3) > > > > - addq $(VEC_SIZE * 4), %rdi > > - > > -# ifdef USE_AS_STRNLEN > > - subq $(VEC_SIZE * 4), %rsi > > - jbe L(last_4x_vec_or_less) > > -# endif > > - > > - /* Align data to 4 * VEC_SIZE. */ > > - movq %rdi, %rcx > > - andl $(4 * VEC_SIZE - 1), %ecx > > - andq $-(4 * VEC_SIZE), %rdi > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + testl %eax, %eax > > + jnz L(first_vec_x4) > > > > + /* Align data to VEC_SIZE * 4 - 1. */ > > # ifdef USE_AS_STRNLEN > > - /* Adjust length. */ > > + /* Before adjusting length check if at last VEC_SIZE * 4. */ > > + cmpq $(VEC_SIZE * 4 - 1), %rsi > > + jbe L(last_4x_vec_or_less_load) > > + incq %rdi > > + movl %edi, %ecx > > + orq $(VEC_SIZE * 4 - 1), %rdi > > + andl $(VEC_SIZE * 4 - 1), %ecx > > + /* Readjust length. */ > > addq %rcx, %rsi > > +# else > > + incq %rdi > > + orq $(VEC_SIZE * 4 - 1), %rdi > > # endif > > - > > + /* Compare 4 * VEC at a time forward. */ > > .p2align 4 > > L(loop_4x_vec): > > - /* Compare 4 * VEC at a time forward. 
*/ > > - vmovdqa (%rdi), %ymm1 > > - vmovdqa VEC_SIZE(%rdi), %ymm2 > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > > - VPMINU %ymm1, %ymm2, %ymm5 > > - VPMINU %ymm3, %ymm4, %ymm6 > > - VPMINU %ymm5, %ymm6, %ymm5 > > - > > - VPCMPEQ %ymm5, %ymm0, %ymm5 > > - vpmovmskb %ymm5, %eax > > - testl %eax, %eax > > - jnz L(4x_vec_end) > > - > > - addq $(VEC_SIZE * 4), %rdi > > - > > -# ifndef USE_AS_STRNLEN > > - jmp L(loop_4x_vec) > > -# else > > +# ifdef USE_AS_STRNLEN > > + /* Break if at end of length. */ > > subq $(VEC_SIZE * 4), %rsi > > - ja L(loop_4x_vec) > > - > > -L(last_4x_vec_or_less): > > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > > - addl $(VEC_SIZE * 2), %esi > > - jle L(last_2x_vec) > > + jb L(last_4x_vec_or_less_cmpeq) > > +# endif > > + /* Save some code size by microfusing VPMINU with the load. Since > > + the matches in ymm2/ymm4 can only be returned if there where no > > + matches in ymm1/ymm3 respectively there is no issue with overlap. 
> > + */ > > + vmovdqa 1(%rdi), %ymm1 > > + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 > > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 > > + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 > > + > > + VPMINU %ymm2, %ymm4, %ymm5 > > + VPCMPEQ %ymm5, %ymm0, %ymm5 > > + vpmovmskb %ymm5, %ecx > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x0) > > + subq $-(VEC_SIZE * 4), %rdi > > + testl %ecx, %ecx > > + jz L(loop_4x_vec) > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x1) > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > + VPCMPEQ %ymm1, %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + subq %rdx, %rdi > > testl %eax, %eax > > + jnz L(last_vec_return_x0) > > > > - jnz L(first_vec_x2_check) > > - subl $VEC_SIZE, %esi > > - jle L(max) > > - > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > + VPCMPEQ %ymm2, %ymm0, %ymm2 > > + vpmovmskb %ymm2, %eax > > testl %eax, %eax > > - > > - jnz L(first_vec_x3_check) > > - movq %r8, %rax > > -# ifdef USE_AS_WCSLEN > > + jnz L(last_vec_return_x1) > > + > > + /* Combine last 2 VEC. */ > > + VPCMPEQ %ymm3, %ymm0, %ymm3 > > + vpmovmskb %ymm3, %eax > > + /* rcx has combined result from all 4 VEC. It will only be used if > > + the first 3 other VEC all did not contain a match. */ > > + salq $32, %rcx > > + orq %rcx, %rax > > + tzcntq %rax, %rax > > + subq $(VEC_SIZE * 2 - 1), %rdi > > + addq %rdi, %rax > > +# ifdef USE_AS_WCSLEN > > shrq $2, %rax > > -# endif > > +# endif > > VZEROUPPER_RETURN > > > > + > > +# ifdef USE_AS_STRNLEN > > .p2align 4 > > -L(last_2x_vec): > > - addl $(VEC_SIZE * 2), %esi > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > +L(last_4x_vec_or_less_load): > > + /* Depending on entry adjust rdi / prepare first VEC in ymm1. 
*/ > > + subq $-(VEC_SIZE * 4), %rdi > > +L(last_4x_vec_or_less_cmpeq): > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > +L(last_4x_vec_or_less): > > > > - jnz L(first_vec_x0_check) > > - subl $VEC_SIZE, %esi > > - jle L(max) > > + vpmovmskb %ymm1, %eax > > + /* If remaining length > VEC_SIZE * 2. This works if esi is off by > > + VEC_SIZE * 4. */ > > + testl $(VEC_SIZE * 2), %esi > > + jnz L(last_4x_vec) > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > + /* length may have been negative or positive by an offset of > > + VEC_SIZE * 4 depending on where this was called from. This fixes > > + that. */ > > + andl $(VEC_SIZE * 4 - 1), %esi > > testl %eax, %eax > > - jnz L(first_vec_x1_check) > > - movq %r8, %rax > > -# ifdef USE_AS_WCSLEN > > - shrq $2, %rax > > -# endif > > - VZEROUPPER_RETURN > > + jnz L(last_vec_x1_check) > > > > - .p2align 4 > > -L(first_vec_x0_check): > > + subl $VEC_SIZE, %esi > > + jb L(max) > > + > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > tzcntl %eax, %eax > > /* Check the end of data. */ > > - cmpq %rax, %rsi > > - jbe L(max) > > + cmpl %eax, %esi > > + jb L(max) > > + subq %rdx, %rdi > > + addl $(VEC_SIZE + 1), %eax > > addq %rdi, %rax > > - subq %rdx, %rax > > # ifdef USE_AS_WCSLEN > > shrq $2, %rax > > # endif > > VZEROUPPER_RETURN > > +# endif > > > > .p2align 4 > > -L(first_vec_x1_check): > > +L(last_vec_return_x0): > > tzcntl %eax, %eax > > - /* Check the end of data. */ > > - cmpq %rax, %rsi > > - jbe L(max) > > - addq $VEC_SIZE, %rax > > + subq $(VEC_SIZE * 4 - 1), %rdi > > addq %rdi, %rax > > - subq %rdx, %rax > > -# ifdef USE_AS_WCSLEN > > +# ifdef USE_AS_WCSLEN > > shrq $2, %rax > > -# endif > > +# endif > > VZEROUPPER_RETURN > > > > .p2align 4 > > -L(first_vec_x2_check): > > +L(last_vec_return_x1): > > tzcntl %eax, %eax > > - /* Check the end of data. 
*/ > > - cmpq %rax, %rsi > > - jbe L(max) > > - addq $(VEC_SIZE * 2), %rax > > + subq $(VEC_SIZE * 3 - 1), %rdi > > addq %rdi, %rax > > - subq %rdx, %rax > > -# ifdef USE_AS_WCSLEN > > +# ifdef USE_AS_WCSLEN > > shrq $2, %rax > > -# endif > > +# endif > > VZEROUPPER_RETURN > > > > +# ifdef USE_AS_STRNLEN > > .p2align 4 > > -L(first_vec_x3_check): > > +L(last_vec_x1_check): > > + > > tzcntl %eax, %eax > > /* Check the end of data. */ > > - cmpq %rax, %rsi > > - jbe L(max) > > - addq $(VEC_SIZE * 3), %rax > > + cmpl %eax, %esi > > + jb L(max) > > + subq %rdx, %rdi > > + incl %eax > > addq %rdi, %rax > > - subq %rdx, %rax > > # ifdef USE_AS_WCSLEN > > shrq $2, %rax > > # endif > > VZEROUPPER_RETURN > > > > - .p2align 4 > > L(max): > > movq %r8, %rax > > + VZEROUPPER_RETURN > > + > > + .p2align 4 > > +L(last_4x_vec): > > + /* Test first 2x VEC normally. */ > > + testl %eax, %eax > > + jnz L(last_vec_x1) > > + > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + testl %eax, %eax > > + jnz L(last_vec_x2) > > + > > + /* Normalize length. */ > > + andl $(VEC_SIZE * 4 - 1), %esi > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + testl %eax, %eax > > + jnz L(last_vec_x3) > > + > > + subl $(VEC_SIZE * 3), %esi > > + jb L(max) > > + > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + tzcntl %eax, %eax > > + /* Check the end of data. */ > > + cmpl %eax, %esi > > + jb L(max) > > + subq %rdx, %rdi > > + addl $(VEC_SIZE * 3 + 1), %eax > > + addq %rdi, %rax > > # ifdef USE_AS_WCSLEN > > shrq $2, %rax > > # endif > > VZEROUPPER_RETURN > > > > - .p2align 4 > > -L(zero): > > - xorl %eax, %eax > > - ret > > -# endif > > > > .p2align 4 > > -L(first_vec_x0): > > +L(last_vec_x1): > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > + instructions. 
*/ > > tzcntl %eax, %eax > > + subq %rdx, %rdi > > + incl %eax > > addq %rdi, %rax > > - subq %rdx, %rax > > -# ifdef USE_AS_WCSLEN > > +# ifdef USE_AS_WCSLEN > > shrq $2, %rax > > -# endif > > +# endif > > VZEROUPPER_RETURN > > > > .p2align 4 > > -L(first_vec_x1): > > +L(last_vec_x2): > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > + instructions. */ > > tzcntl %eax, %eax > > - addq $VEC_SIZE, %rax > > + subq %rdx, %rdi > > + addl $(VEC_SIZE + 1), %eax > > addq %rdi, %rax > > - subq %rdx, %rax > > -# ifdef USE_AS_WCSLEN > > +# ifdef USE_AS_WCSLEN > > shrq $2, %rax > > -# endif > > +# endif > > VZEROUPPER_RETURN > > > > .p2align 4 > > -L(first_vec_x2): > > +L(last_vec_x3): > > tzcntl %eax, %eax > > - addq $(VEC_SIZE * 2), %rax > > + subl $(VEC_SIZE * 2), %esi > > + /* Check the end of data. */ > > + cmpl %eax, %esi > > + jb L(max_end) > > + subq %rdx, %rdi > > + addl $(VEC_SIZE * 2 + 1), %eax > > addq %rdi, %rax > > - subq %rdx, %rax > > -# ifdef USE_AS_WCSLEN > > +# ifdef USE_AS_WCSLEN > > shrq $2, %rax > > -# endif > > +# endif > > + VZEROUPPER_RETURN > > +L(max_end): > > + movq %r8, %rax > > VZEROUPPER_RETURN > > +# endif > > > > + /* Cold case for crossing page with first load. */ > > .p2align 4 > > -L(4x_vec_end): > > - VPCMPEQ %ymm1, %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x0) > > - VPCMPEQ %ymm2, %ymm0, %ymm2 > > - vpmovmskb %ymm2, %eax > > +L(cross_page_boundary): > > + /* Align data to VEC_SIZE - 1. */ > > + orq $(VEC_SIZE - 1), %rdi > > + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > > + so no need to manually mod rdx. */ > > + sarxl %edx, %eax, %eax > > This is a BMI2 instruction, which is not necessary available when AVX2 > is available. This causes SIGILL on some CPU. 
I have reported that in
> https://sourceware.org/bugzilla/show_bug.cgi?id=29611

This is not a bug on master as:

commit 83c5b368226c34a2f0a5287df40fc290b2b34359
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Apr 19 10:45:07 2021 -0700

    x86-64: Require BMI2 for strchr-avx2.S

is already in tree. The issue is that the avx2 changes were backported
without H.J.'s changes.

>
> Regards
> Aurelien
>
> --
> Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-09-25 14:00 ` Noah Goldstein @ 2022-09-28 13:54 ` Sunil Pandey 2022-09-28 14:02 ` Darren Tristano ` (3 more replies) 0 siblings, 4 replies; 24+ messages in thread From: Sunil Pandey @ 2022-09-28 13:54 UTC (permalink / raw) To: Noah Goldstein, Libc-stable Mailing List, Hongjiu Lu; +Cc: GNU C Library [-- Attachment #1: Type: text/plain, Size: 26445 bytes --] Attached patch fixes BZ# 29611. I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know if there is any objection. On Sun, Sep 25, 2022 at 7:00 AM Noah Goldstein via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote: > > > No bug. This commit optimizes strlen-avx2.S. The optimizations are > > > mostly small things but they add up to roughly 10-30% performance > > > improvement for strlen. The results for strnlen are bit more > > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > > > are all passing. > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > --- > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > > > sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- > > > 2 files changed, 334 insertions(+), 214 deletions(-) > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > index c377cab629..651b32908e 100644 > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > /* Support sysdeps/x86_64/multiarch/strlen.c. 
*/ > > > IFUNC_IMPL (i, name, strlen, > > > IFUNC_IMPL_ADD (array, i, strlen, > > > - CPU_FEATURE_USABLE (AVX2), > > > + (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > __strlen_avx2) > > > IFUNC_IMPL_ADD (array, i, strlen, > > > (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2) > > > && CPU_FEATURE_USABLE (RTM)), > > > __strlen_avx2_rtm) > > > IFUNC_IMPL_ADD (array, i, strlen, > > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > /* Support sysdeps/x86_64/multiarch/strnlen.c. */ > > > IFUNC_IMPL (i, name, strnlen, > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > - CPU_FEATURE_USABLE (AVX2), > > > + (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > __strnlen_avx2) > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2) > > > && CPU_FEATURE_USABLE (RTM)), > > > __strnlen_avx2_rtm) > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > /* Support sysdeps/x86_64/multiarch/wcslen.c. */ > > > IFUNC_IMPL (i, name, wcslen, > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > - CPU_FEATURE_USABLE (AVX2), > > > + (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > __wcslen_avx2) > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2) > > > && CPU_FEATURE_USABLE (RTM)), > > > __wcslen_avx2_rtm) > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > /* Support sysdeps/x86_64/multiarch/wcsnlen.c. 
*/ > > > IFUNC_IMPL (i, name, wcsnlen, > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > - CPU_FEATURE_USABLE (AVX2), > > > + (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > __wcsnlen_avx2) > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2) > > > && CPU_FEATURE_USABLE (RTM)), > > > __wcsnlen_avx2_rtm) > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S > > > index 1caae9e6bc..bd2e6ee44a 100644 > > > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S > > > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S > > > @@ -27,9 +27,11 @@ > > > # ifdef USE_AS_WCSLEN > > > # define VPCMPEQ vpcmpeqd > > > # define VPMINU vpminud > > > +# define CHAR_SIZE 4 > > > # else > > > # define VPCMPEQ vpcmpeqb > > > # define VPMINU vpminub > > > +# define CHAR_SIZE 1 > > > # endif > > > > > > # ifndef VZEROUPPER > > > @@ -41,349 +43,459 @@ > > > # endif > > > > > > # define VEC_SIZE 32 > > > +# define PAGE_SIZE 4096 > > > > > > .section SECTION(.text),"ax",@progbits > > > ENTRY (STRLEN) > > > # ifdef USE_AS_STRNLEN > > > - /* Check for zero length. */ > > > + /* Check zero length. */ > > > test %RSI_LP, %RSI_LP > > > jz L(zero) > > > + /* Store max len in R8_LP before adjusting if using WCSLEN. */ > > > + mov %RSI_LP, %R8_LP > > > # ifdef USE_AS_WCSLEN > > > shl $2, %RSI_LP > > > # elif defined __ILP32__ > > > /* Clear the upper 32 bits. */ > > > movl %esi, %esi > > > # endif > > > - mov %RSI_LP, %R8_LP > > > # endif > > > - movl %edi, %ecx > > > + movl %edi, %eax > > > movq %rdi, %rdx > > > vpxor %xmm0, %xmm0, %xmm0 > > > - > > > + /* Clear high bits from edi. Only keeping bits relevant to page > > > + cross check. */ > > > + andl $(PAGE_SIZE - 1), %eax > > > /* Check if we may cross page boundary with one vector load. 
*/ > > > - andl $(2 * VEC_SIZE - 1), %ecx > > > - cmpl $VEC_SIZE, %ecx > > > - ja L(cros_page_boundary) > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > + ja L(cross_page_boundary) > > > > > > /* Check the first VEC_SIZE bytes. */ > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - > > > + VPCMPEQ (%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > # ifdef USE_AS_STRNLEN > > > - jnz L(first_vec_x0_check) > > > - /* Adjust length and check the end of data. */ > > > - subq $VEC_SIZE, %rsi > > > - jbe L(max) > > > -# else > > > - jnz L(first_vec_x0) > > > + /* If length < VEC_SIZE handle special. */ > > > + cmpq $VEC_SIZE, %rsi > > > + jbe L(first_vec_x0) > > > # endif > > > - > > > - /* Align data for aligned loads in the loop. */ > > > - addq $VEC_SIZE, %rdi > > > - andl $(VEC_SIZE - 1), %ecx > > > - andq $-VEC_SIZE, %rdi > > > + /* If empty continue to aligned_more. Otherwise return bit > > > + position of first match. */ > > > + testl %eax, %eax > > > + jz L(aligned_more) > > > + tzcntl %eax, %eax > > > +# ifdef USE_AS_WCSLEN > > > + shrl $2, %eax > > > +# endif > > > + VZEROUPPER_RETURN > > > > > > # ifdef USE_AS_STRNLEN > > > - /* Adjust length. */ > > > - addq %rcx, %rsi > > > +L(zero): > > > + xorl %eax, %eax > > > + ret > > > > > > - subq $(VEC_SIZE * 4), %rsi > > > - jbe L(last_4x_vec_or_less) > > > + .p2align 4 > > > +L(first_vec_x0): > > > + /* Set bit for max len so that tzcnt will return min of max len > > > + and position of first match. */ > > > + btsq %rsi, %rax > > > + tzcntl %eax, %eax > > > +# ifdef USE_AS_WCSLEN > > > + shrl $2, %eax > > > +# endif > > > + VZEROUPPER_RETURN > > > # endif > > > - jmp L(more_4x_vec) > > > > > > .p2align 4 > > > -L(cros_page_boundary): > > > - andl $(VEC_SIZE - 1), %ecx > > > - andq $-VEC_SIZE, %rdi > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - /* Remove the leading bytes. 
*/ > > > - sarl %cl, %eax > > > - testl %eax, %eax > > > - jz L(aligned_more) > > > +L(first_vec_x1): > > > tzcntl %eax, %eax > > > + /* Safe to use 32 bit instructions as these are only called for > > > + size = [1, 159]. */ > > > # ifdef USE_AS_STRNLEN > > > - /* Check the end of data. */ > > > - cmpq %rax, %rsi > > > - jbe L(max) > > > + /* Use ecx which was computed earlier to compute correct value. > > > + */ > > > + subl $(VEC_SIZE * 4 + 1), %ecx > > > + addl %ecx, %eax > > > +# else > > > + subl %edx, %edi > > > + incl %edi > > > + addl %edi, %eax > > > # endif > > > - addq %rdi, %rax > > > - addq %rcx, %rax > > > - subq %rdx, %rax > > > # ifdef USE_AS_WCSLEN > > > - shrq $2, %rax > > > + shrl $2, %eax > > > # endif > > > -L(return_vzeroupper): > > > - ZERO_UPPER_VEC_REGISTERS_RETURN > > > + VZEROUPPER_RETURN > > > > > > .p2align 4 > > > -L(aligned_more): > > > +L(first_vec_x2): > > > + tzcntl %eax, %eax > > > + /* Safe to use 32 bit instructions as these are only called for > > > + size = [1, 159]. */ > > > # ifdef USE_AS_STRNLEN > > > - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" > > > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > > > - to void possible addition overflow. */ > > > - negq %rcx > > > - addq $VEC_SIZE, %rcx > > > - > > > - /* Check the end of data. */ > > > - subq %rcx, %rsi > > > - jbe L(max) > > > + /* Use ecx which was computed earlier to compute correct value. > > > + */ > > > + subl $(VEC_SIZE * 3 + 1), %ecx > > > + addl %ecx, %eax > > > +# else > > > + subl %edx, %edi > > > + addl $(VEC_SIZE + 1), %edi > > > + addl %edi, %eax > > > # endif > > > +# ifdef USE_AS_WCSLEN > > > + shrl $2, %eax > > > +# endif > > > + VZEROUPPER_RETURN > > > > > > - addq $VEC_SIZE, %rdi > > > + .p2align 4 > > > +L(first_vec_x3): > > > + tzcntl %eax, %eax > > > + /* Safe to use 32 bit instructions as these are only called for > > > + size = [1, 159]. 
*/ > > > +# ifdef USE_AS_STRNLEN > > > + /* Use ecx which was computed earlier to compute correct value. > > > + */ > > > + subl $(VEC_SIZE * 2 + 1), %ecx > > > + addl %ecx, %eax > > > +# else > > > + subl %edx, %edi > > > + addl $(VEC_SIZE * 2 + 1), %edi > > > + addl %edi, %eax > > > +# endif > > > +# ifdef USE_AS_WCSLEN > > > + shrl $2, %eax > > > +# endif > > > + VZEROUPPER_RETURN > > > > > > + .p2align 4 > > > +L(first_vec_x4): > > > + tzcntl %eax, %eax > > > + /* Safe to use 32 bit instructions as these are only called for > > > + size = [1, 159]. */ > > > # ifdef USE_AS_STRNLEN > > > - subq $(VEC_SIZE * 4), %rsi > > > - jbe L(last_4x_vec_or_less) > > > + /* Use ecx which was computed earlier to compute correct value. > > > + */ > > > + subl $(VEC_SIZE + 1), %ecx > > > + addl %ecx, %eax > > > +# else > > > + subl %edx, %edi > > > + addl $(VEC_SIZE * 3 + 1), %edi > > > + addl %edi, %eax > > > # endif > > > +# ifdef USE_AS_WCSLEN > > > + shrl $2, %eax > > > +# endif > > > + VZEROUPPER_RETURN > > > > > > -L(more_4x_vec): > > > + .p2align 5 > > > +L(aligned_more): > > > + /* Align data to VEC_SIZE - 1. This is the same number of > > > + instructions as using andq with -VEC_SIZE but saves 4 bytes of > > > + code on the x4 check. */ > > > + orq $(VEC_SIZE - 1), %rdi > > > +L(cross_page_continue): > > > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > > > since data is only aligned to VEC_SIZE. */ > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(first_vec_x0) > > > - > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > +# ifdef USE_AS_STRNLEN > > > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because > > > + it simplies the logic in last_4x_vec_or_less. */ > > > + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx > > > + subq %rdx, %rcx > > > +# endif > > > + /* Load first VEC regardless. 
*/ > > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > > +# ifdef USE_AS_STRNLEN > > > + /* Adjust length. If near end handle specially. */ > > > + subq %rcx, %rsi > > > + jb L(last_4x_vec_or_less) > > > +# endif > > > + vpmovmskb %ymm1, %eax > > > testl %eax, %eax > > > jnz L(first_vec_x1) > > > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > testl %eax, %eax > > > jnz L(first_vec_x2) > > > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > testl %eax, %eax > > > jnz L(first_vec_x3) > > > > > > - addq $(VEC_SIZE * 4), %rdi > > > - > > > -# ifdef USE_AS_STRNLEN > > > - subq $(VEC_SIZE * 4), %rsi > > > - jbe L(last_4x_vec_or_less) > > > -# endif > > > - > > > - /* Align data to 4 * VEC_SIZE. */ > > > - movq %rdi, %rcx > > > - andl $(4 * VEC_SIZE - 1), %ecx > > > - andq $-(4 * VEC_SIZE), %rdi > > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > + testl %eax, %eax > > > + jnz L(first_vec_x4) > > > > > > + /* Align data to VEC_SIZE * 4 - 1. */ > > > # ifdef USE_AS_STRNLEN > > > - /* Adjust length. */ > > > + /* Before adjusting length check if at last VEC_SIZE * 4. */ > > > + cmpq $(VEC_SIZE * 4 - 1), %rsi > > > + jbe L(last_4x_vec_or_less_load) > > > + incq %rdi > > > + movl %edi, %ecx > > > + orq $(VEC_SIZE * 4 - 1), %rdi > > > + andl $(VEC_SIZE * 4 - 1), %ecx > > > + /* Readjust length. */ > > > addq %rcx, %rsi > > > +# else > > > + incq %rdi > > > + orq $(VEC_SIZE * 4 - 1), %rdi > > > # endif > > > - > > > + /* Compare 4 * VEC at a time forward. */ > > > .p2align 4 > > > L(loop_4x_vec): > > > - /* Compare 4 * VEC at a time forward. 
*/ > > > - vmovdqa (%rdi), %ymm1 > > > - vmovdqa VEC_SIZE(%rdi), %ymm2 > > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > > > - VPMINU %ymm1, %ymm2, %ymm5 > > > - VPMINU %ymm3, %ymm4, %ymm6 > > > - VPMINU %ymm5, %ymm6, %ymm5 > > > - > > > - VPCMPEQ %ymm5, %ymm0, %ymm5 > > > - vpmovmskb %ymm5, %eax > > > - testl %eax, %eax > > > - jnz L(4x_vec_end) > > > - > > > - addq $(VEC_SIZE * 4), %rdi > > > - > > > -# ifndef USE_AS_STRNLEN > > > - jmp L(loop_4x_vec) > > > -# else > > > +# ifdef USE_AS_STRNLEN > > > + /* Break if at end of length. */ > > > subq $(VEC_SIZE * 4), %rsi > > > - ja L(loop_4x_vec) > > > - > > > -L(last_4x_vec_or_less): > > > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > > > - addl $(VEC_SIZE * 2), %esi > > > - jle L(last_2x_vec) > > > + jb L(last_4x_vec_or_less_cmpeq) > > > +# endif > > > + /* Save some code size by microfusing VPMINU with the load. Since > > > + the matches in ymm2/ymm4 can only be returned if there where no > > > + matches in ymm1/ymm3 respectively there is no issue with overlap. 
> > > + */ > > > + vmovdqa 1(%rdi), %ymm1 > > > + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 > > > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 > > > + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 > > > + > > > + VPMINU %ymm2, %ymm4, %ymm5 > > > + VPCMPEQ %ymm5, %ymm0, %ymm5 > > > + vpmovmskb %ymm5, %ecx > > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(first_vec_x0) > > > + subq $-(VEC_SIZE * 4), %rdi > > > + testl %ecx, %ecx > > > + jz L(loop_4x_vec) > > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(first_vec_x1) > > > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > + VPCMPEQ %ymm1, %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > + subq %rdx, %rdi > > > testl %eax, %eax > > > + jnz L(last_vec_return_x0) > > > > > > - jnz L(first_vec_x2_check) > > > - subl $VEC_SIZE, %esi > > > - jle L(max) > > > - > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > + VPCMPEQ %ymm2, %ymm0, %ymm2 > > > + vpmovmskb %ymm2, %eax > > > testl %eax, %eax > > > - > > > - jnz L(first_vec_x3_check) > > > - movq %r8, %rax > > > -# ifdef USE_AS_WCSLEN > > > + jnz L(last_vec_return_x1) > > > + > > > + /* Combine last 2 VEC. */ > > > + VPCMPEQ %ymm3, %ymm0, %ymm3 > > > + vpmovmskb %ymm3, %eax > > > + /* rcx has combined result from all 4 VEC. It will only be used if > > > + the first 3 other VEC all did not contain a match. 
*/ > > > + salq $32, %rcx > > > + orq %rcx, %rax > > > + tzcntq %rax, %rax > > > + subq $(VEC_SIZE * 2 - 1), %rdi > > > + addq %rdi, %rax > > > +# ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > -# endif > > > +# endif > > > VZEROUPPER_RETURN > > > > > > + > > > +# ifdef USE_AS_STRNLEN > > > .p2align 4 > > > -L(last_2x_vec): > > > - addl $(VEC_SIZE * 2), %esi > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > +L(last_4x_vec_or_less_load): > > > + /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ > > > + subq $-(VEC_SIZE * 4), %rdi > > > +L(last_4x_vec_or_less_cmpeq): > > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > > +L(last_4x_vec_or_less): > > > > > > - jnz L(first_vec_x0_check) > > > - subl $VEC_SIZE, %esi > > > - jle L(max) > > > + vpmovmskb %ymm1, %eax > > > + /* If remaining length > VEC_SIZE * 2. This works if esi is off by > > > + VEC_SIZE * 4. */ > > > + testl $(VEC_SIZE * 2), %esi > > > + jnz L(last_4x_vec) > > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > + /* length may have been negative or positive by an offset of > > > + VEC_SIZE * 4 depending on where this was called from. This fixes > > > + that. */ > > > + andl $(VEC_SIZE * 4 - 1), %esi > > > testl %eax, %eax > > > - jnz L(first_vec_x1_check) > > > - movq %r8, %rax > > > -# ifdef USE_AS_WCSLEN > > > - shrq $2, %rax > > > -# endif > > > - VZEROUPPER_RETURN > > > + jnz L(last_vec_x1_check) > > > > > > - .p2align 4 > > > -L(first_vec_x0_check): > > > + subl $VEC_SIZE, %esi > > > + jb L(max) > > > + > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > tzcntl %eax, %eax > > > /* Check the end of data. 
*/ > > > - cmpq %rax, %rsi > > > - jbe L(max) > > > + cmpl %eax, %esi > > > + jb L(max) > > > + subq %rdx, %rdi > > > + addl $(VEC_SIZE + 1), %eax > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > # ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > # endif > > > VZEROUPPER_RETURN > > > +# endif > > > > > > .p2align 4 > > > -L(first_vec_x1_check): > > > +L(last_vec_return_x0): > > > tzcntl %eax, %eax > > > - /* Check the end of data. */ > > > - cmpq %rax, %rsi > > > - jbe L(max) > > > - addq $VEC_SIZE, %rax > > > + subq $(VEC_SIZE * 4 - 1), %rdi > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > -# ifdef USE_AS_WCSLEN > > > +# ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > -# endif > > > +# endif > > > VZEROUPPER_RETURN > > > > > > .p2align 4 > > > -L(first_vec_x2_check): > > > +L(last_vec_return_x1): > > > tzcntl %eax, %eax > > > - /* Check the end of data. */ > > > - cmpq %rax, %rsi > > > - jbe L(max) > > > - addq $(VEC_SIZE * 2), %rax > > > + subq $(VEC_SIZE * 3 - 1), %rdi > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > -# ifdef USE_AS_WCSLEN > > > +# ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > -# endif > > > +# endif > > > VZEROUPPER_RETURN > > > > > > +# ifdef USE_AS_STRNLEN > > > .p2align 4 > > > -L(first_vec_x3_check): > > > +L(last_vec_x1_check): > > > + > > > tzcntl %eax, %eax > > > /* Check the end of data. */ > > > - cmpq %rax, %rsi > > > - jbe L(max) > > > - addq $(VEC_SIZE * 3), %rax > > > + cmpl %eax, %esi > > > + jb L(max) > > > + subq %rdx, %rdi > > > + incl %eax > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > # ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > # endif > > > VZEROUPPER_RETURN > > > > > > - .p2align 4 > > > L(max): > > > movq %r8, %rax > > > + VZEROUPPER_RETURN > > > + > > > + .p2align 4 > > > +L(last_4x_vec): > > > + /* Test first 2x VEC normally. 
*/ > > > + testl %eax, %eax > > > + jnz L(last_vec_x1) > > > + > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > + testl %eax, %eax > > > + jnz L(last_vec_x2) > > > + > > > + /* Normalize length. */ > > > + andl $(VEC_SIZE * 4 - 1), %esi > > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > + testl %eax, %eax > > > + jnz L(last_vec_x3) > > > + > > > + subl $(VEC_SIZE * 3), %esi > > > + jb L(max) > > > + > > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > + tzcntl %eax, %eax > > > + /* Check the end of data. */ > > > + cmpl %eax, %esi > > > + jb L(max) > > > + subq %rdx, %rdi > > > + addl $(VEC_SIZE * 3 + 1), %eax > > > + addq %rdi, %rax > > > # ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > # endif > > > VZEROUPPER_RETURN > > > > > > - .p2align 4 > > > -L(zero): > > > - xorl %eax, %eax > > > - ret > > > -# endif > > > > > > .p2align 4 > > > -L(first_vec_x0): > > > +L(last_vec_x1): > > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > > + instructions. */ > > > tzcntl %eax, %eax > > > + subq %rdx, %rdi > > > + incl %eax > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > -# ifdef USE_AS_WCSLEN > > > +# ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > -# endif > > > +# endif > > > VZEROUPPER_RETURN > > > > > > .p2align 4 > > > -L(first_vec_x1): > > > +L(last_vec_x2): > > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > > + instructions. */ > > > tzcntl %eax, %eax > > > - addq $VEC_SIZE, %rax > > > + subq %rdx, %rdi > > > + addl $(VEC_SIZE + 1), %eax > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > -# ifdef USE_AS_WCSLEN > > > +# ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > -# endif > > > +# endif > > > VZEROUPPER_RETURN > > > > > > .p2align 4 > > > -L(first_vec_x2): > > > +L(last_vec_x3): > > > tzcntl %eax, %eax > > > - addq $(VEC_SIZE * 2), %rax > > > + subl $(VEC_SIZE * 2), %esi > > > + /* Check the end of data. 
*/ > > > + cmpl %eax, %esi > > > + jb L(max_end) > > > + subq %rdx, %rdi > > > + addl $(VEC_SIZE * 2 + 1), %eax > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > -# ifdef USE_AS_WCSLEN > > > +# ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > -# endif > > > +# endif > > > + VZEROUPPER_RETURN > > > +L(max_end): > > > + movq %r8, %rax > > > VZEROUPPER_RETURN > > > +# endif > > > > > > + /* Cold case for crossing page with first load. */ > > > .p2align 4 > > > -L(4x_vec_end): > > > - VPCMPEQ %ymm1, %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(first_vec_x0) > > > - VPCMPEQ %ymm2, %ymm0, %ymm2 > > > - vpmovmskb %ymm2, %eax > > > +L(cross_page_boundary): > > > + /* Align data to VEC_SIZE - 1. */ > > > + orq $(VEC_SIZE - 1), %rdi > > > + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > > > + so no need to manually mod rdx. */ > > > + sarxl %edx, %eax, %eax > > > > This is a BMI2 instruction, which is not necessary available when AVX2 > > is available. This causes SIGILL on some CPU. I have reported that in > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611 > > This is not a bug on master as: > > commit 83c5b368226c34a2f0a5287df40fc290b2b34359 > Author: H.J. Lu <hjl.tools@gmail.com> > Date: Mon Apr 19 10:45:07 2021 -0700 > > x86-64: Require BMI2 for strchr-avx2.S > > is already in tree. The issue is the avx2 changes where backported > w.o H.J's changes. > > > > Regards > > Aurelien > > > > -- > > Aurelien Jarno GPG: 4096R/1DDD8C9B > > aurelien@aurel32.net http://www.aurel32.net [-- Attachment #2: 2.31-2.30-2.29-2.28.patch --] [-- Type: application/octet-stream, Size: 3697 bytes --] From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 From: "H.J. 
Lu" <hjl.tools@gmail.com> Date: Mon, 19 Apr 2021 10:45:07 -0700 Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S Since strchr-avx2.S updated by commit 1f745ecc2109890886b161d4791e1406fdfc29b8 Author: noah <goldstein.w.n@gmail.com> Date: Wed Feb 3 00:38:59 2021 -0500 x86-64: Refactor and improve performance of strchr-avx2.S uses sarx: c4 e2 72 f7 c0 sarx %ecx,%eax,%eax for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and ifunc-avx2.h. (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) --- sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h index 74189b6aa5..925e5b61eb 100644 --- a/sysdeps/x86_64/multiarch/ifunc-avx2.h +++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h @@ -30,11 +30,11 @@ IFUNC_SELECTOR (void) const struct cpu_features* cpu_features = __get_cpu_features (); if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + && CPU_FEATURES_CPU_P (cpu_features, BMI2) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) { if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) - && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable) - && CPU_FEATURES_CPU_P (cpu_features, BMI2)) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)) return OPTIMIZE (evex); if (CPU_FEATURES_CPU_P (cpu_features, RTM)) diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 56b05ee741..f76326e0b2 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strchr.c. 
*/ IFUNC_IMPL (i, name, strchr, IFUNC_IMPL_ADD (array, i, strchr, - HAS_ARCH_FEATURE (AVX2_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2)), __strchr_avx2) IFUNC_IMPL_ADD (array, i, strchr, (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2) && HAS_CPU_FEATURE (RTM)), __strchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, strchr, @@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strchrnul.c. */ IFUNC_IMPL (i, name, strchrnul, IFUNC_IMPL_ADD (array, i, strchrnul, - HAS_ARCH_FEATURE (AVX2_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2)), __strchrnul_avx2) IFUNC_IMPL_ADD (array, i, strchrnul, (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2) && HAS_CPU_FEATURE (RTM)), __strchrnul_avx2_rtm) IFUNC_IMPL_ADD (array, i, strchrnul, @@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wcschr.c. */ IFUNC_IMPL (i, name, wcschr, IFUNC_IMPL_ADD (array, i, wcschr, - HAS_ARCH_FEATURE (AVX2_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2)), __wcschr_avx2) IFUNC_IMPL_ADD (array, i, wcschr, (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2) && HAS_CPU_FEATURE (RTM)), __wcschr_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcschr, -- 2.36.1 [-- Attachment #3: 2.32.patch --] [-- Type: application/octet-stream, Size: 3661 bytes --] From c06b2890275868d7b8b4eeb5d57cb28288170899 Mon Sep 17 00:00:00 2001 From: "H.J. 
Lu" <hjl.tools@gmail.com> Date: Mon, 19 Apr 2021 10:45:07 -0700 Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S Since strchr-avx2.S updated by commit 1f745ecc2109890886b161d4791e1406fdfc29b8 Author: noah <goldstein.w.n@gmail.com> Date: Wed Feb 3 00:38:59 2021 -0500 x86-64: Refactor and improve performance of strchr-avx2.S uses sarx: c4 e2 72 f7 c0 sarx %ecx,%eax,%eax for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and ifunc-avx2.h. (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) --- sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h index f450c786f0..0d9d837488 100644 --- a/sysdeps/x86_64/multiarch/ifunc-avx2.h +++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h @@ -30,11 +30,11 @@ IFUNC_SELECTOR (void) const struct cpu_features* cpu_features = __get_cpu_features (); if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) { if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) - && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) - && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) return OPTIMIZE (evex); if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 920e64241e..d4bbf61030 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strchr.c. 
*/ IFUNC_IMPL (i, name, strchr, IFUNC_IMPL_ADD (array, i, strchr, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __strchr_avx2) IFUNC_IMPL_ADD (array, i, strchr, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (RTM)), __strchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, strchr, @@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strchrnul.c. */ IFUNC_IMPL (i, name, strchrnul, IFUNC_IMPL_ADD (array, i, strchrnul, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __strchrnul_avx2) IFUNC_IMPL_ADD (array, i, strchrnul, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (RTM)), __strchrnul_avx2_rtm) IFUNC_IMPL_ADD (array, i, strchrnul, @@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wcschr.c. */ IFUNC_IMPL (i, name, wcschr, IFUNC_IMPL_ADD (array, i, wcschr, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __wcschr_avx2) IFUNC_IMPL_ADD (array, i, wcschr, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (RTM)), __wcschr_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcschr, -- 2.36.1 ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-09-28 13:54 ` Sunil Pandey @ 2022-09-28 14:02 ` Darren Tristano 2022-09-28 14:42 ` Noah Goldstein ` (2 subsequent siblings) 3 siblings, 0 replies; 24+ messages in thread From: Darren Tristano @ 2022-09-28 14:02 UTC (permalink / raw) To: Noah Goldstein, Libc-stable Mailing List, Hongjiu Lu, Sunil Pandey Cc: GNU C Library [-- Attachment #1: Type: text/plain, Size: 27733 bytes --] Please Remove me from this string. I should not be on it. ________________________________ From: Libc-stable <libc-stable-bounces+darren=darrentristano.com@sourceware.org> on behalf of Sunil Pandey via Libc-stable <libc-stable@sourceware.org> Sent: Wednesday, September 28, 2022 8:54 AM To: Noah Goldstein <goldstein.w.n@gmail.com>; Libc-stable Mailing List <libc-stable@sourceware.org>; Hongjiu Lu <hjl.tools@gmail.com> Cc: GNU C Library <libc-alpha@sourceware.org> Subject: Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S Attached patch fixes BZ# 29611. I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know if there is any objection. On Sun, Sep 25, 2022 at 7:00 AM Noah Goldstein via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote: > > > No bug. This commit optimizes strlen-avx2.S. The optimizations are > > > mostly small things but they add up to roughly 10-30% performance > > > improvement for strlen. The results for strnlen are bit more > > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > > > are all passing. 
> > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > --- > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > > > sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- > > > 2 files changed, 334 insertions(+), 214 deletions(-) > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > index c377cab629..651b32908e 100644 > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > > > IFUNC_IMPL (i, name, strlen, > > > IFUNC_IMPL_ADD (array, i, strlen, > > > - CPU_FEATURE_USABLE (AVX2), > > > + (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > __strlen_avx2) > > > IFUNC_IMPL_ADD (array, i, strlen, > > > (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2) > > > && CPU_FEATURE_USABLE (RTM)), > > > __strlen_avx2_rtm) > > > IFUNC_IMPL_ADD (array, i, strlen, > > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > /* Support sysdeps/x86_64/multiarch/strnlen.c. */ > > > IFUNC_IMPL (i, name, strnlen, > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > - CPU_FEATURE_USABLE (AVX2), > > > + (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > __strnlen_avx2) > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2) > > > && CPU_FEATURE_USABLE (RTM)), > > > __strnlen_avx2_rtm) > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > /* Support sysdeps/x86_64/multiarch/wcslen.c. 
*/ > > > IFUNC_IMPL (i, name, wcslen, > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > - CPU_FEATURE_USABLE (AVX2), > > > + (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > __wcslen_avx2) > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2) > > > && CPU_FEATURE_USABLE (RTM)), > > > __wcslen_avx2_rtm) > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ > > > IFUNC_IMPL (i, name, wcsnlen, > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > - CPU_FEATURE_USABLE (AVX2), > > > + (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > __wcsnlen_avx2) > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (BMI2) > > > && CPU_FEATURE_USABLE (RTM)), > > > __wcsnlen_avx2_rtm) > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S > > > index 1caae9e6bc..bd2e6ee44a 100644 > > > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S > > > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S > > > @@ -27,9 +27,11 @@ > > > # ifdef USE_AS_WCSLEN > > > # define VPCMPEQ vpcmpeqd > > > # define VPMINU vpminud > > > +# define CHAR_SIZE 4 > > > # else > > > # define VPCMPEQ vpcmpeqb > > > # define VPMINU vpminub > > > +# define CHAR_SIZE 1 > > > # endif > > > > > > # ifndef VZEROUPPER > > > @@ -41,349 +43,459 @@ > > > # endif > > > > > > # define VEC_SIZE 32 > > > +# define PAGE_SIZE 4096 > > > > > > .section SECTION(.text),"ax",@progbits > > > ENTRY (STRLEN) > > > # ifdef USE_AS_STRNLEN > > > - /* Check for zero length. */ > > > + /* Check zero length. */ > > > test %RSI_LP, %RSI_LP > > > jz L(zero) > > > + /* Store max len in R8_LP before adjusting if using WCSLEN. 
*/ > > > + mov %RSI_LP, %R8_LP > > > # ifdef USE_AS_WCSLEN > > > shl $2, %RSI_LP > > > # elif defined __ILP32__ > > > /* Clear the upper 32 bits. */ > > > movl %esi, %esi > > > # endif > > > - mov %RSI_LP, %R8_LP > > > # endif > > > - movl %edi, %ecx > > > + movl %edi, %eax > > > movq %rdi, %rdx > > > vpxor %xmm0, %xmm0, %xmm0 > > > - > > > + /* Clear high bits from edi. Only keeping bits relevant to page > > > + cross check. */ > > > + andl $(PAGE_SIZE - 1), %eax > > > /* Check if we may cross page boundary with one vector load. */ > > > - andl $(2 * VEC_SIZE - 1), %ecx > > > - cmpl $VEC_SIZE, %ecx > > > - ja L(cros_page_boundary) > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > + ja L(cross_page_boundary) > > > > > > /* Check the first VEC_SIZE bytes. */ > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - > > > + VPCMPEQ (%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > # ifdef USE_AS_STRNLEN > > > - jnz L(first_vec_x0_check) > > > - /* Adjust length and check the end of data. */ > > > - subq $VEC_SIZE, %rsi > > > - jbe L(max) > > > -# else > > > - jnz L(first_vec_x0) > > > + /* If length < VEC_SIZE handle special. */ > > > + cmpq $VEC_SIZE, %rsi > > > + jbe L(first_vec_x0) > > > # endif > > > - > > > - /* Align data for aligned loads in the loop. */ > > > - addq $VEC_SIZE, %rdi > > > - andl $(VEC_SIZE - 1), %ecx > > > - andq $-VEC_SIZE, %rdi > > > + /* If empty continue to aligned_more. Otherwise return bit > > > + position of first match. */ > > > + testl %eax, %eax > > > + jz L(aligned_more) > > > + tzcntl %eax, %eax > > > +# ifdef USE_AS_WCSLEN > > > + shrl $2, %eax > > > +# endif > > > + VZEROUPPER_RETURN > > > > > > # ifdef USE_AS_STRNLEN > > > - /* Adjust length. 
*/ > > > - addq %rcx, %rsi > > > +L(zero): > > > + xorl %eax, %eax > > > + ret > > > > > > - subq $(VEC_SIZE * 4), %rsi > > > - jbe L(last_4x_vec_or_less) > > > + .p2align 4 > > > +L(first_vec_x0): > > > + /* Set bit for max len so that tzcnt will return min of max len > > > + and position of first match. */ > > > + btsq %rsi, %rax > > > + tzcntl %eax, %eax > > > +# ifdef USE_AS_WCSLEN > > > + shrl $2, %eax > > > +# endif > > > + VZEROUPPER_RETURN > > > # endif > > > - jmp L(more_4x_vec) > > > > > > .p2align 4 > > > -L(cros_page_boundary): > > > - andl $(VEC_SIZE - 1), %ecx > > > - andq $-VEC_SIZE, %rdi > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - /* Remove the leading bytes. */ > > > - sarl %cl, %eax > > > - testl %eax, %eax > > > - jz L(aligned_more) > > > +L(first_vec_x1): > > > tzcntl %eax, %eax > > > + /* Safe to use 32 bit instructions as these are only called for > > > + size = [1, 159]. */ > > > # ifdef USE_AS_STRNLEN > > > - /* Check the end of data. */ > > > - cmpq %rax, %rsi > > > - jbe L(max) > > > + /* Use ecx which was computed earlier to compute correct value. > > > + */ > > > + subl $(VEC_SIZE * 4 + 1), %ecx > > > + addl %ecx, %eax > > > +# else > > > + subl %edx, %edi > > > + incl %edi > > > + addl %edi, %eax > > > # endif > > > - addq %rdi, %rax > > > - addq %rcx, %rax > > > - subq %rdx, %rax > > > # ifdef USE_AS_WCSLEN > > > - shrq $2, %rax > > > + shrl $2, %eax > > > # endif > > > -L(return_vzeroupper): > > > - ZERO_UPPER_VEC_REGISTERS_RETURN > > > + VZEROUPPER_RETURN > > > > > > .p2align 4 > > > -L(aligned_more): > > > +L(first_vec_x2): > > > + tzcntl %eax, %eax > > > + /* Safe to use 32 bit instructions as these are only called for > > > + size = [1, 159]. */ > > > # ifdef USE_AS_STRNLEN > > > - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" > > > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > > > - to void possible addition overflow. 
*/ > > > - negq %rcx > > > - addq $VEC_SIZE, %rcx > > > - > > > - /* Check the end of data. */ > > > - subq %rcx, %rsi > > > - jbe L(max) > > > + /* Use ecx which was computed earlier to compute correct value. > > > + */ > > > + subl $(VEC_SIZE * 3 + 1), %ecx > > > + addl %ecx, %eax > > > +# else > > > + subl %edx, %edi > > > + addl $(VEC_SIZE + 1), %edi > > > + addl %edi, %eax > > > # endif > > > +# ifdef USE_AS_WCSLEN > > > + shrl $2, %eax > > > +# endif > > > + VZEROUPPER_RETURN > > > > > > - addq $VEC_SIZE, %rdi > > > + .p2align 4 > > > +L(first_vec_x3): > > > + tzcntl %eax, %eax > > > + /* Safe to use 32 bit instructions as these are only called for > > > + size = [1, 159]. */ > > > +# ifdef USE_AS_STRNLEN > > > + /* Use ecx which was computed earlier to compute correct value. > > > + */ > > > + subl $(VEC_SIZE * 2 + 1), %ecx > > > + addl %ecx, %eax > > > +# else > > > + subl %edx, %edi > > > + addl $(VEC_SIZE * 2 + 1), %edi > > > + addl %edi, %eax > > > +# endif > > > +# ifdef USE_AS_WCSLEN > > > + shrl $2, %eax > > > +# endif > > > + VZEROUPPER_RETURN > > > > > > + .p2align 4 > > > +L(first_vec_x4): > > > + tzcntl %eax, %eax > > > + /* Safe to use 32 bit instructions as these are only called for > > > + size = [1, 159]. */ > > > # ifdef USE_AS_STRNLEN > > > - subq $(VEC_SIZE * 4), %rsi > > > - jbe L(last_4x_vec_or_less) > > > + /* Use ecx which was computed earlier to compute correct value. > > > + */ > > > + subl $(VEC_SIZE + 1), %ecx > > > + addl %ecx, %eax > > > +# else > > > + subl %edx, %edi > > > + addl $(VEC_SIZE * 3 + 1), %edi > > > + addl %edi, %eax > > > # endif > > > +# ifdef USE_AS_WCSLEN > > > + shrl $2, %eax > > > +# endif > > > + VZEROUPPER_RETURN > > > > > > -L(more_4x_vec): > > > + .p2align 5 > > > +L(aligned_more): > > > + /* Align data to VEC_SIZE - 1. This is the same number of > > > + instructions as using andq with -VEC_SIZE but saves 4 bytes of > > > + code on the x4 check. 
*/ > > > + orq $(VEC_SIZE - 1), %rdi > > > +L(cross_page_continue): > > > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > > > since data is only aligned to VEC_SIZE. */ > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(first_vec_x0) > > > - > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > +# ifdef USE_AS_STRNLEN > > > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because > > > + it simplies the logic in last_4x_vec_or_less. */ > > > + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx > > > + subq %rdx, %rcx > > > +# endif > > > + /* Load first VEC regardless. */ > > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > > +# ifdef USE_AS_STRNLEN > > > + /* Adjust length. If near end handle specially. */ > > > + subq %rcx, %rsi > > > + jb L(last_4x_vec_or_less) > > > +# endif > > > + vpmovmskb %ymm1, %eax > > > testl %eax, %eax > > > jnz L(first_vec_x1) > > > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > testl %eax, %eax > > > jnz L(first_vec_x2) > > > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > testl %eax, %eax > > > jnz L(first_vec_x3) > > > > > > - addq $(VEC_SIZE * 4), %rdi > > > - > > > -# ifdef USE_AS_STRNLEN > > > - subq $(VEC_SIZE * 4), %rsi > > > - jbe L(last_4x_vec_or_less) > > > -# endif > > > - > > > - /* Align data to 4 * VEC_SIZE. */ > > > - movq %rdi, %rcx > > > - andl $(4 * VEC_SIZE - 1), %ecx > > > - andq $-(4 * VEC_SIZE), %rdi > > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > + testl %eax, %eax > > > + jnz L(first_vec_x4) > > > > > > + /* Align data to VEC_SIZE * 4 - 1. */ > > > # ifdef USE_AS_STRNLEN > > > - /* Adjust length. 
*/ > > > + /* Before adjusting length check if at last VEC_SIZE * 4. */ > > > + cmpq $(VEC_SIZE * 4 - 1), %rsi > > > + jbe L(last_4x_vec_or_less_load) > > > + incq %rdi > > > + movl %edi, %ecx > > > + orq $(VEC_SIZE * 4 - 1), %rdi > > > + andl $(VEC_SIZE * 4 - 1), %ecx > > > + /* Readjust length. */ > > > addq %rcx, %rsi > > > +# else > > > + incq %rdi > > > + orq $(VEC_SIZE * 4 - 1), %rdi > > > # endif > > > - > > > + /* Compare 4 * VEC at a time forward. */ > > > .p2align 4 > > > L(loop_4x_vec): > > > - /* Compare 4 * VEC at a time forward. */ > > > - vmovdqa (%rdi), %ymm1 > > > - vmovdqa VEC_SIZE(%rdi), %ymm2 > > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > > > - VPMINU %ymm1, %ymm2, %ymm5 > > > - VPMINU %ymm3, %ymm4, %ymm6 > > > - VPMINU %ymm5, %ymm6, %ymm5 > > > - > > > - VPCMPEQ %ymm5, %ymm0, %ymm5 > > > - vpmovmskb %ymm5, %eax > > > - testl %eax, %eax > > > - jnz L(4x_vec_end) > > > - > > > - addq $(VEC_SIZE * 4), %rdi > > > - > > > -# ifndef USE_AS_STRNLEN > > > - jmp L(loop_4x_vec) > > > -# else > > > +# ifdef USE_AS_STRNLEN > > > + /* Break if at end of length. */ > > > subq $(VEC_SIZE * 4), %rsi > > > - ja L(loop_4x_vec) > > > - > > > -L(last_4x_vec_or_less): > > > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > > > - addl $(VEC_SIZE * 2), %esi > > > - jle L(last_2x_vec) > > > + jb L(last_4x_vec_or_less_cmpeq) > > > +# endif > > > + /* Save some code size by microfusing VPMINU with the load. Since > > > + the matches in ymm2/ymm4 can only be returned if there where no > > > + matches in ymm1/ymm3 respectively there is no issue with overlap. 
> > > + */ > > > + vmovdqa 1(%rdi), %ymm1 > > > + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 > > > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 > > > + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 > > > + > > > + VPMINU %ymm2, %ymm4, %ymm5 > > > + VPCMPEQ %ymm5, %ymm0, %ymm5 > > > + vpmovmskb %ymm5, %ecx > > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(first_vec_x0) > > > + subq $-(VEC_SIZE * 4), %rdi > > > + testl %ecx, %ecx > > > + jz L(loop_4x_vec) > > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(first_vec_x1) > > > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > + VPCMPEQ %ymm1, %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > + subq %rdx, %rdi > > > testl %eax, %eax > > > + jnz L(last_vec_return_x0) > > > > > > - jnz L(first_vec_x2_check) > > > - subl $VEC_SIZE, %esi > > > - jle L(max) > > > - > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > + VPCMPEQ %ymm2, %ymm0, %ymm2 > > > + vpmovmskb %ymm2, %eax > > > testl %eax, %eax > > > - > > > - jnz L(first_vec_x3_check) > > > - movq %r8, %rax > > > -# ifdef USE_AS_WCSLEN > > > + jnz L(last_vec_return_x1) > > > + > > > + /* Combine last 2 VEC. */ > > > + VPCMPEQ %ymm3, %ymm0, %ymm3 > > > + vpmovmskb %ymm3, %eax > > > + /* rcx has combined result from all 4 VEC. It will only be used if > > > + the first 3 other VEC all did not contain a match. 
*/ > > > + salq $32, %rcx > > > + orq %rcx, %rax > > > + tzcntq %rax, %rax > > > + subq $(VEC_SIZE * 2 - 1), %rdi > > > + addq %rdi, %rax > > > +# ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > -# endif > > > +# endif > > > VZEROUPPER_RETURN > > > > > > + > > > +# ifdef USE_AS_STRNLEN > > > .p2align 4 > > > -L(last_2x_vec): > > > - addl $(VEC_SIZE * 2), %esi > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > +L(last_4x_vec_or_less_load): > > > + /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ > > > + subq $-(VEC_SIZE * 4), %rdi > > > +L(last_4x_vec_or_less_cmpeq): > > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > > +L(last_4x_vec_or_less): > > > > > > - jnz L(first_vec_x0_check) > > > - subl $VEC_SIZE, %esi > > > - jle L(max) > > > + vpmovmskb %ymm1, %eax > > > + /* If remaining length > VEC_SIZE * 2. This works if esi is off by > > > + VEC_SIZE * 4. */ > > > + testl $(VEC_SIZE * 2), %esi > > > + jnz L(last_4x_vec) > > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > + /* length may have been negative or positive by an offset of > > > + VEC_SIZE * 4 depending on where this was called from. This fixes > > > + that. */ > > > + andl $(VEC_SIZE * 4 - 1), %esi > > > testl %eax, %eax > > > - jnz L(first_vec_x1_check) > > > - movq %r8, %rax > > > -# ifdef USE_AS_WCSLEN > > > - shrq $2, %rax > > > -# endif > > > - VZEROUPPER_RETURN > > > + jnz L(last_vec_x1_check) > > > > > > - .p2align 4 > > > -L(first_vec_x0_check): > > > + subl $VEC_SIZE, %esi > > > + jb L(max) > > > + > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > tzcntl %eax, %eax > > > /* Check the end of data. 
*/ > > > - cmpq %rax, %rsi > > > - jbe L(max) > > > + cmpl %eax, %esi > > > + jb L(max) > > > + subq %rdx, %rdi > > > + addl $(VEC_SIZE + 1), %eax > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > # ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > # endif > > > VZEROUPPER_RETURN > > > +# endif > > > > > > .p2align 4 > > > -L(first_vec_x1_check): > > > +L(last_vec_return_x0): > > > tzcntl %eax, %eax > > > - /* Check the end of data. */ > > > - cmpq %rax, %rsi > > > - jbe L(max) > > > - addq $VEC_SIZE, %rax > > > + subq $(VEC_SIZE * 4 - 1), %rdi > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > -# ifdef USE_AS_WCSLEN > > > +# ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > -# endif > > > +# endif > > > VZEROUPPER_RETURN > > > > > > .p2align 4 > > > -L(first_vec_x2_check): > > > +L(last_vec_return_x1): > > > tzcntl %eax, %eax > > > - /* Check the end of data. */ > > > - cmpq %rax, %rsi > > > - jbe L(max) > > > - addq $(VEC_SIZE * 2), %rax > > > + subq $(VEC_SIZE * 3 - 1), %rdi > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > -# ifdef USE_AS_WCSLEN > > > +# ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > -# endif > > > +# endif > > > VZEROUPPER_RETURN > > > > > > +# ifdef USE_AS_STRNLEN > > > .p2align 4 > > > -L(first_vec_x3_check): > > > +L(last_vec_x1_check): > > > + > > > tzcntl %eax, %eax > > > /* Check the end of data. */ > > > - cmpq %rax, %rsi > > > - jbe L(max) > > > - addq $(VEC_SIZE * 3), %rax > > > + cmpl %eax, %esi > > > + jb L(max) > > > + subq %rdx, %rdi > > > + incl %eax > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > # ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > # endif > > > VZEROUPPER_RETURN > > > > > > - .p2align 4 > > > L(max): > > > movq %r8, %rax > > > + VZEROUPPER_RETURN > > > + > > > + .p2align 4 > > > +L(last_4x_vec): > > > + /* Test first 2x VEC normally. 
*/ > > > + testl %eax, %eax > > > + jnz L(last_vec_x1) > > > + > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > + testl %eax, %eax > > > + jnz L(last_vec_x2) > > > + > > > + /* Normalize length. */ > > > + andl $(VEC_SIZE * 4 - 1), %esi > > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > + testl %eax, %eax > > > + jnz L(last_vec_x3) > > > + > > > + subl $(VEC_SIZE * 3), %esi > > > + jb L(max) > > > + > > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > + tzcntl %eax, %eax > > > + /* Check the end of data. */ > > > + cmpl %eax, %esi > > > + jb L(max) > > > + subq %rdx, %rdi > > > + addl $(VEC_SIZE * 3 + 1), %eax > > > + addq %rdi, %rax > > > # ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > # endif > > > VZEROUPPER_RETURN > > > > > > - .p2align 4 > > > -L(zero): > > > - xorl %eax, %eax > > > - ret > > > -# endif > > > > > > .p2align 4 > > > -L(first_vec_x0): > > > +L(last_vec_x1): > > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > > + instructions. */ > > > tzcntl %eax, %eax > > > + subq %rdx, %rdi > > > + incl %eax > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > -# ifdef USE_AS_WCSLEN > > > +# ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > -# endif > > > +# endif > > > VZEROUPPER_RETURN > > > > > > .p2align 4 > > > -L(first_vec_x1): > > > +L(last_vec_x2): > > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > > + instructions. */ > > > tzcntl %eax, %eax > > > - addq $VEC_SIZE, %rax > > > + subq %rdx, %rdi > > > + addl $(VEC_SIZE + 1), %eax > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > -# ifdef USE_AS_WCSLEN > > > +# ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > -# endif > > > +# endif > > > VZEROUPPER_RETURN > > > > > > .p2align 4 > > > -L(first_vec_x2): > > > +L(last_vec_x3): > > > tzcntl %eax, %eax > > > - addq $(VEC_SIZE * 2), %rax > > > + subl $(VEC_SIZE * 2), %esi > > > + /* Check the end of data. 
*/ > > > + cmpl %eax, %esi > > > + jb L(max_end) > > > + subq %rdx, %rdi > > > + addl $(VEC_SIZE * 2 + 1), %eax > > > addq %rdi, %rax > > > - subq %rdx, %rax > > > -# ifdef USE_AS_WCSLEN > > > +# ifdef USE_AS_WCSLEN > > > shrq $2, %rax > > > -# endif > > > +# endif > > > + VZEROUPPER_RETURN > > > +L(max_end): > > > + movq %r8, %rax > > > VZEROUPPER_RETURN > > > +# endif > > > > > > + /* Cold case for crossing page with first load. */ > > > .p2align 4 > > > -L(4x_vec_end): > > > - VPCMPEQ %ymm1, %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(first_vec_x0) > > > - VPCMPEQ %ymm2, %ymm0, %ymm2 > > > - vpmovmskb %ymm2, %eax > > > +L(cross_page_boundary): > > > + /* Align data to VEC_SIZE - 1. */ > > > + orq $(VEC_SIZE - 1), %rdi > > > + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > > > + so no need to manually mod rdx. */ > > > + sarxl %edx, %eax, %eax > > > > This is a BMI2 instruction, which is not necessary available when AVX2 > > is available. This causes SIGILL on some CPU. I have reported that in > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611 > > This is not a bug on master as: > > commit 83c5b368226c34a2f0a5287df40fc290b2b34359 > Author: H.J. Lu <hjl.tools@gmail.com> > Date: Mon Apr 19 10:45:07 2021 -0700 > > x86-64: Require BMI2 for strchr-avx2.S > > is already in tree. The issue is the avx2 changes where backported > w.o H.J's changes. > > > > Regards > > Aurelien > > > > -- > > Aurelien Jarno GPG: 4096R/1DDD8C9B > > aurelien@aurel32.net http://www.aurel32.net ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-09-28 13:54 ` Sunil Pandey 2022-09-28 14:02 ` Darren Tristano @ 2022-09-28 14:42 ` Noah Goldstein 2022-09-28 14:54 ` Sunil Pandey 2022-09-28 18:23 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S H.J. Lu 2022-10-04 21:19 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Aurelien Jarno 3 siblings, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2022-09-28 14:42 UTC (permalink / raw) To: Sunil Pandey; +Cc: Libc-stable Mailing List, Hongjiu Lu, GNU C Library On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > Attached patch fixes BZ# 29611. > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > if there is any objection. The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S Can you post these as separate emails with the patches embedded instead of attached? > > > On Sun, Sep 25, 2022 at 7:00 AM Noah Goldstein via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > > > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote: > > > > No bug. This commit optimizes strlen-avx2.S. The optimizations are > > > > mostly small things but they add up to roughly 10-30% performance > > > > improvement for strlen. The results for strnlen are bit more > > > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > > > > are all passing. 
> > > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > > --- > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > > > > sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- > > > > 2 files changed, 334 insertions(+), 214 deletions(-) > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > index c377cab629..651b32908e 100644 > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > > > > IFUNC_IMPL (i, name, strlen, > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > - CPU_FEATURE_USABLE (AVX2), > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > __strlen_avx2) > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > (CPU_FEATURE_USABLE (AVX2) > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > && CPU_FEATURE_USABLE (RTM)), > > > > __strlen_avx2_rtm) > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > /* Support sysdeps/x86_64/multiarch/strnlen.c. */ > > > > IFUNC_IMPL (i, name, strnlen, > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > - CPU_FEATURE_USABLE (AVX2), > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > __strnlen_avx2) > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > (CPU_FEATURE_USABLE (AVX2) > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > && CPU_FEATURE_USABLE (RTM)), > > > > __strnlen_avx2_rtm) > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > /* Support sysdeps/x86_64/multiarch/wcslen.c. 
*/ > > > > IFUNC_IMPL (i, name, wcslen, > > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > > - CPU_FEATURE_USABLE (AVX2), > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > __wcslen_avx2) > > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > > (CPU_FEATURE_USABLE (AVX2) > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > && CPU_FEATURE_USABLE (RTM)), > > > > __wcslen_avx2_rtm) > > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ > > > > IFUNC_IMPL (i, name, wcsnlen, > > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > > - CPU_FEATURE_USABLE (AVX2), > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > __wcsnlen_avx2) > > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > > (CPU_FEATURE_USABLE (AVX2) > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > && CPU_FEATURE_USABLE (RTM)), > > > > __wcsnlen_avx2_rtm) > > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S > > > > index 1caae9e6bc..bd2e6ee44a 100644 > > > > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S > > > > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S > > > > @@ -27,9 +27,11 @@ > > > > # ifdef USE_AS_WCSLEN > > > > # define VPCMPEQ vpcmpeqd > > > > # define VPMINU vpminud > > > > +# define CHAR_SIZE 4 > > > > # else > > > > # define VPCMPEQ vpcmpeqb > > > > # define VPMINU vpminub > > > > +# define CHAR_SIZE 1 > > > > # endif > > > > > > > > # ifndef VZEROUPPER > > > > @@ -41,349 +43,459 @@ > > > > # endif > > > > > > > > # define VEC_SIZE 32 > > > > +# define PAGE_SIZE 4096 > > > > > > > > .section SECTION(.text),"ax",@progbits > > > > ENTRY (STRLEN) > > > > # ifdef USE_AS_STRNLEN > > > > - /* Check for zero length. */ > > > > + /* Check zero length. 
*/ > > > > test %RSI_LP, %RSI_LP > > > > jz L(zero) > > > > + /* Store max len in R8_LP before adjusting if using WCSLEN. */ > > > > + mov %RSI_LP, %R8_LP > > > > # ifdef USE_AS_WCSLEN > > > > shl $2, %RSI_LP > > > > # elif defined __ILP32__ > > > > /* Clear the upper 32 bits. */ > > > > movl %esi, %esi > > > > # endif > > > > - mov %RSI_LP, %R8_LP > > > > # endif > > > > - movl %edi, %ecx > > > > + movl %edi, %eax > > > > movq %rdi, %rdx > > > > vpxor %xmm0, %xmm0, %xmm0 > > > > - > > > > + /* Clear high bits from edi. Only keeping bits relevant to page > > > > + cross check. */ > > > > + andl $(PAGE_SIZE - 1), %eax > > > > /* Check if we may cross page boundary with one vector load. */ > > > > - andl $(2 * VEC_SIZE - 1), %ecx > > > > - cmpl $VEC_SIZE, %ecx > > > > - ja L(cros_page_boundary) > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > + ja L(cross_page_boundary) > > > > > > > > /* Check the first VEC_SIZE bytes. */ > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > - testl %eax, %eax > > > > - > > > > + VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > # ifdef USE_AS_STRNLEN > > > > - jnz L(first_vec_x0_check) > > > > - /* Adjust length and check the end of data. */ > > > > - subq $VEC_SIZE, %rsi > > > > - jbe L(max) > > > > -# else > > > > - jnz L(first_vec_x0) > > > > + /* If length < VEC_SIZE handle special. */ > > > > + cmpq $VEC_SIZE, %rsi > > > > + jbe L(first_vec_x0) > > > > # endif > > > > - > > > > - /* Align data for aligned loads in the loop. */ > > > > - addq $VEC_SIZE, %rdi > > > > - andl $(VEC_SIZE - 1), %ecx > > > > - andq $-VEC_SIZE, %rdi > > > > + /* If empty continue to aligned_more. Otherwise return bit > > > > + position of first match. */ > > > > + testl %eax, %eax > > > > + jz L(aligned_more) > > > > + tzcntl %eax, %eax > > > > +# ifdef USE_AS_WCSLEN > > > > + shrl $2, %eax > > > > +# endif > > > > + VZEROUPPER_RETURN > > > > > > > > # ifdef USE_AS_STRNLEN > > > > - /* Adjust length. 
*/ > > > > - addq %rcx, %rsi > > > > +L(zero): > > > > + xorl %eax, %eax > > > > + ret > > > > > > > > - subq $(VEC_SIZE * 4), %rsi > > > > - jbe L(last_4x_vec_or_less) > > > > + .p2align 4 > > > > +L(first_vec_x0): > > > > + /* Set bit for max len so that tzcnt will return min of max len > > > > + and position of first match. */ > > > > + btsq %rsi, %rax > > > > + tzcntl %eax, %eax > > > > +# ifdef USE_AS_WCSLEN > > > > + shrl $2, %eax > > > > +# endif > > > > + VZEROUPPER_RETURN > > > > # endif > > > > - jmp L(more_4x_vec) > > > > > > > > .p2align 4 > > > > -L(cros_page_boundary): > > > > - andl $(VEC_SIZE - 1), %ecx > > > > - andq $-VEC_SIZE, %rdi > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > - /* Remove the leading bytes. */ > > > > - sarl %cl, %eax > > > > - testl %eax, %eax > > > > - jz L(aligned_more) > > > > +L(first_vec_x1): > > > > tzcntl %eax, %eax > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > + size = [1, 159]. */ > > > > # ifdef USE_AS_STRNLEN > > > > - /* Check the end of data. */ > > > > - cmpq %rax, %rsi > > > > - jbe L(max) > > > > + /* Use ecx which was computed earlier to compute correct value. > > > > + */ > > > > + subl $(VEC_SIZE * 4 + 1), %ecx > > > > + addl %ecx, %eax > > > > +# else > > > > + subl %edx, %edi > > > > + incl %edi > > > > + addl %edi, %eax > > > > # endif > > > > - addq %rdi, %rax > > > > - addq %rcx, %rax > > > > - subq %rdx, %rax > > > > # ifdef USE_AS_WCSLEN > > > > - shrq $2, %rax > > > > + shrl $2, %eax > > > > # endif > > > > -L(return_vzeroupper): > > > > - ZERO_UPPER_VEC_REGISTERS_RETURN > > > > + VZEROUPPER_RETURN > > > > > > > > .p2align 4 > > > > -L(aligned_more): > > > > +L(first_vec_x2): > > > > + tzcntl %eax, %eax > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > + size = [1, 159]. */ > > > > # ifdef USE_AS_STRNLEN > > > > - /* "rcx" is less than VEC_SIZE. 
Calculate "rdx + rcx - VEC_SIZE" > > > > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > > > > - to void possible addition overflow. */ > > > > - negq %rcx > > > > - addq $VEC_SIZE, %rcx > > > > - > > > > - /* Check the end of data. */ > > > > - subq %rcx, %rsi > > > > - jbe L(max) > > > > + /* Use ecx which was computed earlier to compute correct value. > > > > + */ > > > > + subl $(VEC_SIZE * 3 + 1), %ecx > > > > + addl %ecx, %eax > > > > +# else > > > > + subl %edx, %edi > > > > + addl $(VEC_SIZE + 1), %edi > > > > + addl %edi, %eax > > > > # endif > > > > +# ifdef USE_AS_WCSLEN > > > > + shrl $2, %eax > > > > +# endif > > > > + VZEROUPPER_RETURN > > > > > > > > - addq $VEC_SIZE, %rdi > > > > + .p2align 4 > > > > +L(first_vec_x3): > > > > + tzcntl %eax, %eax > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > + size = [1, 159]. */ > > > > +# ifdef USE_AS_STRNLEN > > > > + /* Use ecx which was computed earlier to compute correct value. > > > > + */ > > > > + subl $(VEC_SIZE * 2 + 1), %ecx > > > > + addl %ecx, %eax > > > > +# else > > > > + subl %edx, %edi > > > > + addl $(VEC_SIZE * 2 + 1), %edi > > > > + addl %edi, %eax > > > > +# endif > > > > +# ifdef USE_AS_WCSLEN > > > > + shrl $2, %eax > > > > +# endif > > > > + VZEROUPPER_RETURN > > > > > > > > + .p2align 4 > > > > +L(first_vec_x4): > > > > + tzcntl %eax, %eax > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > + size = [1, 159]. */ > > > > # ifdef USE_AS_STRNLEN > > > > - subq $(VEC_SIZE * 4), %rsi > > > > - jbe L(last_4x_vec_or_less) > > > > + /* Use ecx which was computed earlier to compute correct value. 
> > > > + */ > > > > + subl $(VEC_SIZE + 1), %ecx > > > > + addl %ecx, %eax > > > > +# else > > > > + subl %edx, %edi > > > > + addl $(VEC_SIZE * 3 + 1), %edi > > > > + addl %edi, %eax > > > > # endif > > > > +# ifdef USE_AS_WCSLEN > > > > + shrl $2, %eax > > > > +# endif > > > > + VZEROUPPER_RETURN > > > > > > > > -L(more_4x_vec): > > > > + .p2align 5 > > > > +L(aligned_more): > > > > + /* Align data to VEC_SIZE - 1. This is the same number of > > > > + instructions as using andq with -VEC_SIZE but saves 4 bytes of > > > > + code on the x4 check. */ > > > > + orq $(VEC_SIZE - 1), %rdi > > > > +L(cross_page_continue): > > > > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > > > > since data is only aligned to VEC_SIZE. */ > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > - testl %eax, %eax > > > > - jnz L(first_vec_x0) > > > > - > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > +# ifdef USE_AS_STRNLEN > > > > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because > > > > + it simplies the logic in last_4x_vec_or_less. */ > > > > + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx > > > > + subq %rdx, %rcx > > > > +# endif > > > > + /* Load first VEC regardless. */ > > > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > > > +# ifdef USE_AS_STRNLEN > > > > + /* Adjust length. If near end handle specially. 
*/ > > > > + subq %rcx, %rsi > > > > + jb L(last_4x_vec_or_less) > > > > +# endif > > > > + vpmovmskb %ymm1, %eax > > > > testl %eax, %eax > > > > jnz L(first_vec_x1) > > > > > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > testl %eax, %eax > > > > jnz L(first_vec_x2) > > > > > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > testl %eax, %eax > > > > jnz L(first_vec_x3) > > > > > > > > - addq $(VEC_SIZE * 4), %rdi > > > > - > > > > -# ifdef USE_AS_STRNLEN > > > > - subq $(VEC_SIZE * 4), %rsi > > > > - jbe L(last_4x_vec_or_less) > > > > -# endif > > > > - > > > > - /* Align data to 4 * VEC_SIZE. */ > > > > - movq %rdi, %rcx > > > > - andl $(4 * VEC_SIZE - 1), %ecx > > > > - andq $-(4 * VEC_SIZE), %rdi > > > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > + testl %eax, %eax > > > > + jnz L(first_vec_x4) > > > > > > > > + /* Align data to VEC_SIZE * 4 - 1. */ > > > > # ifdef USE_AS_STRNLEN > > > > - /* Adjust length. */ > > > > + /* Before adjusting length check if at last VEC_SIZE * 4. */ > > > > + cmpq $(VEC_SIZE * 4 - 1), %rsi > > > > + jbe L(last_4x_vec_or_less_load) > > > > + incq %rdi > > > > + movl %edi, %ecx > > > > + orq $(VEC_SIZE * 4 - 1), %rdi > > > > + andl $(VEC_SIZE * 4 - 1), %ecx > > > > + /* Readjust length. */ > > > > addq %rcx, %rsi > > > > +# else > > > > + incq %rdi > > > > + orq $(VEC_SIZE * 4 - 1), %rdi > > > > # endif > > > > - > > > > + /* Compare 4 * VEC at a time forward. */ > > > > .p2align 4 > > > > L(loop_4x_vec): > > > > - /* Compare 4 * VEC at a time forward. 
*/ > > > > - vmovdqa (%rdi), %ymm1 > > > > - vmovdqa VEC_SIZE(%rdi), %ymm2 > > > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > > > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > > > > - VPMINU %ymm1, %ymm2, %ymm5 > > > > - VPMINU %ymm3, %ymm4, %ymm6 > > > > - VPMINU %ymm5, %ymm6, %ymm5 > > > > - > > > > - VPCMPEQ %ymm5, %ymm0, %ymm5 > > > > - vpmovmskb %ymm5, %eax > > > > - testl %eax, %eax > > > > - jnz L(4x_vec_end) > > > > - > > > > - addq $(VEC_SIZE * 4), %rdi > > > > - > > > > -# ifndef USE_AS_STRNLEN > > > > - jmp L(loop_4x_vec) > > > > -# else > > > > +# ifdef USE_AS_STRNLEN > > > > + /* Break if at end of length. */ > > > > subq $(VEC_SIZE * 4), %rsi > > > > - ja L(loop_4x_vec) > > > > - > > > > -L(last_4x_vec_or_less): > > > > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > > > > - addl $(VEC_SIZE * 2), %esi > > > > - jle L(last_2x_vec) > > > > + jb L(last_4x_vec_or_less_cmpeq) > > > > +# endif > > > > + /* Save some code size by microfusing VPMINU with the load. Since > > > > + the matches in ymm2/ymm4 can only be returned if there where no > > > > + matches in ymm1/ymm3 respectively there is no issue with overlap. 
> > > > + */ > > > > + vmovdqa 1(%rdi), %ymm1 > > > > + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 > > > > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 > > > > + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 > > > > + > > > > + VPMINU %ymm2, %ymm4, %ymm5 > > > > + VPCMPEQ %ymm5, %ymm0, %ymm5 > > > > + vpmovmskb %ymm5, %ecx > > > > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > - testl %eax, %eax > > > > - jnz L(first_vec_x0) > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > + testl %ecx, %ecx > > > > + jz L(loop_4x_vec) > > > > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > - testl %eax, %eax > > > > - jnz L(first_vec_x1) > > > > > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > + VPCMPEQ %ymm1, %ymm0, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > + subq %rdx, %rdi > > > > testl %eax, %eax > > > > + jnz L(last_vec_return_x0) > > > > > > > > - jnz L(first_vec_x2_check) > > > > - subl $VEC_SIZE, %esi > > > > - jle L(max) > > > > - > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > + VPCMPEQ %ymm2, %ymm0, %ymm2 > > > > + vpmovmskb %ymm2, %eax > > > > testl %eax, %eax > > > > - > > > > - jnz L(first_vec_x3_check) > > > > - movq %r8, %rax > > > > -# ifdef USE_AS_WCSLEN > > > > + jnz L(last_vec_return_x1) > > > > + > > > > + /* Combine last 2 VEC. */ > > > > + VPCMPEQ %ymm3, %ymm0, %ymm3 > > > > + vpmovmskb %ymm3, %eax > > > > + /* rcx has combined result from all 4 VEC. It will only be used if > > > > + the first 3 other VEC all did not contain a match. 
*/ > > > > + salq $32, %rcx > > > > + orq %rcx, %rax > > > > + tzcntq %rax, %rax > > > > + subq $(VEC_SIZE * 2 - 1), %rdi > > > > + addq %rdi, %rax > > > > +# ifdef USE_AS_WCSLEN > > > > shrq $2, %rax > > > > -# endif > > > > +# endif > > > > VZEROUPPER_RETURN > > > > > > > > + > > > > +# ifdef USE_AS_STRNLEN > > > > .p2align 4 > > > > -L(last_2x_vec): > > > > - addl $(VEC_SIZE * 2), %esi > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > - testl %eax, %eax > > > > +L(last_4x_vec_or_less_load): > > > > + /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > +L(last_4x_vec_or_less_cmpeq): > > > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > > > +L(last_4x_vec_or_less): > > > > > > > > - jnz L(first_vec_x0_check) > > > > - subl $VEC_SIZE, %esi > > > > - jle L(max) > > > > + vpmovmskb %ymm1, %eax > > > > + /* If remaining length > VEC_SIZE * 2. This works if esi is off by > > > > + VEC_SIZE * 4. */ > > > > + testl $(VEC_SIZE * 2), %esi > > > > + jnz L(last_4x_vec) > > > > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > + /* length may have been negative or positive by an offset of > > > > + VEC_SIZE * 4 depending on where this was called from. This fixes > > > > + that. */ > > > > + andl $(VEC_SIZE * 4 - 1), %esi > > > > testl %eax, %eax > > > > - jnz L(first_vec_x1_check) > > > > - movq %r8, %rax > > > > -# ifdef USE_AS_WCSLEN > > > > - shrq $2, %rax > > > > -# endif > > > > - VZEROUPPER_RETURN > > > > + jnz L(last_vec_x1_check) > > > > > > > > - .p2align 4 > > > > -L(first_vec_x0_check): > > > > + subl $VEC_SIZE, %esi > > > > + jb L(max) > > > > + > > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > tzcntl %eax, %eax > > > > /* Check the end of data. 
*/ > > > > - cmpq %rax, %rsi > > > > - jbe L(max) > > > > + cmpl %eax, %esi > > > > + jb L(max) > > > > + subq %rdx, %rdi > > > > + addl $(VEC_SIZE + 1), %eax > > > > addq %rdi, %rax > > > > - subq %rdx, %rax > > > > # ifdef USE_AS_WCSLEN > > > > shrq $2, %rax > > > > # endif > > > > VZEROUPPER_RETURN > > > > +# endif > > > > > > > > .p2align 4 > > > > -L(first_vec_x1_check): > > > > +L(last_vec_return_x0): > > > > tzcntl %eax, %eax > > > > - /* Check the end of data. */ > > > > - cmpq %rax, %rsi > > > > - jbe L(max) > > > > - addq $VEC_SIZE, %rax > > > > + subq $(VEC_SIZE * 4 - 1), %rdi > > > > addq %rdi, %rax > > > > - subq %rdx, %rax > > > > -# ifdef USE_AS_WCSLEN > > > > +# ifdef USE_AS_WCSLEN > > > > shrq $2, %rax > > > > -# endif > > > > +# endif > > > > VZEROUPPER_RETURN > > > > > > > > .p2align 4 > > > > -L(first_vec_x2_check): > > > > +L(last_vec_return_x1): > > > > tzcntl %eax, %eax > > > > - /* Check the end of data. */ > > > > - cmpq %rax, %rsi > > > > - jbe L(max) > > > > - addq $(VEC_SIZE * 2), %rax > > > > + subq $(VEC_SIZE * 3 - 1), %rdi > > > > addq %rdi, %rax > > > > - subq %rdx, %rax > > > > -# ifdef USE_AS_WCSLEN > > > > +# ifdef USE_AS_WCSLEN > > > > shrq $2, %rax > > > > -# endif > > > > +# endif > > > > VZEROUPPER_RETURN > > > > > > > > +# ifdef USE_AS_STRNLEN > > > > .p2align 4 > > > > -L(first_vec_x3_check): > > > > +L(last_vec_x1_check): > > > > + > > > > tzcntl %eax, %eax > > > > /* Check the end of data. */ > > > > - cmpq %rax, %rsi > > > > - jbe L(max) > > > > - addq $(VEC_SIZE * 3), %rax > > > > + cmpl %eax, %esi > > > > + jb L(max) > > > > + subq %rdx, %rdi > > > > + incl %eax > > > > addq %rdi, %rax > > > > - subq %rdx, %rax > > > > # ifdef USE_AS_WCSLEN > > > > shrq $2, %rax > > > > # endif > > > > VZEROUPPER_RETURN > > > > > > > > - .p2align 4 > > > > L(max): > > > > movq %r8, %rax > > > > + VZEROUPPER_RETURN > > > > + > > > > + .p2align 4 > > > > +L(last_4x_vec): > > > > + /* Test first 2x VEC normally. 
*/ > > > > + testl %eax, %eax > > > > + jnz L(last_vec_x1) > > > > + > > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > + testl %eax, %eax > > > > + jnz L(last_vec_x2) > > > > + > > > > + /* Normalize length. */ > > > > + andl $(VEC_SIZE * 4 - 1), %esi > > > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > + testl %eax, %eax > > > > + jnz L(last_vec_x3) > > > > + > > > > + subl $(VEC_SIZE * 3), %esi > > > > + jb L(max) > > > > + > > > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > + tzcntl %eax, %eax > > > > + /* Check the end of data. */ > > > > + cmpl %eax, %esi > > > > + jb L(max) > > > > + subq %rdx, %rdi > > > > + addl $(VEC_SIZE * 3 + 1), %eax > > > > + addq %rdi, %rax > > > > # ifdef USE_AS_WCSLEN > > > > shrq $2, %rax > > > > # endif > > > > VZEROUPPER_RETURN > > > > > > > > - .p2align 4 > > > > -L(zero): > > > > - xorl %eax, %eax > > > > - ret > > > > -# endif > > > > > > > > .p2align 4 > > > > -L(first_vec_x0): > > > > +L(last_vec_x1): > > > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > > > + instructions. */ > > > > tzcntl %eax, %eax > > > > + subq %rdx, %rdi > > > > + incl %eax > > > > addq %rdi, %rax > > > > - subq %rdx, %rax > > > > -# ifdef USE_AS_WCSLEN > > > > +# ifdef USE_AS_WCSLEN > > > > shrq $2, %rax > > > > -# endif > > > > +# endif > > > > VZEROUPPER_RETURN > > > > > > > > .p2align 4 > > > > -L(first_vec_x1): > > > > +L(last_vec_x2): > > > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > > > + instructions. 
*/ > > > > tzcntl %eax, %eax > > > > - addq $VEC_SIZE, %rax > > > > + subq %rdx, %rdi > > > > + addl $(VEC_SIZE + 1), %eax > > > > addq %rdi, %rax > > > > - subq %rdx, %rax > > > > -# ifdef USE_AS_WCSLEN > > > > +# ifdef USE_AS_WCSLEN > > > > shrq $2, %rax > > > > -# endif > > > > +# endif > > > > VZEROUPPER_RETURN > > > > > > > > .p2align 4 > > > > -L(first_vec_x2): > > > > +L(last_vec_x3): > > > > tzcntl %eax, %eax > > > > - addq $(VEC_SIZE * 2), %rax > > > > + subl $(VEC_SIZE * 2), %esi > > > > + /* Check the end of data. */ > > > > + cmpl %eax, %esi > > > > + jb L(max_end) > > > > + subq %rdx, %rdi > > > > + addl $(VEC_SIZE * 2 + 1), %eax > > > > addq %rdi, %rax > > > > - subq %rdx, %rax > > > > -# ifdef USE_AS_WCSLEN > > > > +# ifdef USE_AS_WCSLEN > > > > shrq $2, %rax > > > > -# endif > > > > +# endif > > > > + VZEROUPPER_RETURN > > > > +L(max_end): > > > > + movq %r8, %rax > > > > VZEROUPPER_RETURN > > > > +# endif > > > > > > > > + /* Cold case for crossing page with first load. */ > > > > .p2align 4 > > > > -L(4x_vec_end): > > > > - VPCMPEQ %ymm1, %ymm0, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > - testl %eax, %eax > > > > - jnz L(first_vec_x0) > > > > - VPCMPEQ %ymm2, %ymm0, %ymm2 > > > > - vpmovmskb %ymm2, %eax > > > > +L(cross_page_boundary): > > > > + /* Align data to VEC_SIZE - 1. */ > > > > + orq $(VEC_SIZE - 1), %rdi > > > > + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > > > > + so no need to manually mod rdx. */ > > > > + sarxl %edx, %eax, %eax > > > > > > This is a BMI2 instruction, which is not necessary available when AVX2 > > > is available. This causes SIGILL on some CPU. I have reported that in > > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611 > > > > This is not a bug on master as: > > > > commit 83c5b368226c34a2f0a5287df40fc290b2b34359 > > Author: H.J. 
Lu <hjl.tools@gmail.com> > > Date: Mon Apr 19 10:45:07 2021 -0700 > > > > x86-64: Require BMI2 for strchr-avx2.S > > > > is already in tree. The issue is the avx2 changes were backported > > without H.J.'s changes. > > > > > > Regards > > > Aurelien > > > > > > -- > > > Aurelien Jarno GPG: 4096R/1DDD8C9B > > > aurelien@aurel32.net http://www.aurel32.net ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-09-28 14:42 ` Noah Goldstein @ 2022-09-28 14:54 ` Sunil Pandey 2022-09-28 15:00 ` Noah Goldstein 0 siblings, 1 reply; 24+ messages in thread From: Sunil Pandey @ 2022-09-28 14:54 UTC (permalink / raw) To: Noah Goldstein; +Cc: Libc-stable Mailing List, Hongjiu Lu, GNU C Library On Wed, Sep 28, 2022 at 7:42 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > Attached patch fixes BZ# 29611. > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.28. Let me know > > if there is any objection. > The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S > > Can you post these as separate emails with the patches embedded instead of > attached? > > > Patches are also posted on bug report 29611. https://sourceware.org/bugzilla/show_bug.cgi?id=29611 > > > > On Sun, Sep 25, 2022 at 7:00 AM Noah Goldstein via Libc-alpha > > <libc-alpha@sourceware.org> wrote: > > > > > > On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > > > > > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote: > > > > > No bug. This commit optimizes strlen-avx2.S. The optimizations are > > > > > mostly small things but they add up to roughly 10-30% performance > > > > > improvement for strlen. The results for strnlen are bit more > > > > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > > > > > are all passing.
> > > > > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > > > --- > > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > > > > > sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- > > > > > 2 files changed, 334 insertions(+), 214 deletions(-) > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > index c377cab629..651b32908e 100644 > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > > > > > IFUNC_IMPL (i, name, strlen, > > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > __strlen_avx2) > > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > __strlen_avx2_rtm) > > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > /* Support sysdeps/x86_64/multiarch/strnlen.c. */ > > > > > IFUNC_IMPL (i, name, strnlen, > > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > __strnlen_avx2) > > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > __strnlen_avx2_rtm) > > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > /* Support sysdeps/x86_64/multiarch/wcslen.c. 
*/ > > > > > IFUNC_IMPL (i, name, wcslen, > > > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > __wcslen_avx2) > > > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > __wcslen_avx2_rtm) > > > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > > > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ > > > > > IFUNC_IMPL (i, name, wcsnlen, > > > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > __wcsnlen_avx2) > > > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > __wcsnlen_avx2_rtm) > > > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > > > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S > > > > > index 1caae9e6bc..bd2e6ee44a 100644 > > > > > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S > > > > > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S > > > > > @@ -27,9 +27,11 @@ > > > > > # ifdef USE_AS_WCSLEN > > > > > # define VPCMPEQ vpcmpeqd > > > > > # define VPMINU vpminud > > > > > +# define CHAR_SIZE 4 > > > > > # else > > > > > # define VPCMPEQ vpcmpeqb > > > > > # define VPMINU vpminub > > > > > +# define CHAR_SIZE 1 > > > > > # endif > > > > > > > > > > # ifndef VZEROUPPER > > > > > @@ -41,349 +43,459 @@ > > > > > # endif > > > > > > > > > > # define VEC_SIZE 32 > > > > > +# define PAGE_SIZE 4096 > > > > > > > > > > .section SECTION(.text),"ax",@progbits > > > > > ENTRY (STRLEN) > > > > > # ifdef USE_AS_STRNLEN > > > > > - /* Check for zero length. 
*/ > > > > > + /* Check zero length. */ > > > > > test %RSI_LP, %RSI_LP > > > > > jz L(zero) > > > > > + /* Store max len in R8_LP before adjusting if using WCSLEN. */ > > > > > + mov %RSI_LP, %R8_LP > > > > > # ifdef USE_AS_WCSLEN > > > > > shl $2, %RSI_LP > > > > > # elif defined __ILP32__ > > > > > /* Clear the upper 32 bits. */ > > > > > movl %esi, %esi > > > > > # endif > > > > > - mov %RSI_LP, %R8_LP > > > > > # endif > > > > > - movl %edi, %ecx > > > > > + movl %edi, %eax > > > > > movq %rdi, %rdx > > > > > vpxor %xmm0, %xmm0, %xmm0 > > > > > - > > > > > + /* Clear high bits from edi. Only keeping bits relevant to page > > > > > + cross check. */ > > > > > + andl $(PAGE_SIZE - 1), %eax > > > > > /* Check if we may cross page boundary with one vector load. */ > > > > > - andl $(2 * VEC_SIZE - 1), %ecx > > > > > - cmpl $VEC_SIZE, %ecx > > > > > - ja L(cros_page_boundary) > > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > > + ja L(cross_page_boundary) > > > > > > > > > > /* Check the first VEC_SIZE bytes. */ > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - testl %eax, %eax > > > > > - > > > > > + VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > # ifdef USE_AS_STRNLEN > > > > > - jnz L(first_vec_x0_check) > > > > > - /* Adjust length and check the end of data. */ > > > > > - subq $VEC_SIZE, %rsi > > > > > - jbe L(max) > > > > > -# else > > > > > - jnz L(first_vec_x0) > > > > > + /* If length < VEC_SIZE handle special. */ > > > > > + cmpq $VEC_SIZE, %rsi > > > > > + jbe L(first_vec_x0) > > > > > # endif > > > > > - > > > > > - /* Align data for aligned loads in the loop. */ > > > > > - addq $VEC_SIZE, %rdi > > > > > - andl $(VEC_SIZE - 1), %ecx > > > > > - andq $-VEC_SIZE, %rdi > > > > > + /* If empty continue to aligned_more. Otherwise return bit > > > > > + position of first match. 
*/ > > > > > + testl %eax, %eax > > > > > + jz L(aligned_more) > > > > > + tzcntl %eax, %eax > > > > > +# ifdef USE_AS_WCSLEN > > > > > + shrl $2, %eax > > > > > +# endif > > > > > + VZEROUPPER_RETURN > > > > > > > > > > # ifdef USE_AS_STRNLEN > > > > > - /* Adjust length. */ > > > > > - addq %rcx, %rsi > > > > > +L(zero): > > > > > + xorl %eax, %eax > > > > > + ret > > > > > > > > > > - subq $(VEC_SIZE * 4), %rsi > > > > > - jbe L(last_4x_vec_or_less) > > > > > + .p2align 4 > > > > > +L(first_vec_x0): > > > > > + /* Set bit for max len so that tzcnt will return min of max len > > > > > + and position of first match. */ > > > > > + btsq %rsi, %rax > > > > > + tzcntl %eax, %eax > > > > > +# ifdef USE_AS_WCSLEN > > > > > + shrl $2, %eax > > > > > +# endif > > > > > + VZEROUPPER_RETURN > > > > > # endif > > > > > - jmp L(more_4x_vec) > > > > > > > > > > .p2align 4 > > > > > -L(cros_page_boundary): > > > > > - andl $(VEC_SIZE - 1), %ecx > > > > > - andq $-VEC_SIZE, %rdi > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - /* Remove the leading bytes. */ > > > > > - sarl %cl, %eax > > > > > - testl %eax, %eax > > > > > - jz L(aligned_more) > > > > > +L(first_vec_x1): > > > > > tzcntl %eax, %eax > > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > > + size = [1, 159]. */ > > > > > # ifdef USE_AS_STRNLEN > > > > > - /* Check the end of data. */ > > > > > - cmpq %rax, %rsi > > > > > - jbe L(max) > > > > > + /* Use ecx which was computed earlier to compute correct value. 
> > > > > + */ > > > > > + subl $(VEC_SIZE * 4 + 1), %ecx > > > > > + addl %ecx, %eax > > > > > +# else > > > > > + subl %edx, %edi > > > > > + incl %edi > > > > > + addl %edi, %eax > > > > > # endif > > > > > - addq %rdi, %rax > > > > > - addq %rcx, %rax > > > > > - subq %rdx, %rax > > > > > # ifdef USE_AS_WCSLEN > > > > > - shrq $2, %rax > > > > > + shrl $2, %eax > > > > > # endif > > > > > -L(return_vzeroupper): > > > > > - ZERO_UPPER_VEC_REGISTERS_RETURN > > > > > + VZEROUPPER_RETURN > > > > > > > > > > .p2align 4 > > > > > -L(aligned_more): > > > > > +L(first_vec_x2): > > > > > + tzcntl %eax, %eax > > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > > + size = [1, 159]. */ > > > > > # ifdef USE_AS_STRNLEN > > > > > - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" > > > > > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > > > > > - to void possible addition overflow. */ > > > > > - negq %rcx > > > > > - addq $VEC_SIZE, %rcx > > > > > - > > > > > - /* Check the end of data. */ > > > > > - subq %rcx, %rsi > > > > > - jbe L(max) > > > > > + /* Use ecx which was computed earlier to compute correct value. > > > > > + */ > > > > > + subl $(VEC_SIZE * 3 + 1), %ecx > > > > > + addl %ecx, %eax > > > > > +# else > > > > > + subl %edx, %edi > > > > > + addl $(VEC_SIZE + 1), %edi > > > > > + addl %edi, %eax > > > > > # endif > > > > > +# ifdef USE_AS_WCSLEN > > > > > + shrl $2, %eax > > > > > +# endif > > > > > + VZEROUPPER_RETURN > > > > > > > > > > - addq $VEC_SIZE, %rdi > > > > > + .p2align 4 > > > > > +L(first_vec_x3): > > > > > + tzcntl %eax, %eax > > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > > + size = [1, 159]. */ > > > > > +# ifdef USE_AS_STRNLEN > > > > > + /* Use ecx which was computed earlier to compute correct value. 
> > > > > + */ > > > > > + subl $(VEC_SIZE * 2 + 1), %ecx > > > > > + addl %ecx, %eax > > > > > +# else > > > > > + subl %edx, %edi > > > > > + addl $(VEC_SIZE * 2 + 1), %edi > > > > > + addl %edi, %eax > > > > > +# endif > > > > > +# ifdef USE_AS_WCSLEN > > > > > + shrl $2, %eax > > > > > +# endif > > > > > + VZEROUPPER_RETURN > > > > > > > > > > + .p2align 4 > > > > > +L(first_vec_x4): > > > > > + tzcntl %eax, %eax > > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > > + size = [1, 159]. */ > > > > > # ifdef USE_AS_STRNLEN > > > > > - subq $(VEC_SIZE * 4), %rsi > > > > > - jbe L(last_4x_vec_or_less) > > > > > + /* Use ecx which was computed earlier to compute correct value. > > > > > + */ > > > > > + subl $(VEC_SIZE + 1), %ecx > > > > > + addl %ecx, %eax > > > > > +# else > > > > > + subl %edx, %edi > > > > > + addl $(VEC_SIZE * 3 + 1), %edi > > > > > + addl %edi, %eax > > > > > # endif > > > > > +# ifdef USE_AS_WCSLEN > > > > > + shrl $2, %eax > > > > > +# endif > > > > > + VZEROUPPER_RETURN > > > > > > > > > > -L(more_4x_vec): > > > > > + .p2align 5 > > > > > +L(aligned_more): > > > > > + /* Align data to VEC_SIZE - 1. This is the same number of > > > > > + instructions as using andq with -VEC_SIZE but saves 4 bytes of > > > > > + code on the x4 check. */ > > > > > + orq $(VEC_SIZE - 1), %rdi > > > > > +L(cross_page_continue): > > > > > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > > > > > since data is only aligned to VEC_SIZE. */ > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - testl %eax, %eax > > > > > - jnz L(first_vec_x0) > > > > > - > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > +# ifdef USE_AS_STRNLEN > > > > > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because > > > > > + it simplies the logic in last_4x_vec_or_less. 
*/ > > > > > + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx > > > > > + subq %rdx, %rcx > > > > > +# endif > > > > > + /* Load first VEC regardless. */ > > > > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > > > > +# ifdef USE_AS_STRNLEN > > > > > + /* Adjust length. If near end handle specially. */ > > > > > + subq %rcx, %rsi > > > > > + jb L(last_4x_vec_or_less) > > > > > +# endif > > > > > + vpmovmskb %ymm1, %eax > > > > > testl %eax, %eax > > > > > jnz L(first_vec_x1) > > > > > > > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > testl %eax, %eax > > > > > jnz L(first_vec_x2) > > > > > > > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > testl %eax, %eax > > > > > jnz L(first_vec_x3) > > > > > > > > > > - addq $(VEC_SIZE * 4), %rdi > > > > > - > > > > > -# ifdef USE_AS_STRNLEN > > > > > - subq $(VEC_SIZE * 4), %rsi > > > > > - jbe L(last_4x_vec_or_less) > > > > > -# endif > > > > > - > > > > > - /* Align data to 4 * VEC_SIZE. */ > > > > > - movq %rdi, %rcx > > > > > - andl $(4 * VEC_SIZE - 1), %ecx > > > > > - andq $-(4 * VEC_SIZE), %rdi > > > > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > + testl %eax, %eax > > > > > + jnz L(first_vec_x4) > > > > > > > > > > + /* Align data to VEC_SIZE * 4 - 1. */ > > > > > # ifdef USE_AS_STRNLEN > > > > > - /* Adjust length. */ > > > > > + /* Before adjusting length check if at last VEC_SIZE * 4. */ > > > > > + cmpq $(VEC_SIZE * 4 - 1), %rsi > > > > > + jbe L(last_4x_vec_or_less_load) > > > > > + incq %rdi > > > > > + movl %edi, %ecx > > > > > + orq $(VEC_SIZE * 4 - 1), %rdi > > > > > + andl $(VEC_SIZE * 4 - 1), %ecx > > > > > + /* Readjust length. 
*/ > > > > > addq %rcx, %rsi > > > > > +# else > > > > > + incq %rdi > > > > > + orq $(VEC_SIZE * 4 - 1), %rdi > > > > > # endif > > > > > - > > > > > + /* Compare 4 * VEC at a time forward. */ > > > > > .p2align 4 > > > > > L(loop_4x_vec): > > > > > - /* Compare 4 * VEC at a time forward. */ > > > > > - vmovdqa (%rdi), %ymm1 > > > > > - vmovdqa VEC_SIZE(%rdi), %ymm2 > > > > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > > > > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > > > > > - VPMINU %ymm1, %ymm2, %ymm5 > > > > > - VPMINU %ymm3, %ymm4, %ymm6 > > > > > - VPMINU %ymm5, %ymm6, %ymm5 > > > > > - > > > > > - VPCMPEQ %ymm5, %ymm0, %ymm5 > > > > > - vpmovmskb %ymm5, %eax > > > > > - testl %eax, %eax > > > > > - jnz L(4x_vec_end) > > > > > - > > > > > - addq $(VEC_SIZE * 4), %rdi > > > > > - > > > > > -# ifndef USE_AS_STRNLEN > > > > > - jmp L(loop_4x_vec) > > > > > -# else > > > > > +# ifdef USE_AS_STRNLEN > > > > > + /* Break if at end of length. */ > > > > > subq $(VEC_SIZE * 4), %rsi > > > > > - ja L(loop_4x_vec) > > > > > - > > > > > -L(last_4x_vec_or_less): > > > > > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > > > > > - addl $(VEC_SIZE * 2), %esi > > > > > - jle L(last_2x_vec) > > > > > + jb L(last_4x_vec_or_less_cmpeq) > > > > > +# endif > > > > > + /* Save some code size by microfusing VPMINU with the load. Since > > > > > + the matches in ymm2/ymm4 can only be returned if there where no > > > > > + matches in ymm1/ymm3 respectively there is no issue with overlap. 
> > > > > + */ > > > > > + vmovdqa 1(%rdi), %ymm1 > > > > > + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 > > > > > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 > > > > > + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 > > > > > + > > > > > + VPMINU %ymm2, %ymm4, %ymm5 > > > > > + VPCMPEQ %ymm5, %ymm0, %ymm5 > > > > > + vpmovmskb %ymm5, %ecx > > > > > > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - testl %eax, %eax > > > > > - jnz L(first_vec_x0) > > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > > + testl %ecx, %ecx > > > > > + jz L(loop_4x_vec) > > > > > > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - testl %eax, %eax > > > > > - jnz L(first_vec_x1) > > > > > > > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > + VPCMPEQ %ymm1, %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > + subq %rdx, %rdi > > > > > testl %eax, %eax > > > > > + jnz L(last_vec_return_x0) > > > > > > > > > > - jnz L(first_vec_x2_check) > > > > > - subl $VEC_SIZE, %esi > > > > > - jle L(max) > > > > > - > > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > + VPCMPEQ %ymm2, %ymm0, %ymm2 > > > > > + vpmovmskb %ymm2, %eax > > > > > testl %eax, %eax > > > > > - > > > > > - jnz L(first_vec_x3_check) > > > > > - movq %r8, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > + jnz L(last_vec_return_x1) > > > > > + > > > > > + /* Combine last 2 VEC. */ > > > > > + VPCMPEQ %ymm3, %ymm0, %ymm3 > > > > > + vpmovmskb %ymm3, %eax > > > > > + /* rcx has combined result from all 4 VEC. It will only be used if > > > > > + the first 3 other VEC all did not contain a match. 
*/ > > > > > + salq $32, %rcx > > > > > + orq %rcx, %rax > > > > > + tzcntq %rax, %rax > > > > > + subq $(VEC_SIZE * 2 - 1), %rdi > > > > > + addq %rdi, %rax > > > > > +# ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > -# endif > > > > > +# endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > + > > > > > +# ifdef USE_AS_STRNLEN > > > > > .p2align 4 > > > > > -L(last_2x_vec): > > > > > - addl $(VEC_SIZE * 2), %esi > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - testl %eax, %eax > > > > > +L(last_4x_vec_or_less_load): > > > > > + /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ > > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > > +L(last_4x_vec_or_less_cmpeq): > > > > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > > > > +L(last_4x_vec_or_less): > > > > > > > > > > - jnz L(first_vec_x0_check) > > > > > - subl $VEC_SIZE, %esi > > > > > - jle L(max) > > > > > + vpmovmskb %ymm1, %eax > > > > > + /* If remaining length > VEC_SIZE * 2. This works if esi is off by > > > > > + VEC_SIZE * 4. */ > > > > > + testl $(VEC_SIZE * 2), %esi > > > > > + jnz L(last_4x_vec) > > > > > > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > + /* length may have been negative or positive by an offset of > > > > > + VEC_SIZE * 4 depending on where this was called from. This fixes > > > > > + that. */ > > > > > + andl $(VEC_SIZE * 4 - 1), %esi > > > > > testl %eax, %eax > > > > > - jnz L(first_vec_x1_check) > > > > > - movq %r8, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > - shrq $2, %rax > > > > > -# endif > > > > > - VZEROUPPER_RETURN > > > > > + jnz L(last_vec_x1_check) > > > > > > > > > > - .p2align 4 > > > > > -L(first_vec_x0_check): > > > > > + subl $VEC_SIZE, %esi > > > > > + jb L(max) > > > > > + > > > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > tzcntl %eax, %eax > > > > > /* Check the end of data. 
*/ > > > > > - cmpq %rax, %rsi > > > > > - jbe L(max) > > > > > + cmpl %eax, %esi > > > > > + jb L(max) > > > > > + subq %rdx, %rdi > > > > > + addl $(VEC_SIZE + 1), %eax > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > # ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > # endif > > > > > VZEROUPPER_RETURN > > > > > +# endif > > > > > > > > > > .p2align 4 > > > > > -L(first_vec_x1_check): > > > > > +L(last_vec_return_x0): > > > > > tzcntl %eax, %eax > > > > > - /* Check the end of data. */ > > > > > - cmpq %rax, %rsi > > > > > - jbe L(max) > > > > > - addq $VEC_SIZE, %rax > > > > > + subq $(VEC_SIZE * 4 - 1), %rdi > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > +# ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > -# endif > > > > > +# endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > .p2align 4 > > > > > -L(first_vec_x2_check): > > > > > +L(last_vec_return_x1): > > > > > tzcntl %eax, %eax > > > > > - /* Check the end of data. */ > > > > > - cmpq %rax, %rsi > > > > > - jbe L(max) > > > > > - addq $(VEC_SIZE * 2), %rax > > > > > + subq $(VEC_SIZE * 3 - 1), %rdi > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > +# ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > -# endif > > > > > +# endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > +# ifdef USE_AS_STRNLEN > > > > > .p2align 4 > > > > > -L(first_vec_x3_check): > > > > > +L(last_vec_x1_check): > > > > > + > > > > > tzcntl %eax, %eax > > > > > /* Check the end of data. 
*/ > > > > > - cmpq %rax, %rsi > > > > > - jbe L(max) > > > > > - addq $(VEC_SIZE * 3), %rax > > > > > + cmpl %eax, %esi > > > > > + jb L(max) > > > > > + subq %rdx, %rdi > > > > > + incl %eax > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > # ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > # endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > - .p2align 4 > > > > > L(max): > > > > > movq %r8, %rax > > > > > + VZEROUPPER_RETURN > > > > > + > > > > > + .p2align 4 > > > > > +L(last_4x_vec): > > > > > + /* Test first 2x VEC normally. */ > > > > > + testl %eax, %eax > > > > > + jnz L(last_vec_x1) > > > > > + > > > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > + testl %eax, %eax > > > > > + jnz L(last_vec_x2) > > > > > + > > > > > + /* Normalize length. */ > > > > > + andl $(VEC_SIZE * 4 - 1), %esi > > > > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > + testl %eax, %eax > > > > > + jnz L(last_vec_x3) > > > > > + > > > > > + subl $(VEC_SIZE * 3), %esi > > > > > + jb L(max) > > > > > + > > > > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > + tzcntl %eax, %eax > > > > > + /* Check the end of data. */ > > > > > + cmpl %eax, %esi > > > > > + jb L(max) > > > > > + subq %rdx, %rdi > > > > > + addl $(VEC_SIZE * 3 + 1), %eax > > > > > + addq %rdi, %rax > > > > > # ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > # endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > - .p2align 4 > > > > > -L(zero): > > > > > - xorl %eax, %eax > > > > > - ret > > > > > -# endif > > > > > > > > > > .p2align 4 > > > > > -L(first_vec_x0): > > > > > +L(last_vec_x1): > > > > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > > > > + instructions. 
*/ > > > > > tzcntl %eax, %eax > > > > > + subq %rdx, %rdi > > > > > + incl %eax > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > +# ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > -# endif > > > > > +# endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > .p2align 4 > > > > > -L(first_vec_x1): > > > > > +L(last_vec_x2): > > > > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > > > > + instructions. */ > > > > > tzcntl %eax, %eax > > > > > - addq $VEC_SIZE, %rax > > > > > + subq %rdx, %rdi > > > > > + addl $(VEC_SIZE + 1), %eax > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > +# ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > -# endif > > > > > +# endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > .p2align 4 > > > > > -L(first_vec_x2): > > > > > +L(last_vec_x3): > > > > > tzcntl %eax, %eax > > > > > - addq $(VEC_SIZE * 2), %rax > > > > > + subl $(VEC_SIZE * 2), %esi > > > > > + /* Check the end of data. */ > > > > > + cmpl %eax, %esi > > > > > + jb L(max_end) > > > > > + subq %rdx, %rdi > > > > > + addl $(VEC_SIZE * 2 + 1), %eax > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > +# ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > -# endif > > > > > +# endif > > > > > + VZEROUPPER_RETURN > > > > > +L(max_end): > > > > > + movq %r8, %rax > > > > > VZEROUPPER_RETURN > > > > > +# endif > > > > > > > > > > + /* Cold case for crossing page with first load. */ > > > > > .p2align 4 > > > > > -L(4x_vec_end): > > > > > - VPCMPEQ %ymm1, %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - testl %eax, %eax > > > > > - jnz L(first_vec_x0) > > > > > - VPCMPEQ %ymm2, %ymm0, %ymm2 > > > > > - vpmovmskb %ymm2, %eax > > > > > +L(cross_page_boundary): > > > > > + /* Align data to VEC_SIZE - 1. 
*/ > > > > > + orq $(VEC_SIZE - 1), %rdi > > > > > + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > > > > > + so no need to manually mod rdx. */ > > > > > + sarxl %edx, %eax, %eax > > > > > > > > This is a BMI2 instruction, which is not necessary available when AVX2 > > > > is available. This causes SIGILL on some CPU. I have reported that in > > > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611 > > > > > > This is not a bug on master as: > > > > > > commit 83c5b368226c34a2f0a5287df40fc290b2b34359 > > > Author: H.J. Lu <hjl.tools@gmail.com> > > > Date: Mon Apr 19 10:45:07 2021 -0700 > > > > > > x86-64: Require BMI2 for strchr-avx2.S > > > > > > is already in tree. The issue is the avx2 changes where backported > > > w.o H.J's changes. > > > > > > > > Regards > > > > Aurelien > > > > > > > > -- > > > > Aurelien Jarno GPG: 4096R/1DDD8C9B > > > > aurelien@aurel32.net http://www.aurel32.net ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-09-28 14:54 ` Sunil Pandey @ 2022-09-28 15:00 ` Noah Goldstein 2022-09-28 18:24 ` H.J. Lu 0 siblings, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2022-09-28 15:00 UTC (permalink / raw) To: Sunil Pandey; +Cc: Libc-stable Mailing List, Hongjiu Lu, GNU C Library On Wed, Sep 28, 2022 at 7:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Wed, Sep 28, 2022 at 7:42 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > Attached patch fixes BZ# 29611. > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > if there is any objection. > > The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S > > > > Can you post these as separate emails with the patches embedded instead of > > attached? > > > > > > > Patches are also posted on bug report 29611. > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611 is there a mailing list for backport patches like this? > > > > > > > On Sun, Sep 25, 2022 at 7:00 AM Noah Goldstein via Libc-alpha > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > > > > > > > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote: > > > > > > No bug. This commit optimizes strlen-avx2.S. The optimizations are > > > > > > mostly small things but they add up to roughly 10-30% performance > > > > > > improvement for strlen. The results for strnlen are bit more > > > > > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > > > > > > are all passing. 
> > > > > > > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > > > > --- > > > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > > > > > > sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- > > > > > > 2 files changed, 334 insertions(+), 214 deletions(-) > > > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > > index c377cab629..651b32908e 100644 > > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > > > > > > IFUNC_IMPL (i, name, strlen, > > > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > > __strlen_avx2) > > > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > > __strlen_avx2_rtm) > > > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > > /* Support sysdeps/x86_64/multiarch/strnlen.c. 
*/ > > > > > > IFUNC_IMPL (i, name, strnlen, > > > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > > __strnlen_avx2) > > > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > > __strnlen_avx2_rtm) > > > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > > /* Support sysdeps/x86_64/multiarch/wcslen.c. */ > > > > > > IFUNC_IMPL (i, name, wcslen, > > > > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > > __wcslen_avx2) > > > > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > > __wcslen_avx2_rtm) > > > > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > > > > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > > /* Support sysdeps/x86_64/multiarch/wcsnlen.c. 
*/ > > > > > > IFUNC_IMPL (i, name, wcsnlen, > > > > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > > __wcsnlen_avx2) > > > > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > > __wcsnlen_avx2_rtm) > > > > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > > > > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S > > > > > > index 1caae9e6bc..bd2e6ee44a 100644 > > > > > > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S > > > > > > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S > > > > > > @@ -27,9 +27,11 @@ > > > > > > # ifdef USE_AS_WCSLEN > > > > > > # define VPCMPEQ vpcmpeqd > > > > > > # define VPMINU vpminud > > > > > > +# define CHAR_SIZE 4 > > > > > > # else > > > > > > # define VPCMPEQ vpcmpeqb > > > > > > # define VPMINU vpminub > > > > > > +# define CHAR_SIZE 1 > > > > > > # endif > > > > > > > > > > > > # ifndef VZEROUPPER > > > > > > @@ -41,349 +43,459 @@ > > > > > > # endif > > > > > > > > > > > > # define VEC_SIZE 32 > > > > > > +# define PAGE_SIZE 4096 > > > > > > > > > > > > .section SECTION(.text),"ax",@progbits > > > > > > ENTRY (STRLEN) > > > > > > # ifdef USE_AS_STRNLEN > > > > > > - /* Check for zero length. */ > > > > > > + /* Check zero length. */ > > > > > > test %RSI_LP, %RSI_LP > > > > > > jz L(zero) > > > > > > + /* Store max len in R8_LP before adjusting if using WCSLEN. */ > > > > > > + mov %RSI_LP, %R8_LP > > > > > > # ifdef USE_AS_WCSLEN > > > > > > shl $2, %RSI_LP > > > > > > # elif defined __ILP32__ > > > > > > /* Clear the upper 32 bits. 
*/ > > > > > > movl %esi, %esi > > > > > > # endif > > > > > > - mov %RSI_LP, %R8_LP > > > > > > # endif > > > > > > - movl %edi, %ecx > > > > > > + movl %edi, %eax > > > > > > movq %rdi, %rdx > > > > > > vpxor %xmm0, %xmm0, %xmm0 > > > > > > - > > > > > > + /* Clear high bits from edi. Only keeping bits relevant to page > > > > > > + cross check. */ > > > > > > + andl $(PAGE_SIZE - 1), %eax > > > > > > /* Check if we may cross page boundary with one vector load. */ > > > > > > - andl $(2 * VEC_SIZE - 1), %ecx > > > > > > - cmpl $VEC_SIZE, %ecx > > > > > > - ja L(cros_page_boundary) > > > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > > > + ja L(cross_page_boundary) > > > > > > > > > > > > /* Check the first VEC_SIZE bytes. */ > > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > > - vpmovmskb %ymm1, %eax > > > > > > - testl %eax, %eax > > > > > > - > > > > > > + VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > > + vpmovmskb %ymm1, %eax > > > > > > # ifdef USE_AS_STRNLEN > > > > > > - jnz L(first_vec_x0_check) > > > > > > - /* Adjust length and check the end of data. */ > > > > > > - subq $VEC_SIZE, %rsi > > > > > > - jbe L(max) > > > > > > -# else > > > > > > - jnz L(first_vec_x0) > > > > > > + /* If length < VEC_SIZE handle special. */ > > > > > > + cmpq $VEC_SIZE, %rsi > > > > > > + jbe L(first_vec_x0) > > > > > > # endif > > > > > > - > > > > > > - /* Align data for aligned loads in the loop. */ > > > > > > - addq $VEC_SIZE, %rdi > > > > > > - andl $(VEC_SIZE - 1), %ecx > > > > > > - andq $-VEC_SIZE, %rdi > > > > > > + /* If empty continue to aligned_more. Otherwise return bit > > > > > > + position of first match. */ > > > > > > + testl %eax, %eax > > > > > > + jz L(aligned_more) > > > > > > + tzcntl %eax, %eax > > > > > > +# ifdef USE_AS_WCSLEN > > > > > > + shrl $2, %eax > > > > > > +# endif > > > > > > + VZEROUPPER_RETURN > > > > > > > > > > > > # ifdef USE_AS_STRNLEN > > > > > > - /* Adjust length. 
*/ > > > > > > - addq %rcx, %rsi > > > > > > +L(zero): > > > > > > + xorl %eax, %eax > > > > > > + ret > > > > > > > > > > > > - subq $(VEC_SIZE * 4), %rsi > > > > > > - jbe L(last_4x_vec_or_less) > > > > > > + .p2align 4 > > > > > > +L(first_vec_x0): > > > > > > + /* Set bit for max len so that tzcnt will return min of max len > > > > > > + and position of first match. */ > > > > > > + btsq %rsi, %rax > > > > > > + tzcntl %eax, %eax > > > > > > +# ifdef USE_AS_WCSLEN > > > > > > + shrl $2, %eax > > > > > > +# endif > > > > > > + VZEROUPPER_RETURN > > > > > > # endif > > > > > > - jmp L(more_4x_vec) > > > > > > > > > > > > .p2align 4 > > > > > > -L(cros_page_boundary): > > > > > > - andl $(VEC_SIZE - 1), %ecx > > > > > > - andq $-VEC_SIZE, %rdi > > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > > - vpmovmskb %ymm1, %eax > > > > > > - /* Remove the leading bytes. */ > > > > > > - sarl %cl, %eax > > > > > > - testl %eax, %eax > > > > > > - jz L(aligned_more) > > > > > > +L(first_vec_x1): > > > > > > tzcntl %eax, %eax > > > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > > > + size = [1, 159]. */ > > > > > > # ifdef USE_AS_STRNLEN > > > > > > - /* Check the end of data. */ > > > > > > - cmpq %rax, %rsi > > > > > > - jbe L(max) > > > > > > + /* Use ecx which was computed earlier to compute correct value. 
> > > > > > + */ > > > > > > + subl $(VEC_SIZE * 4 + 1), %ecx > > > > > > + addl %ecx, %eax > > > > > > +# else > > > > > > + subl %edx, %edi > > > > > > + incl %edi > > > > > > + addl %edi, %eax > > > > > > # endif > > > > > > - addq %rdi, %rax > > > > > > - addq %rcx, %rax > > > > > > - subq %rdx, %rax > > > > > > # ifdef USE_AS_WCSLEN > > > > > > - shrq $2, %rax > > > > > > + shrl $2, %eax > > > > > > # endif > > > > > > -L(return_vzeroupper): > > > > > > - ZERO_UPPER_VEC_REGISTERS_RETURN > > > > > > + VZEROUPPER_RETURN > > > > > > > > > > > > .p2align 4 > > > > > > -L(aligned_more): > > > > > > +L(first_vec_x2): > > > > > > + tzcntl %eax, %eax > > > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > > > + size = [1, 159]. */ > > > > > > # ifdef USE_AS_STRNLEN > > > > > > - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" > > > > > > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > > > > > > - to void possible addition overflow. */ > > > > > > - negq %rcx > > > > > > - addq $VEC_SIZE, %rcx > > > > > > - > > > > > > - /* Check the end of data. */ > > > > > > - subq %rcx, %rsi > > > > > > - jbe L(max) > > > > > > + /* Use ecx which was computed earlier to compute correct value. > > > > > > + */ > > > > > > + subl $(VEC_SIZE * 3 + 1), %ecx > > > > > > + addl %ecx, %eax > > > > > > +# else > > > > > > + subl %edx, %edi > > > > > > + addl $(VEC_SIZE + 1), %edi > > > > > > + addl %edi, %eax > > > > > > # endif > > > > > > +# ifdef USE_AS_WCSLEN > > > > > > + shrl $2, %eax > > > > > > +# endif > > > > > > + VZEROUPPER_RETURN > > > > > > > > > > > > - addq $VEC_SIZE, %rdi > > > > > > + .p2align 4 > > > > > > +L(first_vec_x3): > > > > > > + tzcntl %eax, %eax > > > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > > > + size = [1, 159]. */ > > > > > > +# ifdef USE_AS_STRNLEN > > > > > > + /* Use ecx which was computed earlier to compute correct value. 
> > > > > > + */ > > > > > > + subl $(VEC_SIZE * 2 + 1), %ecx > > > > > > + addl %ecx, %eax > > > > > > +# else > > > > > > + subl %edx, %edi > > > > > > + addl $(VEC_SIZE * 2 + 1), %edi > > > > > > + addl %edi, %eax > > > > > > +# endif > > > > > > +# ifdef USE_AS_WCSLEN > > > > > > + shrl $2, %eax > > > > > > +# endif > > > > > > + VZEROUPPER_RETURN > > > > > > > > > > > > + .p2align 4 > > > > > > +L(first_vec_x4): > > > > > > + tzcntl %eax, %eax > > > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > > > + size = [1, 159]. */ > > > > > > # ifdef USE_AS_STRNLEN > > > > > > - subq $(VEC_SIZE * 4), %rsi > > > > > > - jbe L(last_4x_vec_or_less) > > > > > > + /* Use ecx which was computed earlier to compute correct value. > > > > > > + */ > > > > > > + subl $(VEC_SIZE + 1), %ecx > > > > > > + addl %ecx, %eax > > > > > > +# else > > > > > > + subl %edx, %edi > > > > > > + addl $(VEC_SIZE * 3 + 1), %edi > > > > > > + addl %edi, %eax > > > > > > # endif > > > > > > +# ifdef USE_AS_WCSLEN > > > > > > + shrl $2, %eax > > > > > > +# endif > > > > > > + VZEROUPPER_RETURN > > > > > > > > > > > > -L(more_4x_vec): > > > > > > + .p2align 5 > > > > > > +L(aligned_more): > > > > > > + /* Align data to VEC_SIZE - 1. This is the same number of > > > > > > + instructions as using andq with -VEC_SIZE but saves 4 bytes of > > > > > > + code on the x4 check. */ > > > > > > + orq $(VEC_SIZE - 1), %rdi > > > > > > +L(cross_page_continue): > > > > > > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > > > > > > since data is only aligned to VEC_SIZE. */ > > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > > - vpmovmskb %ymm1, %eax > > > > > > - testl %eax, %eax > > > > > > - jnz L(first_vec_x0) > > > > > > - > > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > > > > - vpmovmskb %ymm1, %eax > > > > > > +# ifdef USE_AS_STRNLEN > > > > > > + /* + 1 because rdi is aligned to VEC_SIZE - 1. 
+ CHAR_SIZE because > > > > > > + it simplies the logic in last_4x_vec_or_less. */ > > > > > > + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx > > > > > > + subq %rdx, %rcx > > > > > > +# endif > > > > > > + /* Load first VEC regardless. */ > > > > > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > > > > > +# ifdef USE_AS_STRNLEN > > > > > > + /* Adjust length. If near end handle specially. */ > > > > > > + subq %rcx, %rsi > > > > > > + jb L(last_4x_vec_or_less) > > > > > > +# endif > > > > > > + vpmovmskb %ymm1, %eax > > > > > > testl %eax, %eax > > > > > > jnz L(first_vec_x1) > > > > > > > > > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > > > > > - vpmovmskb %ymm1, %eax > > > > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > > > > + vpmovmskb %ymm1, %eax > > > > > > testl %eax, %eax > > > > > > jnz L(first_vec_x2) > > > > > > > > > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > > > > - vpmovmskb %ymm1, %eax > > > > > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > > > > > + vpmovmskb %ymm1, %eax > > > > > > testl %eax, %eax > > > > > > jnz L(first_vec_x3) > > > > > > > > > > > > - addq $(VEC_SIZE * 4), %rdi > > > > > > - > > > > > > -# ifdef USE_AS_STRNLEN > > > > > > - subq $(VEC_SIZE * 4), %rsi > > > > > > - jbe L(last_4x_vec_or_less) > > > > > > -# endif > > > > > > - > > > > > > - /* Align data to 4 * VEC_SIZE. */ > > > > > > - movq %rdi, %rcx > > > > > > - andl $(4 * VEC_SIZE - 1), %ecx > > > > > > - andq $-(4 * VEC_SIZE), %rdi > > > > > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > > > > > + vpmovmskb %ymm1, %eax > > > > > > + testl %eax, %eax > > > > > > + jnz L(first_vec_x4) > > > > > > > > > > > > + /* Align data to VEC_SIZE * 4 - 1. */ > > > > > > # ifdef USE_AS_STRNLEN > > > > > > - /* Adjust length. */ > > > > > > + /* Before adjusting length check if at last VEC_SIZE * 4. 
*/ > > > > > > + cmpq $(VEC_SIZE * 4 - 1), %rsi > > > > > > + jbe L(last_4x_vec_or_less_load) > > > > > > + incq %rdi > > > > > > + movl %edi, %ecx > > > > > > + orq $(VEC_SIZE * 4 - 1), %rdi > > > > > > + andl $(VEC_SIZE * 4 - 1), %ecx > > > > > > + /* Readjust length. */ > > > > > > addq %rcx, %rsi > > > > > > +# else > > > > > > + incq %rdi > > > > > > + orq $(VEC_SIZE * 4 - 1), %rdi > > > > > > # endif > > > > > > - > > > > > > + /* Compare 4 * VEC at a time forward. */ > > > > > > .p2align 4 > > > > > > L(loop_4x_vec): > > > > > > - /* Compare 4 * VEC at a time forward. */ > > > > > > - vmovdqa (%rdi), %ymm1 > > > > > > - vmovdqa VEC_SIZE(%rdi), %ymm2 > > > > > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > > > > > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > > > > > > - VPMINU %ymm1, %ymm2, %ymm5 > > > > > > - VPMINU %ymm3, %ymm4, %ymm6 > > > > > > - VPMINU %ymm5, %ymm6, %ymm5 > > > > > > - > > > > > > - VPCMPEQ %ymm5, %ymm0, %ymm5 > > > > > > - vpmovmskb %ymm5, %eax > > > > > > - testl %eax, %eax > > > > > > - jnz L(4x_vec_end) > > > > > > - > > > > > > - addq $(VEC_SIZE * 4), %rdi > > > > > > - > > > > > > -# ifndef USE_AS_STRNLEN > > > > > > - jmp L(loop_4x_vec) > > > > > > -# else > > > > > > +# ifdef USE_AS_STRNLEN > > > > > > + /* Break if at end of length. */ > > > > > > subq $(VEC_SIZE * 4), %rsi > > > > > > - ja L(loop_4x_vec) > > > > > > - > > > > > > -L(last_4x_vec_or_less): > > > > > > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > > > > > > - addl $(VEC_SIZE * 2), %esi > > > > > > - jle L(last_2x_vec) > > > > > > + jb L(last_4x_vec_or_less_cmpeq) > > > > > > +# endif > > > > > > + /* Save some code size by microfusing VPMINU with the load. Since > > > > > > + the matches in ymm2/ymm4 can only be returned if there where no > > > > > > + matches in ymm1/ymm3 respectively there is no issue with overlap. 
> > > > > > + */ > > > > > > + vmovdqa 1(%rdi), %ymm1 > > > > > > + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 > > > > > > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 > > > > > > + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 > > > > > > + > > > > > > + VPMINU %ymm2, %ymm4, %ymm5 > > > > > > + VPCMPEQ %ymm5, %ymm0, %ymm5 > > > > > > + vpmovmskb %ymm5, %ecx > > > > > > > > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > > - vpmovmskb %ymm1, %eax > > > > > > - testl %eax, %eax > > > > > > - jnz L(first_vec_x0) > > > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > > > + testl %ecx, %ecx > > > > > > + jz L(loop_4x_vec) > > > > > > > > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > > > > - vpmovmskb %ymm1, %eax > > > > > > - testl %eax, %eax > > > > > > - jnz L(first_vec_x1) > > > > > > > > > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > > > > > - vpmovmskb %ymm1, %eax > > > > > > + VPCMPEQ %ymm1, %ymm0, %ymm1 > > > > > > + vpmovmskb %ymm1, %eax > > > > > > + subq %rdx, %rdi > > > > > > testl %eax, %eax > > > > > > + jnz L(last_vec_return_x0) > > > > > > > > > > > > - jnz L(first_vec_x2_check) > > > > > > - subl $VEC_SIZE, %esi > > > > > > - jle L(max) > > > > > > - > > > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > > > > - vpmovmskb %ymm1, %eax > > > > > > + VPCMPEQ %ymm2, %ymm0, %ymm2 > > > > > > + vpmovmskb %ymm2, %eax > > > > > > testl %eax, %eax > > > > > > - > > > > > > - jnz L(first_vec_x3_check) > > > > > > - movq %r8, %rax > > > > > > -# ifdef USE_AS_WCSLEN > > > > > > + jnz L(last_vec_return_x1) > > > > > > + > > > > > > + /* Combine last 2 VEC. */ > > > > > > + VPCMPEQ %ymm3, %ymm0, %ymm3 > > > > > > + vpmovmskb %ymm3, %eax > > > > > > + /* rcx has combined result from all 4 VEC. It will only be used if > > > > > > + the first 3 other VEC all did not contain a match. 
*/ > > > > > > + salq $32, %rcx > > > > > > + orq %rcx, %rax > > > > > > + tzcntq %rax, %rax > > > > > > + subq $(VEC_SIZE * 2 - 1), %rdi > > > > > > + addq %rdi, %rax > > > > > > +# ifdef USE_AS_WCSLEN > > > > > > shrq $2, %rax > > > > > > -# endif > > > > > > +# endif > > > > > > VZEROUPPER_RETURN > > > > > > > > > > > > + > > > > > > +# ifdef USE_AS_STRNLEN > > > > > > .p2align 4 > > > > > > -L(last_2x_vec): > > > > > > - addl $(VEC_SIZE * 2), %esi > > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > > - vpmovmskb %ymm1, %eax > > > > > > - testl %eax, %eax > > > > > > +L(last_4x_vec_or_less_load): > > > > > > + /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ > > > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > > > +L(last_4x_vec_or_less_cmpeq): > > > > > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > > > > > +L(last_4x_vec_or_less): > > > > > > > > > > > > - jnz L(first_vec_x0_check) > > > > > > - subl $VEC_SIZE, %esi > > > > > > - jle L(max) > > > > > > + vpmovmskb %ymm1, %eax > > > > > > + /* If remaining length > VEC_SIZE * 2. This works if esi is off by > > > > > > + VEC_SIZE * 4. */ > > > > > > + testl $(VEC_SIZE * 2), %esi > > > > > > + jnz L(last_4x_vec) > > > > > > > > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > > > > - vpmovmskb %ymm1, %eax > > > > > > + /* length may have been negative or positive by an offset of > > > > > > + VEC_SIZE * 4 depending on where this was called from. This fixes > > > > > > + that. 
*/ > > > > > > + andl $(VEC_SIZE * 4 - 1), %esi > > > > > > testl %eax, %eax > > > > > > - jnz L(first_vec_x1_check) > > > > > > - movq %r8, %rax > > > > > > -# ifdef USE_AS_WCSLEN > > > > > > - shrq $2, %rax > > > > > > -# endif > > > > > > - VZEROUPPER_RETURN > > > > > > + jnz L(last_vec_x1_check) > > > > > > > > > > > > - .p2align 4 > > > > > > -L(first_vec_x0_check): > > > > > > + subl $VEC_SIZE, %esi > > > > > > + jb L(max) > > > > > > + > > > > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > > > > + vpmovmskb %ymm1, %eax > > > > > > tzcntl %eax, %eax > > > > > > /* Check the end of data. */ > > > > > > - cmpq %rax, %rsi > > > > > > - jbe L(max) > > > > > > + cmpl %eax, %esi > > > > > > + jb L(max) > > > > > > + subq %rdx, %rdi > > > > > > + addl $(VEC_SIZE + 1), %eax > > > > > > addq %rdi, %rax > > > > > > - subq %rdx, %rax > > > > > > # ifdef USE_AS_WCSLEN > > > > > > shrq $2, %rax > > > > > > # endif > > > > > > VZEROUPPER_RETURN > > > > > > +# endif > > > > > > > > > > > > .p2align 4 > > > > > > -L(first_vec_x1_check): > > > > > > +L(last_vec_return_x0): > > > > > > tzcntl %eax, %eax > > > > > > - /* Check the end of data. */ > > > > > > - cmpq %rax, %rsi > > > > > > - jbe L(max) > > > > > > - addq $VEC_SIZE, %rax > > > > > > + subq $(VEC_SIZE * 4 - 1), %rdi > > > > > > addq %rdi, %rax > > > > > > - subq %rdx, %rax > > > > > > -# ifdef USE_AS_WCSLEN > > > > > > +# ifdef USE_AS_WCSLEN > > > > > > shrq $2, %rax > > > > > > -# endif > > > > > > +# endif > > > > > > VZEROUPPER_RETURN > > > > > > > > > > > > .p2align 4 > > > > > > -L(first_vec_x2_check): > > > > > > +L(last_vec_return_x1): > > > > > > tzcntl %eax, %eax > > > > > > - /* Check the end of data. 
*/ > > > > > > - cmpq %rax, %rsi > > > > > > - jbe L(max) > > > > > > - addq $(VEC_SIZE * 2), %rax > > > > > > + subq $(VEC_SIZE * 3 - 1), %rdi > > > > > > addq %rdi, %rax > > > > > > - subq %rdx, %rax > > > > > > -# ifdef USE_AS_WCSLEN > > > > > > +# ifdef USE_AS_WCSLEN > > > > > > shrq $2, %rax > > > > > > -# endif > > > > > > +# endif > > > > > > VZEROUPPER_RETURN > > > > > > > > > > > > +# ifdef USE_AS_STRNLEN > > > > > > .p2align 4 > > > > > > -L(first_vec_x3_check): > > > > > > +L(last_vec_x1_check): > > > > > > + > > > > > > tzcntl %eax, %eax > > > > > > /* Check the end of data. */ > > > > > > - cmpq %rax, %rsi > > > > > > - jbe L(max) > > > > > > - addq $(VEC_SIZE * 3), %rax > > > > > > + cmpl %eax, %esi > > > > > > + jb L(max) > > > > > > + subq %rdx, %rdi > > > > > > + incl %eax > > > > > > addq %rdi, %rax > > > > > > - subq %rdx, %rax > > > > > > # ifdef USE_AS_WCSLEN > > > > > > shrq $2, %rax > > > > > > # endif > > > > > > VZEROUPPER_RETURN > > > > > > > > > > > > - .p2align 4 > > > > > > L(max): > > > > > > movq %r8, %rax > > > > > > + VZEROUPPER_RETURN > > > > > > + > > > > > > + .p2align 4 > > > > > > +L(last_4x_vec): > > > > > > + /* Test first 2x VEC normally. */ > > > > > > + testl %eax, %eax > > > > > > + jnz L(last_vec_x1) > > > > > > + > > > > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > > > > + vpmovmskb %ymm1, %eax > > > > > > + testl %eax, %eax > > > > > > + jnz L(last_vec_x2) > > > > > > + > > > > > > + /* Normalize length. */ > > > > > > + andl $(VEC_SIZE * 4 - 1), %esi > > > > > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > > > > > + vpmovmskb %ymm1, %eax > > > > > > + testl %eax, %eax > > > > > > + jnz L(last_vec_x3) > > > > > > + > > > > > > + subl $(VEC_SIZE * 3), %esi > > > > > > + jb L(max) > > > > > > + > > > > > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > > > > > + vpmovmskb %ymm1, %eax > > > > > > + tzcntl %eax, %eax > > > > > > + /* Check the end of data. 
*/ > > > > > > + cmpl %eax, %esi > > > > > > + jb L(max) > > > > > > + subq %rdx, %rdi > > > > > > + addl $(VEC_SIZE * 3 + 1), %eax > > > > > > + addq %rdi, %rax > > > > > > # ifdef USE_AS_WCSLEN > > > > > > shrq $2, %rax > > > > > > # endif > > > > > > VZEROUPPER_RETURN > > > > > > > > > > > > - .p2align 4 > > > > > > -L(zero): > > > > > > - xorl %eax, %eax > > > > > > - ret > > > > > > -# endif > > > > > > > > > > > > .p2align 4 > > > > > > -L(first_vec_x0): > > > > > > +L(last_vec_x1): > > > > > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > > > > > + instructions. */ > > > > > > tzcntl %eax, %eax > > > > > > + subq %rdx, %rdi > > > > > > + incl %eax > > > > > > addq %rdi, %rax > > > > > > - subq %rdx, %rax > > > > > > -# ifdef USE_AS_WCSLEN > > > > > > +# ifdef USE_AS_WCSLEN > > > > > > shrq $2, %rax > > > > > > -# endif > > > > > > +# endif > > > > > > VZEROUPPER_RETURN > > > > > > > > > > > > .p2align 4 > > > > > > -L(first_vec_x1): > > > > > > +L(last_vec_x2): > > > > > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > > > > > + instructions. */ > > > > > > tzcntl %eax, %eax > > > > > > - addq $VEC_SIZE, %rax > > > > > > + subq %rdx, %rdi > > > > > > + addl $(VEC_SIZE + 1), %eax > > > > > > addq %rdi, %rax > > > > > > - subq %rdx, %rax > > > > > > -# ifdef USE_AS_WCSLEN > > > > > > +# ifdef USE_AS_WCSLEN > > > > > > shrq $2, %rax > > > > > > -# endif > > > > > > +# endif > > > > > > VZEROUPPER_RETURN > > > > > > > > > > > > .p2align 4 > > > > > > -L(first_vec_x2): > > > > > > +L(last_vec_x3): > > > > > > tzcntl %eax, %eax > > > > > > - addq $(VEC_SIZE * 2), %rax > > > > > > + subl $(VEC_SIZE * 2), %esi > > > > > > + /* Check the end of data. 
*/ > > > > > > + cmpl %eax, %esi > > > > > > + jb L(max_end) > > > > > > + subq %rdx, %rdi > > > > > > + addl $(VEC_SIZE * 2 + 1), %eax > > > > > > addq %rdi, %rax > > > > > > - subq %rdx, %rax > > > > > > -# ifdef USE_AS_WCSLEN > > > > > > +# ifdef USE_AS_WCSLEN > > > > > > shrq $2, %rax > > > > > > -# endif > > > > > > +# endif > > > > > > + VZEROUPPER_RETURN > > > > > > +L(max_end): > > > > > > + movq %r8, %rax > > > > > > VZEROUPPER_RETURN > > > > > > +# endif > > > > > > > > > > > > + /* Cold case for crossing page with first load. */ > > > > > > .p2align 4 > > > > > > -L(4x_vec_end): > > > > > > - VPCMPEQ %ymm1, %ymm0, %ymm1 > > > > > > - vpmovmskb %ymm1, %eax > > > > > > - testl %eax, %eax > > > > > > - jnz L(first_vec_x0) > > > > > > - VPCMPEQ %ymm2, %ymm0, %ymm2 > > > > > > - vpmovmskb %ymm2, %eax > > > > > > +L(cross_page_boundary): > > > > > > + /* Align data to VEC_SIZE - 1. */ > > > > > > + orq $(VEC_SIZE - 1), %rdi > > > > > > + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 > > > > > > + vpmovmskb %ymm1, %eax > > > > > > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > > > > > > + so no need to manually mod rdx. */ > > > > > > + sarxl %edx, %eax, %eax > > > > > > > > > > This is a BMI2 instruction, which is not necessary available when AVX2 > > > > > is available. This causes SIGILL on some CPU. I have reported that in > > > > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611 > > > > > > > > This is not a bug on master as: > > > > > > > > commit 83c5b368226c34a2f0a5287df40fc290b2b34359 > > > > Author: H.J. Lu <hjl.tools@gmail.com> > > > > Date: Mon Apr 19 10:45:07 2021 -0700 > > > > > > > > x86-64: Require BMI2 for strchr-avx2.S > > > > > > > > is already in tree. The issue is the avx2 changes where backported > > > > w.o H.J's changes. 
> > > > > > > > > > Regards > > > > > Aurelien > > > > > > > > > > -- > > > > > Aurelien Jarno GPG: 4096R/1DDD8C9B > > > > > aurelien@aurel32.net http://www.aurel32.net ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-09-28 15:00 ` Noah Goldstein @ 2022-09-28 18:24 ` H.J. Lu 2022-09-30 13:19 ` FUCKETY FUCK FUCK FUCK - PLEASE FUCKING REMOVE ME> Darren Tristano 0 siblings, 1 reply; 24+ messages in thread From: H.J. Lu @ 2022-09-28 18:24 UTC (permalink / raw) To: Noah Goldstein; +Cc: Sunil Pandey, Libc-stable Mailing List, GNU C Library On Wed, Sep 28, 2022 at 8:00 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Wed, Sep 28, 2022 at 7:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Wed, Sep 28, 2022 at 7:42 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > > > Attached patch fixes BZ# 29611. > > > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > > if there is any objection. > > > The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S > > > > > > Can you post these as separate emails with the patches embedded instead of > > > attached? > > > > > > > > > > > Patches are also posted on bug report 29611. > > > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611 > > is there a mailing list for backport patches like this? It is libc-stable. -- H.J. ^ permalink raw reply [flat|nested] 24+ messages in thread
* FUCKETY FUCK FUCK FUCK - PLEASE FUCKING REMOVE ME> 2022-09-28 18:24 ` H.J. Lu @ 2022-09-30 13:19 ` Darren Tristano 0 siblings, 0 replies; 24+ messages in thread From: Darren Tristano @ 2022-09-30 13:19 UTC (permalink / raw) To: Noah Goldstein, H.J. Lu Cc: GNU C Library, Sunil Pandey, Libc-stable Mailing List [-- Attachment #1: Type: text/plain, Size: 1447 bytes --] FUCKETY FUCK FUCK FUCK - PLEASE FUCKING REMOVE ME> ________________________________ From: Libc-stable <libc-stable-bounces+darren=darrentristano.com@sourceware.org> on behalf of H.J. Lu via Libc-stable <libc-stable@sourceware.org> Sent: Wednesday, September 28, 2022 1:24 PM To: Noah Goldstein <goldstein.w.n@gmail.com> Cc: GNU C Library <libc-alpha@sourceware.org>; Sunil Pandey <skpgkp2@gmail.com>; Libc-stable Mailing List <libc-stable@sourceware.org> Subject: Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S On Wed, Sep 28, 2022 at 8:00 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Wed, Sep 28, 2022 at 7:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Wed, Sep 28, 2022 at 7:42 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > > > Attached patch fixes BZ# 29611. > > > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > > if there is any objection. > > > The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S > > > > > > Can you post these as separate emails with the patches embedded instead of > > > attached? > > > > > > > > > > > Patches are also posted on bug report 29611. > > > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611 > > is there a mailing list for backport patches like this? It is libc-stable. -- H.J. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-09-28 13:54 ` Sunil Pandey 2022-09-28 14:02 ` Darren Tristano 2022-09-28 14:42 ` Noah Goldstein @ 2022-09-28 18:23 ` H.J. Lu 2022-09-28 19:09 ` Sunil Pandey 2022-09-30 13:19 ` FUCKETY FUCK FUCK FUCK - PLEASE FUCKING REMOVE ME> Darren Tristano 2022-10-04 21:19 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Aurelien Jarno 3 siblings, 2 replies; 24+ messages in thread From: H.J. Lu @ 2022-09-28 18:23 UTC (permalink / raw) To: Sunil Pandey; +Cc: Noah Goldstein, Libc-stable Mailing List, GNU C Library On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > Attached patch fixes BZ# 29611. > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > if there is any objection. It doesn't fully fix BZ #29611. Like Noah mentioned, we need to add BMI2 check to ifunc-impl-list.c for all functions which use "ifunc-avx2.h". H.J. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-09-28 18:23 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S H.J. Lu @ 2022-09-28 19:09 ` Sunil Pandey 2022-09-28 19:23 ` H.J. Lu 2022-09-30 13:19 ` FUCKETY FUCK FUCK FUCK - PLEASE FUCKING REMOVE ME> Darren Tristano 1 sibling, 1 reply; 24+ messages in thread From: Sunil Pandey @ 2022-09-28 19:09 UTC (permalink / raw) To: H.J. Lu; +Cc: Noah Goldstein, Libc-stable Mailing List, GNU C Library On Wed, Sep 28, 2022 at 11:24 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > Attached patch fixes BZ# 29611. > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > if there is any objection. > > It doesn't fully fix BZ #29611. Like Noah mentioned, we need to add > BMI2 check to ifunc-impl-list.c for all functions which use "ifunc-avx2.h". > > > H.J. Pulling up corresponding patches are extremely difficult as they are not modular. I can modify existing patches (as posted on bug report) to incorporate ifunc-impl-list.c functionality. If it is OK? For backporting small incremental changes are preferred. Single monolithic patch makes backporting extremely difficult, if not impossible. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-09-28 19:09 ` Sunil Pandey @ 2022-09-28 19:23 ` H.J. Lu 0 siblings, 0 replies; 24+ messages in thread From: H.J. Lu @ 2022-09-28 19:23 UTC (permalink / raw) To: Sunil Pandey; +Cc: Noah Goldstein, Libc-stable Mailing List, GNU C Library On Wed, Sep 28, 2022 at 12:09 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Wed, Sep 28, 2022 at 11:24 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > Attached patch fixes BZ# 29611. > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > if there is any objection. > > > > It doesn't fully fix BZ #29611. Like Noah mentioned, we need to add > > BMI2 check to ifunc-impl-list.c for all functions which use "ifunc-avx2.h". > > > > > > H.J. > > Pulling up corresponding patches are extremely difficult as they are not > modular. I can modify existing patches (as posted on bug report) to > incorporate ifunc-impl-list.c functionality. If it is OK? Please mention BZ #29611 in the commit log of the backport and submit a separate patch to fully fix BZ #29611. We should use a patch set for BZ #29611. > For backporting small incremental changes are preferred. Single monolithic > patch makes backporting extremely difficult, if not impossible. -- H.J. ^ permalink raw reply [flat|nested] 24+ messages in thread
* FUCKETY FUCK FUCK FUCK - PLEASE FUCKING REMOVE ME> 2022-09-28 18:23 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S H.J. Lu 2022-09-28 19:09 ` Sunil Pandey @ 2022-09-30 13:19 ` Darren Tristano 1 sibling, 0 replies; 24+ messages in thread From: Darren Tristano @ 2022-09-30 13:19 UTC (permalink / raw) To: Sunil Pandey, H.J. Lu; +Cc: GNU C Library, Libc-stable Mailing List [-- Attachment #1: Type: text/plain, Size: 954 bytes --] FUCKETY FUCK FUCK FUCK - PLEASE FUCKING REMOVE ME> Darren Tristano, CEO FoodserviceResults T: (708) 228-1427 darrentristano.com ________________________________ From: Libc-stable <libc-stable-bounces+darren=darrentristano.com@sourceware.org> on behalf of H.J. Lu via Libc-stable <libc-stable@sourceware.org> Sent: Wednesday, September 28, 2022 1:23 PM To: Sunil Pandey <skpgkp2@gmail.com> Cc: GNU C Library <libc-alpha@sourceware.org>; Libc-stable Mailing List <libc-stable@sourceware.org> Subject: Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > Attached patch fixes BZ# 29611. > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > if there is any objection. It doesn't fully fix BZ #29611. Like Noah mentioned, we need to add BMI2 check to ifunc-impl-list.c for all functions which use "ifunc-avx2.h". H.J. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-09-28 13:54 ` Sunil Pandey ` (2 preceding siblings ...) 2022-09-28 18:23 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S H.J. Lu @ 2022-10-04 21:19 ` Aurelien Jarno 2022-10-04 21:29 ` H.J. Lu 2022-10-05 1:10 ` Sunil Pandey 3 siblings, 2 replies; 24+ messages in thread From: Aurelien Jarno @ 2022-10-04 21:19 UTC (permalink / raw) To: Sunil Pandey Cc: Noah Goldstein, Libc-stable Mailing List, Hongjiu Lu, GNU C Library On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > Attached patch fixes BZ# 29611. > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > if there is any objection. Sorry to be late on this. I have a few comments about that patch: > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > From: "H.J. Lu" <hjl.tools@gmail.com> > Date: Mon, 19 Apr 2021 10:45:07 -0700 > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > Since strchr-avx2.S updated by > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > Author: noah <goldstein.w.n@gmail.com> > Date: Wed Feb 3 00:38:59 2021 -0500 > > x86-64: Refactor and improve performance of strchr-avx2.S > > uses sarx: > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > ifunc-avx2.h. > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > --- > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > 2 files changed, 11 insertions(+), 5 deletions(-) First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got backported to 2.32 and older branches, and strchr-avx2.S in those branches do not use BMI2 instructions. So it doesn't make sense to backport it. 
That said the change in ifunc-avx2.h fixes: - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: Optimize memchr-avx2.S") - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: Optimize strlen-avx2.S") So the issues are fixed, but mostly by chance. NB: at this stage, I haven't verified the consistency of the ifunc selectors with ifunc-impl-list.c. -- Aurelien Jarno GPG: 4096R/1DDD8C9B aurelien@aurel32.net http://www.aurel32.net ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-10-04 21:19 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Aurelien Jarno @ 2022-10-04 21:29 ` H.J. Lu 2022-10-05 1:10 ` Sunil Pandey 1 sibling, 0 replies; 24+ messages in thread From: H.J. Lu @ 2022-10-04 21:29 UTC (permalink / raw) To: Sunil Pandey, Noah Goldstein, Libc-stable Mailing List, Hongjiu Lu, GNU C Library On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote: > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > > Attached patch fixes BZ# 29611. > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > if there is any objection. > > Sorry to be late on this. I have a few comments about that patch: > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > > From: "H.J. Lu" <hjl.tools@gmail.com> > > Date: Mon, 19 Apr 2021 10:45:07 -0700 > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > > > Since strchr-avx2.S updated by > > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > > Author: noah <goldstein.w.n@gmail.com> > > Date: Wed Feb 3 00:38:59 2021 -0500 > > > > x86-64: Refactor and improve performance of strchr-avx2.S > > > > uses sarx: > > > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > > ifunc-avx2.h. > > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > > --- > > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > > 2 files changed, 11 insertions(+), 5 deletions(-) > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got > backported to 2.32 and older branches, and strchr-avx2.S in those > branches do not use BMI2 instructions. So it doesn't make sense to > backport it. 
> > That said the change in ifunc-avx2.h fixes: > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: > Optimize memchr-avx2.S") > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: > Optimize strlen-avx2.S") > > So the issues are fixed, but mostly by chance. > > NB: at this stage, I haven't verified the consistency of the ifunc > selectors with ifunc-impl-list.c. > Changes to ifunc-impl-list.c aren't strictly needed since strchr functions don't use BMI2. AVX2 strchr functions are still tested on machines with AVX2 and BMI2. -- H.J. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-10-04 21:19 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Aurelien Jarno 2022-10-04 21:29 ` H.J. Lu @ 2022-10-05 1:10 ` Sunil Pandey 2022-10-05 14:23 ` Noah Goldstein 2022-10-05 17:11 ` Aurelien Jarno 1 sibling, 2 replies; 24+ messages in thread From: Sunil Pandey @ 2022-10-05 1:10 UTC (permalink / raw) To: Sunil Pandey, Noah Goldstein, Libc-stable Mailing List, Hongjiu Lu, GNU C Library On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote: > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > > Attached patch fixes BZ# 29611. > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > if there is any objection. > > Sorry to be late on this. I have a few comments about that patch: > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > > From: "H.J. Lu" <hjl.tools@gmail.com> > > Date: Mon, 19 Apr 2021 10:45:07 -0700 > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > > > Since strchr-avx2.S updated by > > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > > Author: noah <goldstein.w.n@gmail.com> > > Date: Wed Feb 3 00:38:59 2021 -0500 > > > > x86-64: Refactor and improve performance of strchr-avx2.S > > > > uses sarx: > > > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > > ifunc-avx2.h. > > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > > --- > > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > > 2 files changed, 11 insertions(+), 5 deletions(-) > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got > backported to 2.32 and older branches, and strchr-avx2.S in those > branches do not use BMI2 instructions. So it doesn't make sense to > backport it. 
> > That said the change in ifunc-avx2.h fixes: > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: > Optimize memchr-avx2.S") > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: > Optimize strlen-avx2.S") > > So the issues are fixed, but mostly by chance. How do you know it is a "by chance" fix, do you have any evidence to back your claim? > > NB: at this stage, I haven't verified the consistency of the ifunc > selectors with ifunc-impl-list.c. > > -- > Aurelien Jarno GPG: 4096R/1DDD8C9B > aurelien@aurel32.net http://www.aurel32.net ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-10-05 1:10 ` Sunil Pandey @ 2022-10-05 14:23 ` Noah Goldstein 2022-10-05 16:35 ` Sunil Pandey 2022-10-05 17:11 ` Aurelien Jarno 1 sibling, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2022-10-05 14:23 UTC (permalink / raw) To: Sunil Pandey; +Cc: Libc-stable Mailing List, Hongjiu Lu, GNU C Library On Tue, Oct 4, 2022 at 6:11 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > > > Attached patch fixes BZ# 29611. > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > if there is any objection. > > > > Sorry to be late on this. I have a few comments about that patch: > > > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > > > From: "H.J. Lu" <hjl.tools@gmail.com> > > > Date: Mon, 19 Apr 2021 10:45:07 -0700 > > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > > > > > Since strchr-avx2.S updated by > > > > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > > > Author: noah <goldstein.w.n@gmail.com> > > > Date: Wed Feb 3 00:38:59 2021 -0500 > > > > > > x86-64: Refactor and improve performance of strchr-avx2.S > > > > > > uses sarx: > > > > > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > > > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > > > ifunc-avx2.h. > > > > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > > > --- > > > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > > > 2 files changed, 11 insertions(+), 5 deletions(-) > > > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got > > backported to 2.32 and older branches, and strchr-avx2.S in those > > branches do not use BMI2 instructions. So it doesn't make sense to > > backport it. 
> > > > That said the change in ifunc-avx2.h fixes: > > > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: > > Optimize memchr-avx2.S") > > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: > > Optimize strlen-avx2.S") > > > > So the issues are fixed, but mostly by chance. > > How do you know it is a "by chance" fix, do you have any evidence to back > your claim? There might not be evidence about the intention of the authors but clearly the strchr commit message does not clarify that it also fixes memchr/strlen. > > > > > NB: at this stage, I haven't verified the consistency of the ifunc > > selectors with ifunc-impl-list.c. > > > > -- > > Aurelien Jarno GPG: 4096R/1DDD8C9B > > aurelien@aurel32.net http://www.aurel32.net ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-10-05 14:23 ` Noah Goldstein @ 2022-10-05 16:35 ` Sunil Pandey 0 siblings, 0 replies; 24+ messages in thread From: Sunil Pandey @ 2022-10-05 16:35 UTC (permalink / raw) To: Noah Goldstein; +Cc: Libc-stable Mailing List, Hongjiu Lu, GNU C Library On Wed, Oct 5, 2022 at 7:23 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Tue, Oct 4, 2022 at 6:11 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > > > > Attached patch fixes BZ# 29611. > > > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > > if there is any objection. > > > > > > Sorry to be late on this. I have a few comments about that patch: > > > > > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > > > > From: "H.J. Lu" <hjl.tools@gmail.com> > > > > Date: Mon, 19 Apr 2021 10:45:07 -0700 > > > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > > > > > > > Since strchr-avx2.S updated by > > > > > > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > > > > Author: noah <goldstein.w.n@gmail.com> > > > > Date: Wed Feb 3 00:38:59 2021 -0500 > > > > > > > > x86-64: Refactor and improve performance of strchr-avx2.S > > > > > > > > uses sarx: > > > > > > > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > > > > > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > > > > ifunc-avx2.h. 
> > > > > > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > > > > --- > > > > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > > > > 2 files changed, 11 insertions(+), 5 deletions(-) > > > > > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got > > > backported to 2.32 and older branches, and strchr-avx2.S in those > > > branches do not use BMI2 instructions. So it doesn't make sense to > > > backport it. > > > > > > That said the change in ifunc-avx2.h fixes: > > > > > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: > > > Optimize memchr-avx2.S") > > > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: > > > Optimize strlen-avx2.S") > > > > > > So the issues are fixed, but mostly by chance. > > > > How do you know it is a "by chance" fix, do you have any evidence to back > > your claim? > > There might not be evidence about the intention of the authors but clearly > the strchr commit message does not clarify that it also fixes memchr/strlen. ifunc-avx2.h header file is used in many functions, so fix in ifunc-avx2.h fixes all those functions too. It's not "by chance", I scan all the functions where ifunc-avx2.h are used before backporting it. Since this is a backport commit and no extra changes are made, there is no need to modify the original author commit message. > > > > > > > > NB: at this stage, I haven't verified the consistency of the ifunc > > > selectors with ifunc-impl-list.c. > > > > > > -- > > > Aurelien Jarno GPG: 4096R/1DDD8C9B > > > aurelien@aurel32.net http://www.aurel32.net ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-10-05 1:10 ` Sunil Pandey 2022-10-05 14:23 ` Noah Goldstein @ 2022-10-05 17:11 ` Aurelien Jarno 2022-10-05 18:34 ` Sunil Pandey 1 sibling, 1 reply; 24+ messages in thread From: Aurelien Jarno @ 2022-10-05 17:11 UTC (permalink / raw) To: Sunil Pandey Cc: Noah Goldstein, Libc-stable Mailing List, Hongjiu Lu, GNU C Library On 2022-10-04 18:10, Sunil Pandey via Libc-alpha wrote: > On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > > > Attached patch fixes BZ# 29611. > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > if there is any objection. > > > > Sorry to be late on this. I have a few comments about that patch: > > > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > > > From: "H.J. Lu" <hjl.tools@gmail.com> > > > Date: Mon, 19 Apr 2021 10:45:07 -0700 > > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > > > > > Since strchr-avx2.S updated by > > > > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > > > Author: noah <goldstein.w.n@gmail.com> > > > Date: Wed Feb 3 00:38:59 2021 -0500 > > > > > > x86-64: Refactor and improve performance of strchr-avx2.S > > > > > > uses sarx: > > > > > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > > > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > > > ifunc-avx2.h. > > > > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > > > --- > > > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > > > 2 files changed, 11 insertions(+), 5 deletions(-) > > > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got > > backported to 2.32 and older branches, and strchr-avx2.S in those > > branches do not use BMI2 instructions. So it doesn't make sense to > > backport it. 
> > > > That said the change in ifunc-avx2.h fixes: > > > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: > > Optimize memchr-avx2.S") > > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: > > Optimize strlen-avx2.S") > > > > So the issues are fixed, but mostly by chance. > > How do you know it is a "by chance" fix, do you have any evidence to back > your claim? My point is that the commit that has been backported is fixing a bug that doesn't exist in 2.32 branches. strchr-avx2.S does not use the sarx instruction as the commit claims, and does not use other BMI2 instructions either. However following the backport of commit acfd088a1963 and aaa23c350715 in these branches, memchr-avx2.S and strlen-avx2.S use BMI2 instructions, and as they use ifunc-avx2.h, this actually fixes the bug. -- Aurelien Jarno GPG: 4096R/1DDD8C9B aurelien@aurel32.net http://www.aurel32.net ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S 2022-10-05 17:11 ` Aurelien Jarno @ 2022-10-05 18:34 ` Sunil Pandey 0 siblings, 0 replies; 24+ messages in thread From: Sunil Pandey @ 2022-10-05 18:34 UTC (permalink / raw) To: Sunil Pandey, Noah Goldstein, Libc-stable Mailing List, Hongjiu Lu, GNU C Library On Wed, Oct 5, 2022 at 10:11 AM Aurelien Jarno <aurelien@aurel32.net> wrote: > > On 2022-10-04 18:10, Sunil Pandey via Libc-alpha wrote: > > On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > > > > Attached patch fixes BZ# 29611. > > > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > > if there is any objection. > > > > > > Sorry to be late on this. I have a few comments about that patch: > > > > > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > > > > From: "H.J. Lu" <hjl.tools@gmail.com> > > > > Date: Mon, 19 Apr 2021 10:45:07 -0700 > > > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > > > > > > > Since strchr-avx2.S updated by > > > > > > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > > > > Author: noah <goldstein.w.n@gmail.com> > > > > Date: Wed Feb 3 00:38:59 2021 -0500 > > > > > > > > x86-64: Refactor and improve performance of strchr-avx2.S > > > > > > > > uses sarx: > > > > > > > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > > > > > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > > > > ifunc-avx2.h. 
> > > > > > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > > > > --- > > > > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > > > > 2 files changed, 11 insertions(+), 5 deletions(-) > > > > > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got > > > backported to 2.32 and older branches, and strchr-avx2.S in those > > > branches do not use BMI2 instructions. So it doesn't make sense to > > > backport it. > > > > > > That said the change in ifunc-avx2.h fixes: > > > > > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: > > > Optimize memchr-avx2.S") > > > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: > > > Optimize strlen-avx2.S") > > > > > > So the issues are fixed, but mostly by chance. > > > > How do you know it is a "by chance" fix, do you have any evidence to back > > your claim? > > My point is that the commit that has been backported is fixing a bug > that doesn't exist in 2.32 branches. strchr-avx2.S does not the sarx > instruction as the commit claims, and does not use other BMI2 > instructions either. > > However following the backport of commit acfd088a1963 and aaa23c350715 > in these branches, memchr-avx2.S and strlen-avx2.S use BMI2 > instructions, and as they use ifunc-avx2.h, this actually fixes the bug. > This patch got selected because it fixes the ifunc-avx2.h file. My preference is to take an existing patch if possible, rather than creating a new one for branches. You are right, the original patch should have been composed differently to make it crystal clear. For backporting it's preferable to have small independent patches with logical grouping. > -- > Aurelien Jarno GPG: 4096R/1DDD8C9B > aurelien@aurel32.net http://www.aurel32.net ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 1/2] x86: Optimize strlen-evex.S 2021-04-19 23:36 [PATCH v5 1/2] x86: Optimize strlen-evex.S Noah Goldstein 2021-04-19 23:36 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Noah Goldstein @ 2021-04-20 1:01 ` H.J. Lu 1 sibling, 0 replies; 24+ messages in thread From: H.J. Lu @ 2021-04-20 1:01 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, carlos On Mon, Apr 19, 2021 at 4:36 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > No bug. This commit optimizes strlen-evex.S. The > optimizations are mostly small things but they add up to roughly > 10-30% performance improvement for strlen. The results for strnlen are > bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and > test-wcsnlen are all passing. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > --- > sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++----------- > 1 file changed, 317 insertions(+), 264 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S > index 0583819078..4bf6874b82 100644 > --- a/sysdeps/x86_64/multiarch/strlen-evex.S > +++ b/sysdeps/x86_64/multiarch/strlen-evex.S > @@ -29,11 +29,13 @@ > # ifdef USE_AS_WCSLEN > # define VPCMP vpcmpd > # define VPMINU vpminud > -# define SHIFT_REG r9d > +# define SHIFT_REG ecx > +# define CHAR_SIZE 4 > # else > # define VPCMP vpcmpb > # define VPMINU vpminub > -# define SHIFT_REG ecx > +# define SHIFT_REG edx > +# define CHAR_SIZE 1 > # endif > > # define XMMZERO xmm16 > @@ -46,132 +48,165 @@ > # define YMM6 ymm22 > > # define VEC_SIZE 32 > +# define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > .section .text.evex,"ax",@progbits > ENTRY (STRLEN) > # ifdef USE_AS_STRNLEN > - /* Check for zero length. */ > + /* Check zero length. */ > test %RSI_LP, %RSI_LP > jz L(zero) > -# ifdef USE_AS_WCSLEN > - shl $2, %RSI_LP > -# elif defined __ILP32__ > +# ifdef __ILP32__ > /* Clear the upper 32 bits. 
*/ > movl %esi, %esi > # endif > mov %RSI_LP, %R8_LP > # endif > - movl %edi, %ecx > - movq %rdi, %rdx > + movl %edi, %eax > vpxorq %XMMZERO, %XMMZERO, %XMMZERO > - > + /* Clear high bits from edi. Only keeping bits relevant to page > + cross check. */ > + andl $(PAGE_SIZE - 1), %eax > /* Check if we may cross page boundary with one vector load. */ > - andl $(2 * VEC_SIZE - 1), %ecx > - cmpl $VEC_SIZE, %ecx > - ja L(cros_page_boundary) > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(cross_page_boundary) > > /* Check the first VEC_SIZE bytes. Each bit in K0 represents a > null byte. */ > VPCMP $0, (%rdi), %YMMZERO, %k0 > kmovd %k0, %eax > - testl %eax, %eax > - > # ifdef USE_AS_STRNLEN > - jnz L(first_vec_x0_check) > - /* Adjust length and check the end of data. */ > - subq $VEC_SIZE, %rsi > - jbe L(max) > -# else > - jnz L(first_vec_x0) > + /* If length < CHAR_PER_VEC handle special. */ > + cmpq $CHAR_PER_VEC, %rsi > + jbe L(first_vec_x0) > # endif > - > - /* Align data for aligned loads in the loop. */ > - addq $VEC_SIZE, %rdi > - andl $(VEC_SIZE - 1), %ecx > - andq $-VEC_SIZE, %rdi > - > + testl %eax, %eax > + jz L(aligned_more) > + tzcntl %eax, %eax > + ret > # ifdef USE_AS_STRNLEN > - /* Adjust length. */ > - addq %rcx, %rsi > +L(zero): > + xorl %eax, %eax > + ret > > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > + .p2align 4 > +L(first_vec_x0): > + /* Set bit for max len so that tzcnt will return min of max len > + and position of first match. */ > + btsq %rsi, %rax > + tzcntl %eax, %eax > + ret > # endif > - jmp L(more_4x_vec) > > .p2align 4 > -L(cros_page_boundary): > - andl $(VEC_SIZE - 1), %ecx > - andq $-VEC_SIZE, %rdi > - > -# ifdef USE_AS_WCSLEN > - /* NB: Divide shift count by 4 since each bit in K0 represent 4 > - bytes. */ > - movl %ecx, %SHIFT_REG > - sarl $2, %SHIFT_REG > +L(first_vec_x1): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. 
*/ > +# ifdef USE_AS_STRNLEN > + /* Use ecx which was computed earlier to compute correct value. > + */ > + leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax > +# else > + subl %edx, %edi > +# ifdef USE_AS_WCSLEN > + /* NB: Divide bytes by 4 to get the wchar_t count. */ > + sarl $2, %edi > +# endif > + leal CHAR_PER_VEC(%rdi, %rax), %eax > # endif > - VPCMP $0, (%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > + ret > > - /* Remove the leading bytes. */ > - sarxl %SHIFT_REG, %eax, %eax > - testl %eax, %eax > - jz L(aligned_more) > + .p2align 4 > +L(first_vec_x2): > tzcntl %eax, %eax > -# ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > -# endif > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > -# endif > - addq %rdi, %rax > - addq %rcx, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > + /* Use ecx which was computed earlier to compute correct value. > + */ > + leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax > +# else > + subl %edx, %edi > +# ifdef USE_AS_WCSLEN > + /* NB: Divide bytes by 4 to get the wchar_t count. */ > + sarl $2, %edi > +# endif > + leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax > # endif > ret > > .p2align 4 > -L(aligned_more): > +L(first_vec_x3): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > - to void possible addition overflow. */ > - negq %rcx > - addq $VEC_SIZE, %rcx > - > - /* Check the end of data. */ > - subq %rcx, %rsi > - jbe L(max) > + /* Use ecx which was computed earlier to compute correct value. 
> + */ > + leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax > +# else > + subl %edx, %edi > +# ifdef USE_AS_WCSLEN > + /* NB: Divide bytes by 4 to get the wchar_t count. */ > + sarl $2, %edi > +# endif > + leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax > # endif > + ret > > - addq $VEC_SIZE, %rdi > - > + .p2align 4 > +L(first_vec_x4): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > + /* Use ecx which was computed earlier to compute correct value. > + */ > + leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax > +# else > + subl %edx, %edi > +# ifdef USE_AS_WCSLEN > + /* NB: Divide bytes by 4 to get the wchar_t count. */ > + sarl $2, %edi > +# endif > + leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax > # endif > + ret > > -L(more_4x_vec): > + .p2align 5 > +L(aligned_more): > + movq %rdi, %rdx > + /* Align data to VEC_SIZE. */ > + andq $-(VEC_SIZE), %rdi > +L(cross_page_continue): > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > since data is only aligned to VEC_SIZE. */ > - VPCMP $0, (%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - > +# ifdef USE_AS_STRNLEN > + /* + CHAR_SIZE because it simplies the logic in > + last_4x_vec_or_less. */ > + leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx > + subq %rdx, %rcx > +# ifdef USE_AS_WCSLEN > + /* NB: Divide bytes by 4 to get the wchar_t count. */ > + sarl $2, %ecx > +# endif > +# endif > + /* Load first VEC regardless. */ > VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 > +# ifdef USE_AS_STRNLEN > + /* Adjust length. If near end handle specially. 
*/ > + subq %rcx, %rsi > + jb L(last_4x_vec_or_less) > +# endif > kmovd %k0, %eax > testl %eax, %eax > jnz L(first_vec_x1) > > VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 > kmovd %k0, %eax > - testl %eax, %eax > + test %eax, %eax > jnz L(first_vec_x2) > > VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 > @@ -179,258 +214,276 @@ L(more_4x_vec): > testl %eax, %eax > jnz L(first_vec_x3) > > - addq $(VEC_SIZE * 4), %rdi > - > -# ifdef USE_AS_STRNLEN > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > -# endif > - > - /* Align data to 4 * VEC_SIZE. */ > - movq %rdi, %rcx > - andl $(4 * VEC_SIZE - 1), %ecx > - andq $-(4 * VEC_SIZE), %rdi > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 > + kmovd %k0, %eax > + testl %eax, %eax > + jnz L(first_vec_x4) > > + addq $VEC_SIZE, %rdi > # ifdef USE_AS_STRNLEN > - /* Adjust length. */ > + /* Check if at last VEC_SIZE * 4 length. */ > + cmpq $(CHAR_PER_VEC * 4 - 1), %rsi > + jbe L(last_4x_vec_or_less_load) > + movl %edi, %ecx > + andl $(VEC_SIZE * 4 - 1), %ecx > +# ifdef USE_AS_WCSLEN > + /* NB: Divide bytes by 4 to get the wchar_t count. */ > + sarl $2, %ecx > +# endif > + /* Readjust length. */ > addq %rcx, %rsi > # endif > + /* Align data to VEC_SIZE * 4. */ > + andq $-(VEC_SIZE * 4), %rdi > > + /* Compare 4 * VEC at a time forward. */ > .p2align 4 > L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > - VMOVA (%rdi), %YMM1 > - VMOVA VEC_SIZE(%rdi), %YMM2 > - VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 > - VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 > - > - VPMINU %YMM1, %YMM2, %YMM5 > - VPMINU %YMM3, %YMM4, %YMM6 > + /* Load first VEC regardless. */ > + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > +# ifdef USE_AS_STRNLEN > + /* Break if at end of length. */ > + subq $(CHAR_PER_VEC * 4), %rsi > + jb L(last_4x_vec_or_less_cmpeq) > +# endif > + /* Save some code size by microfusing VPMINU with the load. 
Since > + the matches in ymm2/ymm4 can only be returned if there where no > + matches in ymm1/ymm3 respectively there is no issue with overlap. > + */ > + VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2 > + VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 > + VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4 > + > + VPCMP $0, %YMM2, %YMMZERO, %k0 > + VPCMP $0, %YMM4, %YMMZERO, %k1 > + subq $-(VEC_SIZE * 4), %rdi > + kortestd %k0, %k1 > + jz L(loop_4x_vec) > + > + /* Check if end was in first half. */ > + kmovd %k0, %eax > + subq %rdx, %rdi > +# ifdef USE_AS_WCSLEN > + shrq $2, %rdi > +# endif > + testl %eax, %eax > + jz L(second_vec_return) > > - VPMINU %YMM5, %YMM6, %YMM5 > - VPCMP $0, %YMM5, %YMMZERO, %k0 > - ktestd %k0, %k0 > - jnz L(4x_vec_end) > + VPCMP $0, %YMM1, %YMMZERO, %k2 > + kmovd %k2, %edx > + /* Combine VEC1 matches (edx) with VEC2 matches (eax). */ > +# ifdef USE_AS_WCSLEN > + sall $CHAR_PER_VEC, %eax > + orl %edx, %eax > + tzcntl %eax, %eax > +# else > + salq $CHAR_PER_VEC, %rax > + orq %rdx, %rax > + tzcntq %rax, %rax > +# endif > + addq %rdi, %rax > + ret > > - addq $(VEC_SIZE * 4), %rdi > > -# ifndef USE_AS_STRNLEN > - jmp L(loop_4x_vec) > -# else > - subq $(VEC_SIZE * 4), %rsi > - ja L(loop_4x_vec) > +# ifdef USE_AS_STRNLEN > > +L(last_4x_vec_or_less_load): > + /* Depending on entry adjust rdi / prepare first VEC in YMM1. */ > + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > +L(last_4x_vec_or_less_cmpeq): > + VPCMP $0, %YMM1, %YMMZERO, %k0 > + addq $(VEC_SIZE * 3), %rdi > L(last_4x_vec_or_less): > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > - addl $(VEC_SIZE * 2), %esi > - jle L(last_2x_vec) > - > - VPCMP $0, (%rdi), %YMMZERO, %k0 > kmovd %k0, %eax > + /* If remaining length > VEC_SIZE * 2. This works if esi is off by > + VEC_SIZE * 4. */ > + testl $(CHAR_PER_VEC * 2), %esi > + jnz L(last_4x_vec) > + > + /* length may have been negative or positive by an offset of > + CHAR_PER_VEC * 4 depending on where this was called from. This > + fixes that. 
*/ > + andl $(CHAR_PER_VEC * 4 - 1), %esi > testl %eax, %eax > - jnz L(first_vec_x0) > + jnz L(last_vec_x1_check) > > - VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x1) > + /* Check the end of data. */ > + subl $CHAR_PER_VEC, %esi > + jb L(max) > > VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 > kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x2_check) > - subl $VEC_SIZE, %esi > - jle L(max) > + tzcntl %eax, %eax > + /* Check the end of data. */ > + cmpl %eax, %esi > + jb L(max) > > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x3_check) > + subq %rdx, %rdi > +# ifdef USE_AS_WCSLEN > + /* NB: Divide bytes by 4 to get the wchar_t count. */ > + sarq $2, %rdi > +# endif > + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax > + ret > +L(max): > movq %r8, %rax > + ret > +# endif > + > + /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less) > + in the 4x VEC loop can use 2 byte encoding. */ > + .p2align 4 > +L(second_vec_return): > + VPCMP $0, %YMM3, %YMMZERO, %k0 > + /* Combine YMM3 matches (k0) with YMM4 matches (k1). */ > +# ifdef USE_AS_WCSLEN > + kunpckbw %k0, %k1, %k0 > + kmovd %k0, %eax > + tzcntl %eax, %eax > +# else > + kunpckdq %k0, %k1, %k0 > + kmovq %k0, %rax > + tzcntq %rax, %rax > +# endif > + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax > + ret > + > + > +# ifdef USE_AS_STRNLEN > +L(last_vec_x1_check): > + tzcntl %eax, %eax > + /* Check the end of data. */ > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + /* NB: Divide bytes by 4 to get the wchar_t count. */ > + sarq $2, %rdi > # endif > + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax > ret > > .p2align 4 > -L(last_2x_vec): > - addl $(VEC_SIZE * 2), %esi > +L(last_4x_vec): > + /* Test first 2x VEC normally. 
*/ > + testl %eax, %eax > + jnz L(last_vec_x1) > > - VPCMP $0, (%rdi), %YMMZERO, %k0 > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 > kmovd %k0, %eax > testl %eax, %eax > - jnz L(first_vec_x0_check) > - subl $VEC_SIZE, %esi > - jle L(max) > + jnz L(last_vec_x2) > > - VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 > + /* Normalize length. */ > + andl $(CHAR_PER_VEC * 4 - 1), %esi > + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 > kmovd %k0, %eax > testl %eax, %eax > - jnz L(first_vec_x1_check) > - movq %r8, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > - ret > + jnz L(last_vec_x3) > > - .p2align 4 > -L(first_vec_x0_check): > + /* Check the end of data. */ > + subl $(CHAR_PER_VEC * 3), %esi > + jb L(max) > + > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 > + kmovd %k0, %eax > tzcntl %eax, %eax > -# ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > -# endif > /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq %rdi, %rax > - subq %rdx, %rax > + cmpl %eax, %esi > + jb L(max_end) > + > + subq %rdx, %rdi > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + /* NB: Divide bytes by 4 to get the wchar_t count. */ > + sarq $2, %rdi > # endif > + leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax > ret > > .p2align 4 > -L(first_vec_x1_check): > +L(last_vec_x1): > tzcntl %eax, %eax > + subq %rdx, %rdi > # ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > -# endif > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $VEC_SIZE, %rax > - addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > + /* NB: Divide bytes by 4 to get the wchar_t count. 
*/ > + sarq $2, %rdi > # endif > + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax > ret > > .p2align 4 > -L(first_vec_x2_check): > +L(last_vec_x2): > tzcntl %eax, %eax > + subq %rdx, %rdi > # ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > -# endif > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $(VEC_SIZE * 2), %rax > - addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > + /* NB: Divide bytes by 4 to get the wchar_t count. */ > + sarq $2, %rdi > # endif > + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax > ret > > .p2align 4 > -L(first_vec_x3_check): > +L(last_vec_x3): > tzcntl %eax, %eax > -# ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > -# endif > + subl $(CHAR_PER_VEC * 2), %esi > /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $(VEC_SIZE * 3), %rax > - addq %rdi, %rax > - subq %rdx, %rax > + cmpl %eax, %esi > + jb L(max_end) > + subq %rdx, %rdi > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + /* NB: Divide bytes by 4 to get the wchar_t count. */ > + sarq $2, %rdi > # endif > + leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax > ret > - > - .p2align 4 > -L(max): > +L(max_end): > movq %r8, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > - ret > - > - .p2align 4 > -L(zero): > - xorl %eax, %eax > ret > # endif > > + /* Cold case for crossing page with first load. */ > .p2align 4 > -L(first_vec_x0): > - tzcntl %eax, %eax > -# ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > -# endif > - addq %rdi, %rax > - subq %rdx, %rax > +L(cross_page_boundary): > + movq %rdi, %rdx > + /* Align data to VEC_SIZE. */ > + andq $-VEC_SIZE, %rdi > + VPCMP $0, (%rdi), %YMMZERO, %k0 > + kmovd %k0, %eax > + /* Remove the leading bytes. 
*/ > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + /* NB: Divide shift count by 4 since each bit in K0 represent 4 > + bytes. */ > + movl %edx, %ecx > + shrl $2, %ecx > + andl $(CHAR_PER_VEC - 1), %ecx > # endif > - ret > - > - .p2align 4 > -L(first_vec_x1): > + /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */ > + sarxl %SHIFT_REG, %eax, %eax > + testl %eax, %eax > +# ifndef USE_AS_STRNLEN > + jz L(cross_page_continue) > tzcntl %eax, %eax > -# ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > -# endif > - addq $VEC_SIZE, %rax > - addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > ret > - > - .p2align 4 > -L(first_vec_x2): > - tzcntl %eax, %eax > -# ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - sall $2, %eax > -# endif > - addq $(VEC_SIZE * 2), %rax > - addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > +# else > + jnz L(cross_page_less_vec) > +# ifndef USE_AS_WCSLEN > + movl %edx, %ecx > + andl $(CHAR_PER_VEC - 1), %ecx > +# endif > + movl $CHAR_PER_VEC, %eax > + subl %ecx, %eax > + /* Check the end of data. */ > + cmpq %rax, %rsi > + ja L(cross_page_continue) > + movl %esi, %eax > ret > - > - .p2align 4 > -L(4x_vec_end): > - VPCMP $0, %YMM1, %YMMZERO, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - VPCMP $0, %YMM2, %YMMZERO, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > - jnz L(first_vec_x1) > - VPCMP $0, %YMM3, %YMMZERO, %k2 > - kmovd %k2, %eax > - testl %eax, %eax > - jnz L(first_vec_x2) > - VPCMP $0, %YMM4, %YMMZERO, %k3 > - kmovd %k3, %eax > -L(first_vec_x3): > +L(cross_page_less_vec): > tzcntl %eax, %eax > -# ifdef USE_AS_WCSLEN > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ > - sall $2, %eax > -# endif > - addq $(VEC_SIZE * 3), %rax > - addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > + /* Select min of length and position of first null. */ > + cmpq %rax, %rsi > + cmovb %esi, %eax > ret > +# endif > > END (STRLEN) > #endif > -- > 2.29.2 > LGTM. I am checking it in for you. Thanks. -- H.J. ^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2022-10-05 18:34 UTC | newest] Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2021-04-19 23:36 [PATCH v5 1/2] x86: Optimize strlen-evex.S Noah Goldstein 2021-04-19 23:36 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Noah Goldstein 2021-04-20 1:01 ` H.J. Lu 2022-09-25 8:19 ` Aurelien Jarno 2022-09-25 14:00 ` Noah Goldstein 2022-09-28 13:54 ` Sunil Pandey 2022-09-28 14:02 ` Darren Tristano 2022-09-28 14:42 ` Noah Goldstein 2022-09-28 14:54 ` Sunil Pandey 2022-09-28 15:00 ` Noah Goldstein 2022-09-28 18:24 ` H.J. Lu 2022-09-30 13:19 ` FUCKETY FUCK FUCK FUCK - PLEASE FUCKING REMOVE ME> Darren Tristano 2022-09-28 18:23 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S H.J. Lu 2022-09-28 19:09 ` Sunil Pandey 2022-09-28 19:23 ` H.J. Lu 2022-09-30 13:19 ` FUCKETY FUCK FUCK FUCK - PLEASE FUCKING REMOVE ME> Darren Tristano 2022-10-04 21:19 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Aurelien Jarno 2022-10-04 21:29 ` H.J. Lu 2022-10-05 1:10 ` Sunil Pandey 2022-10-05 14:23 ` Noah Goldstein 2022-10-05 16:35 ` Sunil Pandey 2022-10-05 17:11 ` Aurelien Jarno 2022-10-05 18:34 ` Sunil Pandey 2021-04-20 1:01 ` [PATCH v5 1/2] x86: Optimize strlen-evex.S H.J. Lu
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).