On Fri, Apr 15, 2022 at 10:34 AM H.J. Lu via Libc-alpha wrote: > > On Fri, Apr 15, 2022 at 10:28 AM Noah Goldstein wrote: > > > > Old code was both inefficient and wasted code size. New code (-62 > > bytes) and comparable or better performance in the page cross case. > > > > geometric_mean(N=20) of page cross cases New / Original: 0.960 > > > > size, align0, align1, ret, New Time/Old Time > > 1, 4095, 0, 0, 1.001 > > 1, 4095, 0, 1, 0.999 > > 1, 4095, 0, -1, 1.0 > > 2, 4094, 0, 0, 1.0 > > 2, 4094, 0, 1, 1.0 > > 2, 4094, 0, -1, 1.0 > > 3, 4093, 0, 0, 1.0 > > 3, 4093, 0, 1, 1.0 > > 3, 4093, 0, -1, 1.0 > > 4, 4092, 0, 0, 0.987 > > 4, 4092, 0, 1, 1.0 > > 4, 4092, 0, -1, 1.0 > > 5, 4091, 0, 0, 0.984 > > 5, 4091, 0, 1, 1.002 > > 5, 4091, 0, -1, 1.005 > > 6, 4090, 0, 0, 0.993 > > 6, 4090, 0, 1, 1.001 > > 6, 4090, 0, -1, 1.003 > > 7, 4089, 0, 0, 0.991 > > 7, 4089, 0, 1, 1.0 > > 7, 4089, 0, -1, 1.001 > > 8, 4088, 0, 0, 0.875 > > 8, 4088, 0, 1, 0.881 > > 8, 4088, 0, -1, 0.888 > > 9, 4087, 0, 0, 0.872 > > 9, 4087, 0, 1, 0.879 > > 9, 4087, 0, -1, 0.883 > > 10, 4086, 0, 0, 0.878 > > 10, 4086, 0, 1, 0.886 > > 10, 4086, 0, -1, 0.873 > > 11, 4085, 0, 0, 0.878 > > 11, 4085, 0, 1, 0.881 > > 11, 4085, 0, -1, 0.879 > > 12, 4084, 0, 0, 0.873 > > 12, 4084, 0, 1, 0.889 > > 12, 4084, 0, -1, 0.875 > > 13, 4083, 0, 0, 0.873 > > 13, 4083, 0, 1, 0.863 > > 13, 4083, 0, -1, 0.863 > > 14, 4082, 0, 0, 0.838 > > 14, 4082, 0, 1, 0.869 > > 14, 4082, 0, -1, 0.877 > > 15, 4081, 0, 0, 0.841 > > 15, 4081, 0, 1, 0.869 > > 15, 4081, 0, -1, 0.876 > > 16, 4080, 0, 0, 0.988 > > 16, 4080, 0, 1, 0.99 > > 16, 4080, 0, -1, 0.989 > > 17, 4079, 0, 0, 0.978 > > 17, 4079, 0, 1, 0.981 > > 17, 4079, 0, -1, 0.98 > > 18, 4078, 0, 0, 0.981 > > 18, 4078, 0, 1, 0.98 > > 18, 4078, 0, -1, 0.985 > > 19, 4077, 0, 0, 0.977 > > 19, 4077, 0, 1, 0.979 > > 19, 4077, 0, -1, 0.986 > > 20, 4076, 0, 0, 0.977 > > 20, 4076, 0, 1, 0.986 > > 20, 4076, 0, -1, 0.984 > > 21, 4075, 0, 0, 0.977 > > 21, 4075, 0, 1, 0.983 > > 21, 4075, 0, -1, 0.988 > > 22, 4074, 0, 0, 0.983 > > 22, 4074, 0, 1, 0.994 > > 22, 4074, 0, -1, 0.993 > > 23, 4073, 0, 0, 0.98 > > 23, 4073, 0, 1, 0.992 > > 23, 4073, 0, -1, 0.995 > > 24, 4072, 0, 0, 0.989 > > 24, 4072, 0, 1, 0.989 > > 24, 4072, 0, -1, 0.991 > > 25, 4071, 0, 0, 0.99 > > 25, 4071, 0, 1, 0.999 > > 25, 4071, 0, -1, 0.996 > > 26, 4070, 0, 0, 0.993 > > 26, 4070, 0, 1, 0.995 > > 26, 4070, 0, -1, 0.998 > > 27, 4069, 0, 0, 0.993 > > 27, 4069, 0, 1, 0.999 > > 27, 4069, 0, -1, 1.0 > > 28, 4068, 0, 0, 0.997 > > 28, 4068, 0, 1, 1.0 > > 28, 4068, 0, -1, 0.999 > > 29, 4067, 0, 0, 0.996 > > 29, 4067, 0, 1, 0.999 > > 29, 4067, 0, -1, 0.999 > > 30, 4066, 0, 0, 0.991 > > 30, 4066, 0, 1, 1.001 > > 30, 4066, 0, -1, 0.999 > > 31, 4065, 0, 0, 0.988 > > 31, 4065, 0, 1, 0.998 > > 31, 4065, 0, -1, 0.998 > > --- > > sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++-------- > > 1 file changed, 61 insertions(+), 37 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S > > index a34ea1645d..210c9925b6 100644 > > --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S > > +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S > > @@ -429,22 +429,21 @@ L(page_cross_less_vec): > > # ifndef USE_AS_WMEMCMP > > cmpl $8, %edx > > jae L(between_8_15) > > + /* Fall through for [4, 7]. */ > > cmpl $4, %edx > > - jae L(between_4_7) > > + jb L(between_2_3) > > > > - /* Load as big endian to avoid branches. */ > > - movzwl (%rdi), %eax > > - movzwl (%rsi), %ecx > > - shll $8, %eax > > - shll $8, %ecx > > - bswap %eax > > - bswap %ecx > > - movzbl -1(%rdi, %rdx), %edi > > - movzbl -1(%rsi, %rdx), %esi > > - orl %edi, %eax > > - orl %esi, %ecx > > - /* Subtraction is okay because the upper 8 bits are zero. */ > > - subl %ecx, %eax > > + movbe (%rdi), %eax > > + movbe (%rsi), %ecx > > + shlq $32, %rax > > + shlq $32, %rcx > > + movbe -4(%rdi, %rdx), %edi > > + movbe -4(%rsi, %rdx), %esi > > + orq %rdi, %rax > > + orq %rsi, %rcx > > + subq %rcx, %rax > > + /* Fast path for return zero. */ > > + jnz L(ret_nonzero) > > /* No ymm register was touched. */ > > ret > > > > @@ -457,9 +456,33 @@ L(one_or_less): > > /* No ymm register was touched. */ > > ret > > > > + .p2align 4,, 5 > > +L(ret_nonzero): > > + sbbl %eax, %eax > > + orl $1, %eax > > + /* No ymm register was touched. */ > > + ret > > + > > + .p2align 4,, 2 > > +L(zero): > > + xorl %eax, %eax > > + /* No ymm register was touched. */ > > + ret > > + > > .p2align 4 > > L(between_8_15): > > -# endif > > + movbe (%rdi), %rax > > + movbe (%rsi), %rcx > > + subq %rcx, %rax > > + jnz L(ret_nonzero) > > + movbe -8(%rdi, %rdx), %rax > > + movbe -8(%rsi, %rdx), %rcx > > + subq %rcx, %rax > > + /* Fast path for return zero. */ > > + jnz L(ret_nonzero) > > + /* No ymm register was touched. */ > > + ret > > +# else > > /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ > > vmovq (%rdi), %xmm1 > > vmovq (%rsi), %xmm2 > > @@ -475,16 +498,13 @@ L(between_8_15): > > VPCMPEQ %xmm1, %xmm2, %xmm2 > > vpmovmskb %xmm2, %eax > > subl $0xffff, %eax > > + /* Fast path for return zero. */ > > jnz L(return_vec_0) > > /* No ymm register was touched. */ > > ret > > +# endif > > > > - .p2align 4 > > -L(zero): > > - xorl %eax, %eax > > - ret > > - > > - .p2align 4 > > + .p2align 4,, 10 > > L(between_16_31): > > /* From 16 to 31 bytes. No branch when size == 16. */ > > vmovdqu (%rsi), %xmm2 > > @@ -501,11 +521,17 @@ L(between_16_31): > > VPCMPEQ (%rdi), %xmm2, %xmm2 > > vpmovmskb %xmm2, %eax > > subl $0xffff, %eax > > + /* Fast path for return zero. */ > > jnz L(return_vec_0) > > /* No ymm register was touched. */ > > ret > > > > # ifdef USE_AS_WMEMCMP > > + .p2align 4,, 2 > > +L(zero): > > + xorl %eax, %eax > > + ret > > + > > .p2align 4 > > L(one_or_less): > > jb L(zero) > > @@ -520,22 +546,20 @@ L(one_or_less): > > # else > > > > .p2align 4 > > -L(between_4_7): > > - /* Load as big endian with overlapping movbe to avoid branches. > > - */ > > - movbe (%rdi), %eax > > - movbe (%rsi), %ecx > > - shlq $32, %rax > > - shlq $32, %rcx > > - movbe -4(%rdi, %rdx), %edi > > - movbe -4(%rsi, %rdx), %esi > > - orq %rdi, %rax > > - orq %rsi, %rcx > > - subq %rcx, %rax > > - jz L(zero_4_7) > > - sbbl %eax, %eax > > - orl $1, %eax > > -L(zero_4_7): > > +L(between_2_3): > > + /* Load as big endian to avoid branches. */ > > + movzwl (%rdi), %eax > > + movzwl (%rsi), %ecx > > + bswap %eax > > + bswap %ecx > > + shrl %eax > > + shrl %ecx > > + movzbl -1(%rdi, %rdx), %edi > > + movzbl -1(%rsi, %rdx), %esi > > + orl %edi, %eax > > + orl %esi, %ecx > > + /* Subtraction is okay because the upper bit is zero. */ > > + subl %ecx, %eax > > /* No ymm register was touched. */ > > ret > > # endif > > -- > > 2.25.1 > > > > LGTM. > > Reviewed-by: H.J. Lu > > Thanks. > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? Conflict resolution patch attached. --Sunil