From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7844) id 4BCC6385736D; Fri, 15 Apr 2022 18:43:46 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 4BCC6385736D Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Noah Goldstein To: glibc-cvs@sourceware.org Subject: [glibc] x86: Cleanup page cross code in memcmp-avx2-movbe.S X-Act-Checkin: glibc X-Git-Author: Noah Goldstein X-Git-Refname: refs/heads/master X-Git-Oldrev: 7cbc03d03091d5664060924789afe46d30a5477e X-Git-Newrev: 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Message-Id: <20220415184346.4BCC6385736D@sourceware.org> Date: Fri, 15 Apr 2022 18:43:46 +0000 (GMT) X-BeenThere: glibc-cvs@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Glibc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 15 Apr 2022 18:43:46 -0000 https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=23102686ec67b856a2d4fd25ddaa1c0b8d175c4f commit 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Author: Noah Goldstein Date: Fri Apr 15 12:28:01 2022 -0500 x86: Cleanup page cross code in memcmp-avx2-movbe.S Old code was both inefficient and wasted code size. New code (-62 bytes) and comparable or better performance in the page cross case. geometric_mean(N=20) of page cross cases New / Original: 0.960 size, align0, align1, ret, New Time/Old Time 1, 4095, 0, 0, 1.001 1, 4095, 0, 1, 0.999 1, 4095, 0, -1, 1.0 2, 4094, 0, 0, 1.0 2, 4094, 0, 1, 1.0 2, 4094, 0, -1, 1.0 3, 4093, 0, 0, 1.0 3, 4093, 0, 1, 1.0 3, 4093, 0, -1, 1.0 4, 4092, 0, 0, 0.987 4, 4092, 0, 1, 1.0 4, 4092, 0, -1, 1.0 5, 4091, 0, 0, 0.984 5, 4091, 0, 1, 1.002 5, 4091, 0, -1, 1.005 6, 4090, 0, 0, 0.993 6, 4090, 0, 1, 1.001 6, 4090, 0, -1, 1.003 7, 4089, 0, 0, 0.991 7, 4089, 0, 1, 1.0 7, 4089, 0, -1, 1.001 8, 4088, 0, 0, 0.875 8, 4088, 0, 1, 0.881 8, 4088, 0, -1, 0.888 9, 4087, 0, 0, 0.872 9, 4087, 0, 1, 0.879 9, 4087, 0, -1, 0.883 10, 4086, 0, 0, 0.878 10, 4086, 0, 1, 0.886 10, 4086, 0, -1, 0.873 11, 4085, 0, 0, 0.878 11, 4085, 0, 1, 0.881 11, 4085, 0, -1, 0.879 12, 4084, 0, 0, 0.873 12, 4084, 0, 1, 0.889 12, 4084, 0, -1, 0.875 13, 4083, 0, 0, 0.873 13, 4083, 0, 1, 0.863 13, 4083, 0, -1, 0.863 14, 4082, 0, 0, 0.838 14, 4082, 0, 1, 0.869 14, 4082, 0, -1, 0.877 15, 4081, 0, 0, 0.841 15, 4081, 0, 1, 0.869 15, 4081, 0, -1, 0.876 16, 4080, 0, 0, 0.988 16, 4080, 0, 1, 0.99 16, 4080, 0, -1, 0.989 17, 4079, 0, 0, 0.978 17, 4079, 0, 1, 0.981 17, 4079, 0, -1, 0.98 18, 4078, 0, 0, 0.981 18, 4078, 0, 1, 0.98 18, 4078, 0, -1, 0.985 19, 4077, 0, 0, 0.977 19, 4077, 0, 1, 0.979 19, 4077, 0, -1, 0.986 20, 4076, 0, 0, 0.977 20, 4076, 0, 1, 0.986 20, 4076, 0, -1, 0.984 21, 4075, 0, 0, 0.977 21, 4075, 0, 1, 0.983 21, 4075, 0, -1, 0.988 22, 4074, 0, 0, 0.983 22, 4074, 0, 1, 0.994 22, 4074, 0, -1, 0.993 23, 4073, 0, 0, 0.98 23, 4073, 0, 1, 0.992 23, 4073, 0, -1, 0.995 24, 4072, 0, 0, 0.989 24, 4072, 0, 1, 0.989 24, 4072, 0, -1, 0.991 25, 4071, 0, 0, 0.99 25, 4071, 0, 1, 0.999 25, 4071, 0, -1, 0.996 26, 4070, 0, 0, 0.993 26, 4070, 0, 1, 0.995 26, 4070, 0, -1, 0.998 27, 4069, 0, 0, 0.993 27, 4069, 0, 1, 0.999 27, 4069, 0, -1, 1.0 28, 4068, 0, 0, 0.997 28, 4068, 0, 1, 1.0 28, 4068, 0, -1, 0.999 29, 4067, 0, 0, 0.996 29, 4067, 0, 1, 0.999 29, 4067, 0, -1, 0.999 30, 4066, 0, 0, 0.991 30, 4066, 0, 1, 1.001 30, 4066, 0, -1, 0.999 31, 4065, 0, 0, 0.988 31, 4065, 0, 1, 0.998 31, 4065, 0, -1, 0.998 Reviewed-by: H.J. Lu Diff: --- sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 +++++++++++++++++----------- 1 file changed, 61 insertions(+), 37 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S index a34ea1645d..210c9925b6 100644 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -429,22 +429,21 @@ L(page_cross_less_vec): # ifndef USE_AS_WMEMCMP cmpl $8, %edx jae L(between_8_15) + /* Fall through for [4, 7]. */ cmpl $4, %edx - jae L(between_4_7) + jb L(between_2_3) - /* Load as big endian to avoid branches. */ - movzwl (%rdi), %eax - movzwl (%rsi), %ecx - shll $8, %eax - shll $8, %ecx - bswap %eax - bswap %ecx - movzbl -1(%rdi, %rdx), %edi - movzbl -1(%rsi, %rdx), %esi - orl %edi, %eax - orl %esi, %ecx - /* Subtraction is okay because the upper 8 bits are zero. */ - subl %ecx, %eax + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + /* Fast path for return zero. */ + jnz L(ret_nonzero) /* No ymm register was touched. */ ret @@ -457,9 +456,33 @@ L(one_or_less): /* No ymm register was touched. */ ret + .p2align 4,, 5 +L(ret_nonzero): + sbbl %eax, %eax + orl $1, %eax + /* No ymm register was touched. */ + ret + + .p2align 4,, 2 +L(zero): + xorl %eax, %eax + /* No ymm register was touched. */ + ret + .p2align 4 L(between_8_15): -# endif + movbe (%rdi), %rax + movbe (%rsi), %rcx + subq %rcx, %rax + jnz L(ret_nonzero) + movbe -8(%rdi, %rdx), %rax + movbe -8(%rsi, %rdx), %rcx + subq %rcx, %rax + /* Fast path for return zero. */ + jnz L(ret_nonzero) + /* No ymm register was touched. */ + ret +# else /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ vmovq (%rdi), %xmm1 vmovq (%rsi), %xmm2 @@ -475,16 +498,13 @@ L(between_8_15): VPCMPEQ %xmm1, %xmm2, %xmm2 vpmovmskb %xmm2, %eax subl $0xffff, %eax + /* Fast path for return zero. */ jnz L(return_vec_0) /* No ymm register was touched. */ ret +# endif - .p2align 4 -L(zero): - xorl %eax, %eax - ret - - .p2align 4 + .p2align 4,, 10 L(between_16_31): /* From 16 to 31 bytes. No branch when size == 16. */ vmovdqu (%rsi), %xmm2 @@ -501,11 +521,17 @@ L(between_16_31): VPCMPEQ (%rdi), %xmm2, %xmm2 vpmovmskb %xmm2, %eax subl $0xffff, %eax + /* Fast path for return zero. */ jnz L(return_vec_0) /* No ymm register was touched. */ ret # ifdef USE_AS_WMEMCMP + .p2align 4,, 2 +L(zero): + xorl %eax, %eax + ret + .p2align 4 L(one_or_less): jb L(zero) @@ -520,22 +546,20 @@ L(one_or_less): # else .p2align 4 -L(between_4_7): - /* Load as big endian with overlapping movbe to avoid branches. - */ - movbe (%rdi), %eax - movbe (%rsi), %ecx - shlq $32, %rax - shlq $32, %rcx - movbe -4(%rdi, %rdx), %edi - movbe -4(%rsi, %rdx), %esi - orq %rdi, %rax - orq %rsi, %rcx - subq %rcx, %rax - jz L(zero_4_7) - sbbl %eax, %eax - orl $1, %eax -L(zero_4_7): +L(between_2_3): + /* Load as big endian to avoid branches. */ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx + bswap %eax + bswap %ecx + shrl %eax + shrl %ecx + movzbl -1(%rdi, %rdx), %edi + movzbl -1(%rsi, %rdx), %esi + orl %edi, %eax + orl %esi, %ecx + /* Subtraction is okay because the upper bit is zero. */ + subl %ecx, %eax /* No ymm register was touched. */ ret # endif