From: Noah Goldstein
Date: Sat, 9 Apr 2022 19:48:59 -0500
Subject: Re: [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S
To: GNU C Library <libc-alpha@sourceware.org>
In-Reply-To: <20220410004250.1448020-6-goldstein.w.n@gmail.com>
References: <20220325183625.1170867-2-goldstein.w.n@gmail.com>
 <20220410004250.1448020-6-goldstein.w.n@gmail.com>

Disregard this patch. It's from the wrong patchset.

On Sat, Apr 9, 2022 at 7:47 PM Noah Goldstein wrote:
>
> Old code was both inefficient and wasted code size. New code is
> smaller (-62 bytes) and has comparable or better performance in the
> page cross case.
>
> geometric_mean(N=20) of page cross cases New / Original: 0.960
>
> size, align0, align1, ret, New Time/Old Time
> 1, 4095, 0, 0, 1.001
> 1, 4095, 0, 1, 0.999
> 1, 4095, 0, -1, 1.0
> 2, 4094, 0, 0, 1.0
> 2, 4094, 0, 1, 1.0
> 2, 4094, 0, -1, 1.0
> 3, 4093, 0, 0, 1.0
> 3, 4093, 0, 1, 1.0
> 3, 4093, 0, -1, 1.0
> 4, 4092, 0, 0, 0.987
> 4, 4092, 0, 1, 1.0
> 4, 4092, 0, -1, 1.0
> 5, 4091, 0, 0, 0.984
> 5, 4091, 0, 1, 1.002
> 5, 4091, 0, -1, 1.005
> 6, 4090, 0, 0, 0.993
> 6, 4090, 0, 1, 1.001
> 6, 4090, 0, -1, 1.003
> 7, 4089, 0, 0, 0.991
> 7, 4089, 0, 1, 1.0
> 7, 4089, 0, -1, 1.001
> 8, 4088, 0, 0, 0.875
> 8, 4088, 0, 1, 0.881
> 8, 4088, 0, -1, 0.888
> 9, 4087, 0, 0, 0.872
> 9, 4087, 0, 1, 0.879
> 9, 4087, 0, -1, 0.883
> 10, 4086, 0, 0, 0.878
> 10, 4086, 0, 1, 0.886
> 10, 4086, 0, -1, 0.873
> 11, 4085, 0, 0, 0.878
> 11, 4085, 0, 1, 0.881
> 11, 4085, 0, -1, 0.879
> 12, 4084, 0, 0, 0.873
> 12, 4084, 0, 1, 0.889
> 12, 4084, 0, -1, 0.875
> 13, 4083, 0, 0, 0.873
> 13, 4083, 0, 1, 0.863
> 13, 4083, 0, -1, 0.863
> 14, 4082, 0, 0, 0.838
> 14, 4082, 0, 1, 0.869
> 14, 4082, 0, -1, 0.877
> 15, 4081, 0, 0, 0.841
> 15, 4081, 0, 1, 0.869
> 15, 4081, 0, -1, 0.876
> 16, 4080, 0, 0, 0.988
> 16, 4080, 0, 1, 0.99
> 16, 4080, 0, -1, 0.989
> 17, 4079, 0, 0, 0.978
> 17, 4079, 0, 1, 0.981
> 17, 4079, 0, -1, 0.98
> 18, 4078, 0, 0, 0.981
> 18, 4078, 0, 1, 0.98
> 18, 4078, 0, -1, 0.985
> 19, 4077, 0, 0, 0.977
> 19, 4077, 0, 1, 0.979
> 19, 4077, 0, -1, 0.986
> 20, 4076, 0, 0, 0.977
> 20, 4076, 0, 1, 0.986
> 20, 4076, 0, -1, 0.984
> 21, 4075, 0, 0, 0.977
> 21, 4075, 0, 1, 0.983
> 21, 4075, 0, -1, 0.988
> 22, 4074, 0, 0, 0.983
> 22, 4074, 0, 1, 0.994
> 22, 4074, 0, -1, 0.993
> 23, 4073, 0, 0, 0.98
> 23, 4073, 0, 1, 0.992
> 23, 4073, 0, -1, 0.995
> 24, 4072, 0, 0, 0.989
> 24, 4072, 0, 1, 0.989
> 24, 4072, 0, -1, 0.991
> 25, 4071, 0, 0, 0.99
> 25, 4071, 0, 1, 0.999
> 25, 4071, 0, -1, 0.996
> 26, 4070, 0, 0, 0.993
> 26, 4070, 0, 1, 0.995
> 26, 4070, 0, -1, 0.998
> 27, 4069, 0, 0, 0.993
> 27, 4069, 0, 1, 0.999
> 27, 4069, 0, -1, 1.0
> 28, 4068, 0, 0, 0.997
> 28, 4068, 0, 1, 1.0
> 28, 4068, 0, -1, 0.999
> 29, 4067, 0, 0, 0.996
> 29, 4067, 0, 1, 0.999
> 29, 4067, 0, -1, 0.999
> 30, 4066, 0, 0, 0.991
> 30, 4066, 0, 1, 1.001
> 30, 4066, 0, -1, 0.999
> 31, 4065, 0, 0, 0.988
> 31, 4065, 0, 1, 0.998
> 31, 4065, 0, -1, 0.998
> ---
>  sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
>  1 file changed, 61 insertions(+), 37 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> index a34ea1645d..210c9925b6 100644
> --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> @@ -429,22 +429,21 @@ L(page_cross_less_vec):
>  # ifndef USE_AS_WMEMCMP
>  	cmpl	$8, %edx
>  	jae	L(between_8_15)
> +	/* Fall through for [4, 7].  */
>  	cmpl	$4, %edx
> -	jae	L(between_4_7)
> +	jb	L(between_2_3)
>
> -	/* Load as big endian to avoid branches.  */
> -	movzwl	(%rdi), %eax
> -	movzwl	(%rsi), %ecx
> -	shll	$8, %eax
> -	shll	$8, %ecx
> -	bswap	%eax
> -	bswap	%ecx
> -	movzbl	-1(%rdi, %rdx), %edi
> -	movzbl	-1(%rsi, %rdx), %esi
> -	orl	%edi, %eax
> -	orl	%esi, %ecx
> -	/* Subtraction is okay because the upper 8 bits are zero.  */
> -	subl	%ecx, %eax
> +	movbe	(%rdi), %eax
> +	movbe	(%rsi), %ecx
> +	shlq	$32, %rax
> +	shlq	$32, %rcx
> +	movbe	-4(%rdi, %rdx), %edi
> +	movbe	-4(%rsi, %rdx), %esi
> +	orq	%rdi, %rax
> +	orq	%rsi, %rcx
> +	subq	%rcx, %rax
> +	/* Fast path for return zero.  */
> +	jnz	L(ret_nonzero)
>  	/* No ymm register was touched.  */
>  	ret
>
> @@ -457,9 +456,33 @@ L(one_or_less):
>  	/* No ymm register was touched.  */
>  	ret
>
> +	.p2align 4,, 5
> +L(ret_nonzero):
> +	sbbl	%eax, %eax
> +	orl	$1, %eax
> +	/* No ymm register was touched.  */
> +	ret
>
> +	.p2align 4,, 2
> +L(zero):
> +	xorl	%eax, %eax
> +	/* No ymm register was touched.  */
> +	ret
>
>  	.p2align 4
>  L(between_8_15):
> -# endif
> +	movbe	(%rdi), %rax
> +	movbe	(%rsi), %rcx
> +	subq	%rcx, %rax
> +	jnz	L(ret_nonzero)
> +	movbe	-8(%rdi, %rdx), %rax
> +	movbe	-8(%rsi, %rdx), %rcx
> +	subq	%rcx, %rax
> +	/* Fast path for return zero.  */
> +	jnz	L(ret_nonzero)
> +	/* No ymm register was touched.  */
> +	ret
> +# else
>  	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
>  	vmovq	(%rdi), %xmm1
>  	vmovq	(%rsi), %xmm2
> @@ -475,16 +498,13 @@ L(between_8_15):
>  	VPCMPEQ	%xmm1, %xmm2, %xmm2
>  	vpmovmskb %xmm2, %eax
>  	subl	$0xffff, %eax
> +	/* Fast path for return zero.  */
>  	jnz	L(return_vec_0)
>  	/* No ymm register was touched.  */
>  	ret
> +# endif
>
> -	.p2align 4
> -L(zero):
> -	xorl	%eax, %eax
> -	ret
> -
> -	.p2align 4
> +	.p2align 4,, 10
>  L(between_16_31):
>  	/* From 16 to 31 bytes.  No branch when size == 16.  */
>  	vmovdqu	(%rsi), %xmm2
> @@ -501,11 +521,17 @@ L(between_16_31):
>  	VPCMPEQ	(%rdi), %xmm2, %xmm2
>  	vpmovmskb %xmm2, %eax
>  	subl	$0xffff, %eax
> +	/* Fast path for return zero.  */
>  	jnz	L(return_vec_0)
>  	/* No ymm register was touched.  */
>  	ret
>
>  # ifdef USE_AS_WMEMCMP
> +	.p2align 4,, 2
> +L(zero):
> +	xorl	%eax, %eax
> +	ret
> +
>  	.p2align 4
>  L(one_or_less):
>  	jb	L(zero)
> @@ -520,22 +546,20 @@ L(one_or_less):
>  # else
>
>  	.p2align 4
> -L(between_4_7):
> -	/* Load as big endian with overlapping movbe to avoid branches.
> -	 */
> -	movbe	(%rdi), %eax
> -	movbe	(%rsi), %ecx
> -	shlq	$32, %rax
> -	shlq	$32, %rcx
> -	movbe	-4(%rdi, %rdx), %edi
> -	movbe	-4(%rsi, %rdx), %esi
> -	orq	%rdi, %rax
> -	orq	%rsi, %rcx
> -	subq	%rcx, %rax
> -	jz	L(zero_4_7)
> -	sbbl	%eax, %eax
> -	orl	$1, %eax
> -L(zero_4_7):
> +L(between_2_3):
> +	/* Load as big endian to avoid branches.  */
> +	movzwl	(%rdi), %eax
> +	movzwl	(%rsi), %ecx
> +	bswap	%eax
> +	bswap	%ecx
> +	shrl	%eax
> +	shrl	%ecx
> +	movzbl	-1(%rdi, %rdx), %edi
> +	movzbl	-1(%rsi, %rdx), %esi
> +	orl	%edi, %eax
> +	orl	%esi, %ecx
> +	/* Subtraction is okay because the upper bit is zero.  */
> +	subl	%ecx, %eax
>  	/* No ymm register was touched.  */
>  	ret
>  # endif
> --
> 2.25.1
>
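
For anyone skimming the assembly, the core of the new [4, 7] byte
path is a branchless compare built from two overlapping big-endian
loads. Here is a rough C sketch of what it computes; this is
illustrative only, not glibc code, and load32_be / memcmp_4_7 are
made-up names.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Load 4 bytes and byte-swap them into big-endian order, which is
   what a single movbe instruction does in the patch.  */
static inline uint32_t
load32_be (const unsigned char *p)
{
  uint32_t v;
  memcpy (&v, p, 4);
  return __builtin_bswap32 (v);
}

/* Compare n bytes, 4 <= n <= 7, without branching on n.  The first
   4 bytes go in the high half of a 64-bit word so they take
   precedence; the last 4 bytes (overlapping the first load when
   n < 8) fill the low half.  Big-endian order makes unsigned integer
   comparison agree with memcmp's lexicographic byte order.  */
static int
memcmp_4_7 (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  uint64_t a = ((uint64_t) load32_be (s1) << 32) | load32_be (s1 + n - 4);
  uint64_t b = ((uint64_t) load32_be (s2) << 32) | load32_be (s2 + n - 4);
  if (a == b)
    return 0;
  /* The assembly gets this sign for free: after subq, the
     L(ret_nonzero) sequence sbbl %eax, %eax; orl $1, %eax turns the
     borrow flag into -1 or 1.  */
  return a < b ? -1 : 1;
}

The new L(between_8_15) path is the same idea widened to two 8-byte
movbe loads per buffer, split into two compares so the first subq can
exit early through L(ret_nonzero).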
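
The reworked L(between_2_3) path uses the same big-endian framing but
stays in 32-bit registers, and the single shrl is what makes the final
plain subtraction safe. A companion sketch under the same caveats
(memcmp_2_3 is a made-up name; same headers as above):

/* Compare n bytes, n == 2 or 3, without branching on n.  The bit
   layout after movzwl + bswap + shrl + orl is:
     s[0] in bits 30..23, s[1] in bits 22..15, s[n - 1] in bits 7..0.
   For n == 2 the last byte is s[1] again, on both sides, so the
   overlap is harmless.  */
static int
memcmp_2_3 (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  uint32_t a = ((uint32_t) s1[0] << 23) | ((uint32_t) s1[1] << 15)
	       | s1[n - 1];
  uint32_t b = ((uint32_t) s2[0] << 23) | ((uint32_t) s2[1] << 15)
	       | s2[n - 1];
  /* Both values fit in 31 bits (the shift cleared the top bit), so
     the subtraction cannot overflow and its sign is the answer.  */
  return (int) a - (int) b;
}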