From mboxrd@z Thu Jan  1 00:00:00 1970
From: "H.J. Lu"
Date: Fri, 15 Apr 2022 10:33:35 -0700
Subject: Re: [PATCH v3 3/3] x86: Cleanup page cross code in memcmp-avx2-movbe.S
To: Noah Goldstein
Cc: GNU C Library, "Carlos O'Donell"
In-Reply-To: <20220415172801.1525674-3-goldstein.w.n@gmail.com>
References: <20220415055132.1257272-1-goldstein.w.n@gmail.com>
 <20220415172801.1525674-1-goldstein.w.n@gmail.com>
 <20220415172801.1525674-3-goldstein.w.n@gmail.com>
List-Id: Libc-alpha mailing list

On Fri, Apr 15, 2022 at 10:28 AM Noah Goldstein wrote:
>
> The old code was inefficient and wasted code size.  The new code is
> smaller (-62 bytes) and has comparable or better performance in the
> page cross case.
>
> geometric_mean(N=20) of page cross cases New / Original: 0.960
>
> size, align0, align1, ret, New Time/Old Time
>    1, 4095, 0,  0, 1.001
>    1, 4095, 0,  1, 0.999
>    1, 4095, 0, -1, 1.0
>    2, 4094, 0,  0, 1.0
>    2, 4094, 0,  1, 1.0
>    2, 4094, 0, -1, 1.0
>    3, 4093, 0,  0, 1.0
>    3, 4093, 0,  1, 1.0
>    3, 4093, 0, -1, 1.0
>    4, 4092, 0,  0, 0.987
>    4, 4092, 0,  1, 1.0
>    4, 4092, 0, -1, 1.0
>    5, 4091, 0,  0, 0.984
>    5, 4091, 0,  1, 1.002
>    5, 4091, 0, -1, 1.005
>    6, 4090, 0,  0, 0.993
>    6, 4090, 0,  1, 1.001
>    6, 4090, 0, -1, 1.003
>    7, 4089, 0,  0, 0.991
>    7, 4089, 0,  1, 1.0
>    7, 4089, 0, -1, 1.001
>    8, 4088, 0,  0, 0.875
>    8, 4088, 0,  1, 0.881
>    8, 4088, 0, -1, 0.888
>    9, 4087, 0,  0, 0.872
>    9, 4087, 0,  1, 0.879
>    9, 4087, 0, -1, 0.883
>   10, 4086, 0,  0, 0.878
>   10, 4086, 0,  1, 0.886
>   10, 4086, 0, -1, 0.873
>   11, 4085, 0,  0, 0.878
>   11, 4085, 0,  1, 0.881
>   11, 4085, 0, -1, 0.879
>   12, 4084, 0,  0, 0.873
>   12, 4084, 0,  1, 0.889
>   12, 4084, 0, -1, 0.875
>   13, 4083, 0,  0, 0.873
>   13, 4083, 0,  1, 0.863
>   13, 4083, 0, -1, 0.863
>   14, 4082, 0,  0, 0.838
>   14, 4082, 0,  1, 0.869
>   14, 4082, 0, -1, 0.877
>   15, 4081, 0,  0, 0.841
>   15, 4081, 0,  1, 0.869
>   15, 4081, 0, -1, 0.876
>   16, 4080, 0,  0, 0.988
>   16, 4080, 0,  1, 0.99
>   16, 4080, 0, -1, 0.989
>   17, 4079, 0,  0, 0.978
>   17, 4079, 0,  1, 0.981
>   17, 4079, 0, -1, 0.98
>   18, 4078, 0,  0, 0.981
>   18, 4078, 0,  1, 0.98
>   18, 4078, 0, -1, 0.985
>   19, 4077, 0,  0, 0.977
>   19, 4077, 0,  1, 0.979
>   19, 4077, 0, -1, 0.986
>   20, 4076, 0,  0, 0.977
>   20, 4076, 0,  1, 0.986
>   20, 4076, 0, -1, 0.984
>   21, 4075, 0,  0, 0.977
>   21, 4075, 0,  1, 0.983
>   21, 4075, 0, -1, 0.988
>   22, 4074, 0,  0, 0.983
>   22, 4074, 0,  1, 0.994
>   22, 4074, 0, -1, 0.993
>   23, 4073, 0,  0, 0.98
>   23, 4073, 0,  1, 0.992
>   23, 4073, 0, -1, 0.995
>   24, 4072, 0,  0, 0.989
>   24, 4072, 0,  1, 0.989
>   24, 4072, 0, -1, 0.991
>   25, 4071, 0,  0, 0.99
>   25, 4071, 0,  1, 0.999
>   25, 4071, 0, -1, 0.996
>   26, 4070, 0,  0, 0.993
>   26, 4070, 0,  1, 0.995
>   26, 4070, 0, -1, 0.998
>   27, 4069, 0,  0, 0.993
>   27, 4069, 0,  1, 0.999
>   27, 4069, 0, -1, 1.0
>   28, 4068, 0,  0, 0.997
>   28, 4068, 0,  1, 1.0
>   28, 4068, 0, -1, 0.999
>   29, 4067, 0,  0, 0.996
>   29, 4067, 0,  1, 0.999
>   29, 4067, 0, -1, 0.999
>   30, 4066, 0,  0, 0.991
>   30, 4066, 0,  1, 1.001
>   30, 4066, 0, -1, 0.999
>   31, 4065, 0,  0, 0.988
>   31, 4065, 0,  1, 0.998
>   31, 4065, 0, -1, 0.998
> ---
>  sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
>  1 file changed, 61 insertions(+), 37 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> index a34ea1645d..210c9925b6 100644
> --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> @@ -429,22 +429,21 @@ L(page_cross_less_vec):
>  # ifndef USE_AS_WMEMCMP
>  	cmpl	$8, %edx
>  	jae	L(between_8_15)
> +	/* Fall through for [4, 7].  */
>  	cmpl	$4, %edx
> -	jae	L(between_4_7)
> +	jb	L(between_2_3)
>
> -	/* Load as big endian to avoid branches.  */
> -	movzwl	(%rdi), %eax
> -	movzwl	(%rsi), %ecx
> -	shll	$8, %eax
> -	shll	$8, %ecx
> -	bswap	%eax
> -	bswap	%ecx
> -	movzbl	-1(%rdi, %rdx), %edi
> -	movzbl	-1(%rsi, %rdx), %esi
> -	orl	%edi, %eax
> -	orl	%esi, %ecx
> -	/* Subtraction is okay because the upper 8 bits are zero.  */
> -	subl	%ecx, %eax
> +	movbe	(%rdi), %eax
> +	movbe	(%rsi), %ecx
> +	shlq	$32, %rax
> +	shlq	$32, %rcx
> +	movbe	-4(%rdi, %rdx), %edi
> +	movbe	-4(%rsi, %rdx), %esi
> +	orq	%rdi, %rax
> +	orq	%rsi, %rcx
> +	subq	%rcx, %rax
> +	/* Fast path for return zero.  */
> +	jnz	L(ret_nonzero)
>  	/* No ymm register was touched.  */
>  	ret
>
> @@ -457,9 +456,33 @@ L(one_or_less):
>  	/* No ymm register was touched.  */
>  	ret
>
> +	.p2align 4,, 5
> +L(ret_nonzero):
> +	sbbl	%eax, %eax
> +	orl	$1, %eax
> +	/* No ymm register was touched.  */
> +	ret
> +
> +	.p2align 4,, 2
> +L(zero):
> +	xorl	%eax, %eax
> +	/* No ymm register was touched.  */
> +	ret
> +
>  	.p2align 4
>  L(between_8_15):
> -# endif
> +	movbe	(%rdi), %rax
> +	movbe	(%rsi), %rcx
> +	subq	%rcx, %rax
> +	jnz	L(ret_nonzero)
> +	movbe	-8(%rdi, %rdx), %rax
> +	movbe	-8(%rsi, %rdx), %rcx
> +	subq	%rcx, %rax
> +	/* Fast path for return zero.  */
> +	jnz	L(ret_nonzero)
> +	/* No ymm register was touched.  */
> +	ret
> +# else
>  	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
>  	vmovq	(%rdi), %xmm1
>  	vmovq	(%rsi), %xmm2
> @@ -475,16 +498,13 @@ L(between_8_15):
>  	VPCMPEQ	%xmm1, %xmm2, %xmm2
>  	vpmovmskb %xmm2, %eax
>  	subl	$0xffff, %eax
> +	/* Fast path for return zero.  */
>  	jnz	L(return_vec_0)
>  	/* No ymm register was touched.  */
>  	ret
> +# endif
>
> -	.p2align 4
> -L(zero):
> -	xorl	%eax, %eax
> -	ret
> -
> -	.p2align 4
> +	.p2align 4,, 10
>  L(between_16_31):
>  	/* From 16 to 31 bytes.  No branch when size == 16.  */
>  	vmovdqu	(%rsi), %xmm2
> @@ -501,11 +521,17 @@ L(between_16_31):
>  	VPCMPEQ	(%rdi), %xmm2, %xmm2
>  	vpmovmskb %xmm2, %eax
>  	subl	$0xffff, %eax
> +	/* Fast path for return zero.  */
>  	jnz	L(return_vec_0)
>  	/* No ymm register was touched.  */
>  	ret
>
>  # ifdef USE_AS_WMEMCMP
> +	.p2align 4,, 2
> +L(zero):
> +	xorl	%eax, %eax
> +	ret
> +
>  	.p2align 4
>  L(one_or_less):
>  	jb	L(zero)
> @@ -520,22 +546,20 @@ L(one_or_less):
>  # else
>
>  	.p2align 4
> -L(between_4_7):
> -	/* Load as big endian with overlapping movbe to avoid branches.
> -	 */
> -	movbe	(%rdi), %eax
> -	movbe	(%rsi), %ecx
> -	shlq	$32, %rax
> -	shlq	$32, %rcx
> -	movbe	-4(%rdi, %rdx), %edi
> -	movbe	-4(%rsi, %rdx), %esi
> -	orq	%rdi, %rax
> -	orq	%rsi, %rcx
> -	subq	%rcx, %rax
> -	jz	L(zero_4_7)
> -	sbbl	%eax, %eax
> -	orl	$1, %eax
> -L(zero_4_7):
> +L(between_2_3):
> +	/* Load as big endian to avoid branches.  */
> +	movzwl	(%rdi), %eax
> +	movzwl	(%rsi), %ecx
> +	bswap	%eax
> +	bswap	%ecx
> +	shrl	%eax
> +	shrl	%ecx
> +	movzbl	-1(%rdi, %rdx), %edi
> +	movzbl	-1(%rsi, %rdx), %esi
> +	orl	%edi, %eax
> +	orl	%esi, %ecx
> +	/* Subtraction is okay because the upper bit is zero.  */
> +	subl	%ecx, %eax
>  	/* No ymm register was touched.  */
>  	ret
>  # endif
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu

Thanks.

--
H.J.
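
For readers following the quoted patch, the heart of the new [4, 7] byte path is the overlapping big-endian load trick: load the first and last four bytes of each buffer (the two loads overlap for n < 8), byte-swap them so the lexicographically first byte becomes the most significant, pack each pair into a 64-bit value, and compare. Below is a minimal C sketch of that idea only, not the glibc implementation: the function name memcmp_4_to_7_sketch is made up, and it assumes a little-endian target plus the GCC/Clang __builtin_bswap32 builtin standing in for the movbe instruction.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the overlapping big-endian load idea for 4 <= n <= 7.
   Not the glibc code; the assembly uses movbe loads and a single
   64-bit subtract instead of the builtins and final conditional.  */
static int
memcmp_4_to_7_sketch (const void *s1, const void *s2, size_t n)
{
  uint32_t a_head, a_tail, b_head, b_tail;

  /* First and last four bytes of each buffer.  For n < 8 the two
     loads overlap, but the duplicated bytes occupy the same positions
     in both packed values, so they cannot affect the result.  */
  memcpy (&a_head, s1, 4);
  memcpy (&a_tail, (const char *) s1 + n - 4, 4);
  memcpy (&b_head, s2, 4);
  memcpy (&b_tail, (const char *) s2 + n - 4, 4);

  /* Byte-swap (what movbe does) so byte 0 ends up as the most
     significant byte of the packed 64-bit value.  */
  uint64_t a = ((uint64_t) __builtin_bswap32 (a_head) << 32)
	       | __builtin_bswap32 (a_tail);
  uint64_t b = ((uint64_t) __builtin_bswap32 (b_head) << 32)
	       | __builtin_bswap32 (b_tail);

  /* An unsigned comparison of the packed values now orders the
     buffers byte-lexicographically, which is what memcmp requires.  */
  if (a == b)
    return 0;
  return a > b ? 1 : -1;
}

In the patched assembly the same result is produced more directly: subq %rcx, %rax computes the difference, and the sbbl/orl pair at L(ret_nonzero) converts the borrow flag from that subtract into -1 or 1, so no second comparison of the packed values is needed.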