From: Sunil Pandey
Date: Wed, 27 Apr 2022 16:52:43 -0700
Subject: Re: [PATCH v3 1/2] x86: Optimize strchr-avx2.S
To: "H.J. Lu" , libc-stable@sourceware.org
Cc: Noah Goldstein , GNU C Library

On Fri, Apr 23, 2021 at 3:11 PM H.J. Lu via Libc-alpha wrote:
>
> On Fri, Apr 23, 2021 at 12:56 PM Noah Goldstein wrote:
> >
> > On Fri, Apr 23, 2021 at 12:56 PM H.J. Lu wrote:
> > >
> > > On Thu, Apr 22, 2021 at 02:04:02PM -0400, Noah Goldstein wrote:
> > > > No bug. This commit optimizes strchr-avx2.S. The optimizations are all
> > > > small things such as saving an ALU in the alignment process, saving a
> > > > few instructions in the loop return, saving some bytes in the main
> > > > loop, and increasing the ILP in the return cases. test-strchr,
> > > > test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
> > > >
> > > > Signed-off-by: Noah Goldstein
> > > > ---
> > > >  sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++----------
> > > >  1 file changed, 173 insertions(+), 121 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > index 25bec38b5d..220165d2ba 100644
> > > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > @@ -49,132 +49,144 @@
> > > >
> > > >  .section SECTION(.text),"ax",@progbits
> > > >  ENTRY (STRCHR)
> > > > - movl %edi, %ecx
> > > > -# ifndef USE_AS_STRCHRNUL
> > > > - xorl %edx, %edx
> > > > -# endif
> > > > -
> > > >  /* Broadcast CHAR to YMM0. */
> > > >  vmovd %esi, %xmm0
> > > > + movl %edi, %eax
> > > > + andl $(PAGE_SIZE - 1), %eax
> > > > + VPBROADCAST %xmm0, %ymm0
> > > >  vpxor %xmm9, %xmm9, %xmm9
> > > > - VPBROADCAST %xmm0, %ymm0
> > > >
> > > >  /* Check if we cross page boundary with one vector load. */
> > > > - andl $(PAGE_SIZE - 1), %ecx
> > > > - cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> > > > - ja L(cross_page_boundary)
> > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > > > + ja L(cross_page_boundary)
> > > >
> > > >  /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> > > >     null byte. */
> > > >  vmovdqu (%rdi), %ymm8
> > > > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > + VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > + VPCMPEQ %ymm8, %ymm9, %ymm2
> > > >  vpor %ymm1, %ymm2, %ymm1
> > > > - vpmovmskb %ymm1, %eax
> > > > + vpmovmskb %ymm1, %eax
> > >
> > > This change isn't needed.
> >
> > Fixed. For the future what is the rule for when to have a tab after
> > the instructions vs just a space?
>
> I prefer if the length of is less than a tab,
>
> Otherwise
> >
> > >
> > > >  testl %eax, %eax
> > > > - jz L(more_vecs)
> > > > + jz L(aligned_more)
> > > >  tzcntl %eax, %eax
> > > > - /* Found CHAR or the null byte. */
> > > > - addq %rdi, %rax
> > > > # ifndef USE_AS_STRCHRNUL
> > > > - cmp (%rax), %CHAR_REG
> > > > - cmovne %rdx, %rax
> > > > + /* Found CHAR or the null byte. */
> > > > + cmp (%rdi, %rax), %CHAR_REG
> > > > + jne L(zero)
> > > > # endif
> > > > -L(return_vzeroupper):
> > > > - ZERO_UPPER_VEC_REGISTERS_RETURN
> > > > -
> > > > - .p2align 4
> > > > -L(more_vecs):
> > > > - /* Align data for aligned loads in the loop. */
> > > > - andq $-VEC_SIZE, %rdi
> > > > -L(aligned_more):
> > > > -
> > > > - /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > > > -    since data is only aligned to VEC_SIZE. */
> > > > - vmovdqa VEC_SIZE(%rdi), %ymm8
> > > > - addq $VEC_SIZE, %rdi
> > > > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > - vpor %ymm1, %ymm2, %ymm1
> > > > - vpmovmskb %ymm1, %eax
> > > > - testl %eax, %eax
> > > > - jnz L(first_vec_x0)
> > > > -
> > > > - vmovdqa VEC_SIZE(%rdi), %ymm8
> > > > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > - vpor %ymm1, %ymm2, %ymm1
> > > > - vpmovmskb %ymm1, %eax
> > > > - testl %eax, %eax
> > > > - jnz L(first_vec_x1)
> > > > -
> > > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
> > > > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > - vpor %ymm1, %ymm2, %ymm1
> > > > - vpmovmskb %ymm1, %eax
> > > > - testl %eax, %eax
> > > > - jnz L(first_vec_x2)
> > > > -
> > > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > - vpor %ymm1, %ymm2, %ymm1
> > > > - vpmovmskb %ymm1, %eax
> > > > - testl %eax, %eax
> > > > - jz L(prep_loop_4x)
> > > > + addq %rdi, %rax
> > > > + VZEROUPPER_RETURN
> > > >
> > > > + /* .p2align 5 helps keep performance more consistent if ENTRY()
> > > > +    alignment % 32 was either 16 or 0. As well this makes the
> > > > +    alignment % 32 of the loop_4x_vec fixed which makes tuning it
> > > > +    easier. */
> > > > + .p2align 5
> > > > +L(first_vec_x4):
> > > >  tzcntl %eax, %eax
> > > > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > > + addq $(VEC_SIZE * 3 + 1), %rdi
> > > > # ifndef USE_AS_STRCHRNUL
> > > > - cmp (%rax), %CHAR_REG
> > > > - cmovne %rdx, %rax
> > > > + /* Found CHAR or the null byte. */
> > > > + cmp (%rdi, %rax), %CHAR_REG
> > > > + jne L(zero)
> > > > # endif
> > > > + addq %rdi, %rax
> > > >  VZEROUPPER_RETURN
> > > >
> > > > - .p2align 4
> > > > -L(first_vec_x0):
> > > > - tzcntl %eax, %eax
> > > > - /* Found CHAR or the null byte. */
> > > > - addq %rdi, %rax
> > > > # ifndef USE_AS_STRCHRNUL
> > > > - cmp (%rax), %CHAR_REG
> > > > - cmovne %rdx, %rax
> > > > -# endif
> > > > +L(zero):
> > > > + xorl %eax, %eax
> > > >  VZEROUPPER_RETURN
> > > > +# endif
> > > > +
> > > >
> > > >  .p2align 4
> > > > L(first_vec_x1):
> > > >  tzcntl %eax, %eax
> > > > - leaq VEC_SIZE(%rdi, %rax), %rax
> > > > + incq %rdi
> > > > # ifndef USE_AS_STRCHRNUL
> > > > - cmp (%rax), %CHAR_REG
> > > > - cmovne %rdx, %rax
> > > > + /* Found CHAR or the null byte. */
> > > > + cmp (%rdi, %rax), %CHAR_REG
> > > > + jne L(zero)
> > > > # endif
> > > > + addq %rdi, %rax
> > > >  VZEROUPPER_RETURN
> > > >
> > > >  .p2align 4
> > > > L(first_vec_x2):
> > > >  tzcntl %eax, %eax
> > > > + addq $(VEC_SIZE + 1), %rdi
> > > > +# ifndef USE_AS_STRCHRNUL
> > > >  /* Found CHAR or the null byte. */
> > > > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > + cmp (%rdi, %rax), %CHAR_REG
> > > > + jne L(zero)
> > > > +# endif
> > > > + addq %rdi, %rax
> > > > + VZEROUPPER_RETURN
> > > > +
> > > > + .p2align 4
> > > > +L(first_vec_x3):
> > > > + tzcntl %eax, %eax
> > > > + addq $(VEC_SIZE * 2 + 1), %rdi
> > > > # ifndef USE_AS_STRCHRNUL
> > > > - cmp (%rax), %CHAR_REG
> > > > - cmovne %rdx, %rax
> > > > + /* Found CHAR or the null byte. */
> > > > + cmp (%rdi, %rax), %CHAR_REG
> > > > + jne L(zero)
> > > > # endif
> > > > + addq %rdi, %rax
> > > >  VZEROUPPER_RETURN
> > > >
> > > > -L(prep_loop_4x):
> > > > - /* Align data to 4 * VEC_SIZE. */
> > > > - andq $-(VEC_SIZE * 4), %rdi
> > > > + .p2align 4
> > > > +L(aligned_more):
> > > > + /* Align data to VEC_SIZE - 1. This is the same number of
> > > > +    instructions as using andq -VEC_SIZE but saves 4 bytes of code on
> > > > +    x4 check. */
> > > > + orq $(VEC_SIZE - 1), %rdi
> > > > +L(cross_page_continue):
> > > > + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
> > > > +    data is only aligned to VEC_SIZE. */
> > > > + vmovdqa 1(%rdi), %ymm8
> > > > + VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > + VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > + vpor %ymm1, %ymm2, %ymm1
> > > > + vpmovmskb %ymm1, %eax
> > >
> > > Use a space, not tab, after vpmovmskb.
> >
> > Fixed.
> >
> > > > + testl %eax, %eax
> > > > + jnz L(first_vec_x1)
> > > > +
> > > > + vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
> > > > + VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > + VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > + vpor %ymm1, %ymm2, %ymm1
> > > > + vpmovmskb %ymm1, %eax
> > >
> > > Use a space, not tab, after vpmovmskb.
> >
> > Fixed.
> >
> > > > + testl %eax, %eax
> > > > + jnz L(first_vec_x2)
> > > > +
> > > > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
> > > > + VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > + VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > + vpor %ymm1, %ymm2, %ymm1
> > > > + vpmovmskb %ymm1, %eax
> > >
> > > Use a space, not tab, after vpmovmskb.
> >
> > Fixed.
> >
> > > > + testl %eax, %eax
> > > > + jnz L(first_vec_x3)
> > > > +
> > > > + vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
> > > > + VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > + VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > + vpor %ymm1, %ymm2, %ymm1
> > > > + vpmovmskb %ymm1, %eax
> > >
> > > Use a space, not tab, after vpmovmskb.
> >
> > Fixed.
> >
> > > > + testl %eax, %eax
> > > > + jnz L(first_vec_x4)
> > > > + /* Align data to VEC_SIZE * 4 - 1. */
> > > > + addq $(VEC_SIZE * 4 + 1), %rdi
> > > > + andq $-(VEC_SIZE * 4), %rdi
> > > >  .p2align 4
> > > > L(loop_4x_vec):
> > > >  /* Compare 4 * VEC at a time forward. */
> > > > - vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > > > - vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > > > - vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > > > - vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > > > + vmovdqa (%rdi), %ymm5
> > > > + vmovdqa (VEC_SIZE)(%rdi), %ymm6
> > > > + vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > > > + vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > >
> > > >  /* Leaves only CHARS matching esi as 0. */
> > > >  vpxor %ymm5, %ymm0, %ymm1
> > > > @@ -190,62 +202,102 @@ L(loop_4x_vec):
> > > >  VPMINU %ymm1, %ymm2, %ymm5
> > > >  VPMINU %ymm3, %ymm4, %ymm6
> > > >
> > > > - VPMINU %ymm5, %ymm6, %ymm5
> > > > + VPMINU %ymm5, %ymm6, %ymm6
> > > >
> > > > - VPCMPEQ %ymm5, %ymm9, %ymm5
> > > > - vpmovmskb %ymm5, %eax
> > > > + VPCMPEQ %ymm6, %ymm9, %ymm6
> > > > + vpmovmskb %ymm6, %ecx
> > >
> > > Use a space, not tab, after vpmovmskb.
> >
> > Fixed.
> >
> > > > + subq $-(VEC_SIZE * 4), %rdi
> > > > + testl %ecx, %ecx
> > > > + jz L(loop_4x_vec)
> > > >
> > > > - addq $(VEC_SIZE * 4), %rdi
> > > > - testl %eax, %eax
> > > > - jz L(loop_4x_vec)
> > > >
> > > > - VPCMPEQ %ymm1, %ymm9, %ymm1
> > > > - vpmovmskb %ymm1, %eax
> > > > + VPCMPEQ %ymm1, %ymm9, %ymm1
> > > > + vpmovmskb %ymm1, %eax
> > >
> > > Use a space, not tab, after vpmovmskb.
> >
> > Fixed.
> >
> > > >  testl %eax, %eax
> > > > - jnz L(first_vec_x0)
> > > > + jnz L(last_vec_x0)
> > > > +
> > > >
> > > > - VPCMPEQ %ymm2, %ymm9, %ymm2
> > > > - vpmovmskb %ymm2, %eax
> > > > + VPCMPEQ %ymm5, %ymm9, %ymm2
> > > > + vpmovmskb %ymm2, %eax
> > >
> > > Use a space, not tab, after vpmovmskb.
> >
> > Fixed.
> >
> > > >  testl %eax, %eax
> > > > - jnz L(first_vec_x1)
> > > > + jnz L(last_vec_x1)
> > > > +
> > > > + VPCMPEQ %ymm3, %ymm9, %ymm3
> > > > + vpmovmskb %ymm3, %eax
> > > > + /* rcx has combined result from all 4 VEC. It will only be used if
> > > > +    the first 3 other VEC all did not contain a match. */
> > > > + salq $32, %rcx
> > > > + orq %rcx, %rax
> > > > + tzcntq %rax, %rax
> > > > + subq $(VEC_SIZE * 2), %rdi
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > + /* Found CHAR or the null byte. */
> > > > + cmp (%rdi, %rax), %CHAR_REG
> > > > + jne L(zero_end)
> > > > +# endif
> > > > + addq %rdi, %rax
> > > > + VZEROUPPER_RETURN
> > > > +
> > > >
> > > > - VPCMPEQ %ymm3, %ymm9, %ymm3
> > > > - VPCMPEQ %ymm4, %ymm9, %ymm4
> > > > - vpmovmskb %ymm3, %ecx
> > > > - vpmovmskb %ymm4, %eax
> > > > - salq $32, %rax
> > > > - orq %rcx, %rax
> > > > - tzcntq %rax, %rax
> > > > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > + .p2align 4
> > > > +L(last_vec_x0):
> > > > + tzcntl %eax, %eax
> > > > + addq $-(VEC_SIZE * 4), %rdi
> > > > # ifndef USE_AS_STRCHRNUL
> > > > - cmp (%rax), %CHAR_REG
> > > > - cmovne %rdx, %rax
> > > > + /* Found CHAR or the null byte. */
> > > > + cmp (%rdi, %rax), %CHAR_REG
> > > > + jne L(zero_end)
> > > > # endif
> > > > + addq %rdi, %rax
> > > >  VZEROUPPER_RETURN
> > > >
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +L(zero_end):
> > > > + xorl %eax, %eax
> > > > + VZEROUPPER_RETURN
> > > > +# endif
> > > > +
> > > > + .p2align 4
> > > > +L(last_vec_x1):
> > > > + tzcntl %eax, %eax
> > > > + subq $(VEC_SIZE * 3), %rdi
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > + /* Found CHAR or the null byte. */
> > > > + cmp (%rdi, %rax), %CHAR_REG
> > > > + jne L(zero_end)
> > > > +# endif
> > > > + addq %rdi, %rax
> > > > + VZEROUPPER_RETURN
> > > > +
> > > > +
> > > >  /* Cold case for crossing page with first load. */
> > > >  .p2align 4
> > > > L(cross_page_boundary):
> > > > - andq $-VEC_SIZE, %rdi
> > > > - andl $(VEC_SIZE - 1), %ecx
> > > > -
> > > > - vmovdqa (%rdi), %ymm8
> > > > - VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > - VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > + movq %rdi, %rdx
> > > > + /* Align rdi to VEC_SIZE - 1. */
> > > > + orq $(VEC_SIZE - 1), %rdi
> > > > + vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
> > > > + VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > + VPCMPEQ %ymm8, %ymm9, %ymm2
> > > >  vpor %ymm1, %ymm2, %ymm1
> > > > - vpmovmskb %ymm1, %eax
> > > > - /* Remove the leading bits. */
> > > > - sarxl %ecx, %eax, %eax
> > > > + vpmovmskb %ymm1, %eax
> > > > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > > > +    so no need to manually mod edx. */
> > > > + sarxl %edx, %eax, %eax
> > > >  testl %eax, %eax
> > > > - jz L(aligned_more)
> > > > + jz L(cross_page_continue)
> > > >  tzcntl %eax, %eax
> > > > - addq %rcx, %rdi
> > > > - addq %rdi, %rax
> > > > # ifndef USE_AS_STRCHRNUL
> > > > - cmp (%rax), %CHAR_REG
> > > > - cmovne %rdx, %rax
> > > > + xorl %ecx, %ecx
> > > > + /* Found CHAR or the null byte. */
> > > > + cmp (%rdx, %rax), %CHAR_REG
> > > > + leaq (%rdx, %rax), %rax
> > > > + cmovne %rcx, %rax
> > > > +# else
> > > > + addq %rdx, %rax
> > > > # endif
> > > > - VZEROUPPER_RETURN
> > > > +L(return_vzeroupper):
> > > > + ZERO_UPPER_VEC_REGISTERS_RETURN
> > > >
> > > >  END (STRCHR)
> > > > # endif
> > > > --
> > > > 2.29.2
> > > >
> > >
> > > Thanks.
> > >
> > > H.J.
> >
>
> --
> H.J.

I would like to backport this patch to release branches.

Any comments or objections?

--Sunil
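For readers coming to this thread from the backport request, a rough scalar model of the control flow in the quoted routine may help: a page-cross check on the first unaligned load, four single-VEC checks after aligning, then a 4-VEC-per-iteration main loop. The C below is an editor's sketch only, not glibc code; the VEC_SIZE and PAGE_SIZE values and the helper name find_in_block are illustrative assumptions, and it scans byte by byte where the assembly uses whole YMM vectors.

```c
#include <stddef.h>
#include <stdint.h>

#define VEC_SIZE  32     /* bytes per YMM vector; assumption for the sketch */
#define PAGE_SIZE 4096   /* assumption for the sketch */

/* Scan up to LEN bytes at P for either C or the terminating null.
   Returns a pointer to the first hit, or NULL if there is none.  (The
   assembly does this per block with VPCMPEQ/vpor/vpmovmskb/tzcnt.)  */
static const char *find_in_block (const char *p, char c, size_t len)
{
  for (size_t i = 0; i < len; i++)
    if (p[i] == c || p[i] == '\0')
      return p + i;
  return NULL;
}

/* Scalar model of the strchrnul-style flow in the quoted patch.  */
const char *strchrnul_model (const char *s, int c_in)
{
  char c = (char) c_in;
  const char *hit;
  uintptr_t addr = (uintptr_t) s;

  if ((addr & (PAGE_SIZE - 1)) <= PAGE_SIZE - VEC_SIZE)
    {
      /* One unaligned VEC load cannot cross a page: check it whole.  */
      if ((hit = find_in_block (s, c, VEC_SIZE)) != NULL)
        return hit;
    }
  else
    {
      /* Cold cross-page path: only look at the bytes from S up to the
         next VEC_SIZE boundary (L(cross_page_boundary)).  */
      size_t head = VEC_SIZE - (addr & (VEC_SIZE - 1));
      if ((hit = find_in_block (s, c, head)) != NULL)
        return hit;
    }

  /* L(aligned_more): check the next four aligned VECs one at a time.  */
  const char *p = (const char *) (addr & -(uintptr_t) VEC_SIZE);
  for (int i = 1; i <= 4; i++)
    if ((hit = find_in_block (p + (size_t) i * VEC_SIZE, c, VEC_SIZE)) != NULL)
      return hit;

  /* L(loop_4x_vec): align to 4 * VEC_SIZE and check four VECs per
     iteration until CHAR or the null byte is found.  */
  p = (const char *) (((uintptr_t) p + 5 * VEC_SIZE)
                      & -(uintptr_t) (4 * VEC_SIZE));
  for (;;)
    {
      if ((hit = find_in_block (p, c, 4 * VEC_SIZE)) != NULL)
        return hit;
      p += 4 * VEC_SIZE;
    }
}

/* The only behavioral difference USE_AS_STRCHRNUL controls is the final
   check: strchr returns NULL when the stop was the null byte rather
   than C, while strchrnul returns a pointer to the terminator.  */
const char *strchr_model (const char *s, int c)
{
  const char *r = strchrnul_model (s, c);
  return (*r == (char) c) ? r : NULL;
}
```

Unlike the assembly, which may read aligned vectors past the terminator because such reads never cross a page boundary, the scalar model stops exactly at the null byte, so it is memory safe while following the same block structure.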
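The "Align data to VEC_SIZE - 1" comment in the patch relies on the fact that or-ing in VEC_SIZE - 1 and then loading at displacements 1, VEC_SIZE + 1, ... touches exactly the same aligned blocks as the old andq $-VEC_SIZE followed by displacements VEC_SIZE, VEC_SIZE * 2, ...; the 4-byte saving mentioned there is an instruction-encoding detail that this sketch does not measure. A quick check of the address arithmetic (illustrative C, not part of the patch):

```c
#include <assert.h>
#include <stdint.h>

#define VEC_SIZE 32   /* assumption for the sketch */

int main (void)
{
  for (uintptr_t p = 0x1000; p < 0x1000 + 4 * VEC_SIZE; p++)
    {
      /* Old scheme: andq $-VEC_SIZE, %rdi, then loads at VEC_SIZE(%rdi),
         (VEC_SIZE * 2)(%rdi), ...  */
      uintptr_t down = p & -(uintptr_t) VEC_SIZE;
      /* New scheme: orq $(VEC_SIZE - 1), %rdi, then loads at 1(%rdi),
         (VEC_SIZE + 1)(%rdi), ...  */
      uintptr_t up = p | (VEC_SIZE - 1);
      for (int i = 0; i < 4; i++)
        assert (down + (uintptr_t) (i + 1) * VEC_SIZE
                == up + 1 + (uintptr_t) i * VEC_SIZE);
    }
  return 0;
}
```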