From mboxrd@z Thu Jan 1 00:00:00 1970
From: "H.J. Lu"
Date: Fri, 23 Apr 2021 13:14:51 -0700
Subject: Re: [PATCH v3 1/2] x86: Optimize strchr-avx2.S
To: Noah Goldstein
Cc: GNU C Library, "Carlos O'Donell"
References: <20210421213951.404588-1-goldstein.w.n@gmail.com> <20210422180403.422364-1-goldstein.w.n@gmail.com>
List-Id: Libc-alpha mailing list

On Fri, Apr 23, 2021 at 12:56 PM Noah Goldstein wrote:
>
> On Fri, Apr 23, 2021 at 12:56 PM H.J. Lu wrote:
> >
> > On Thu, Apr 22, 2021 at 02:04:02PM -0400, Noah Goldstein wrote:
> > > No bug. This commit optimizes strchr-avx2.S. The optimizations are all
> > > small things such as save an ALU in the alignment process, saving a
> > > few instructions in the loop return, saving some bytes in the main
> > > loop, and increasing the ILP in the return cases. test-strchr,
> > > test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
> > >
> > > Signed-off-by: Noah Goldstein
> > > ---
> > >  sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++----------
> > >  1 file changed, 173 insertions(+), 121 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > index 25bec38b5d..220165d2ba 100644
> > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > @@ -49,132 +49,144 @@
> > >
> > >  	.section SECTION(.text),"ax",@progbits
> > >  ENTRY (STRCHR)
> > > -	movl %edi, %ecx
> > > -# ifndef USE_AS_STRCHRNUL
> > > -	xorl %edx, %edx
> > > -# endif
> > > -
> > >  	/* Broadcast CHAR to YMM0. */
> > >  	vmovd %esi, %xmm0
> > > +	movl %edi, %eax
> > > +	andl $(PAGE_SIZE - 1), %eax
> > > +	VPBROADCAST %xmm0, %ymm0
> > >  	vpxor %xmm9, %xmm9, %xmm9
> > > -	VPBROADCAST %xmm0, %ymm0
> > >
> > >  	/* Check if we cross page boundary with one vector load. */
> > > -	andl $(PAGE_SIZE - 1), %ecx
> > > -	cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> > > -	ja L(cross_page_boundary)
> > > +	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > > +	ja L(cross_page_boundary)
> > >
> > >  	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
> > >  	   null byte. */
> > >  	vmovdqu (%rdi), %ymm8
> > > -	VPCMPEQ %ymm8, %ymm0, %ymm1
> > > -	VPCMPEQ %ymm8, %ymm9, %ymm2
> > > +	VPCMPEQ %ymm8, %ymm0, %ymm1
> > > +	VPCMPEQ %ymm8, %ymm9, %ymm2
> > >  	vpor %ymm1, %ymm2, %ymm1
> > > -	vpmovmskb %ymm1, %eax
> > > +	vpmovmskb %ymm1, %eax
> >
> > This change isn't needed.
>
> Fixed. For the future what is the rule for when to have a tab after
> the instructions vs just a space?

I prefer a tab if the length of the instruction is less than a tab,
otherwise a space.

> > >  	testl %eax, %eax
> > > -	jz L(more_vecs)
> > > +	jz L(aligned_more)
> > >  	tzcntl %eax, %eax
> > > -	/* Found CHAR or the null byte. */
> > > -	addq %rdi, %rax
> > >  # ifndef USE_AS_STRCHRNUL
> > > -	cmp (%rax), %CHAR_REG
> > > -	cmovne %rdx, %rax
> > > +	/* Found CHAR or the null byte. */
> > > +	cmp (%rdi, %rax), %CHAR_REG
> > > +	jne L(zero)
> > >  # endif
> > > -L(return_vzeroupper):
> > > -	ZERO_UPPER_VEC_REGISTERS_RETURN
> > > -
> > > -	.p2align 4
> > > -L(more_vecs):
> > > -	/* Align data for aligned loads in the loop. */
> > > -	andq $-VEC_SIZE, %rdi
> > > -L(aligned_more):
> > > -
> > > -	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > > -	   since data is only aligned to VEC_SIZE. */
> > > -	vmovdqa VEC_SIZE(%rdi), %ymm8
> > > -	addq $VEC_SIZE, %rdi
> > > -	VPCMPEQ %ymm8, %ymm0, %ymm1
> > > -	VPCMPEQ %ymm8, %ymm9, %ymm2
> > > -	vpor %ymm1, %ymm2, %ymm1
> > > -	vpmovmskb %ymm1, %eax
> > > -	testl %eax, %eax
> > > -	jnz L(first_vec_x0)
> > > -
> > > -	vmovdqa VEC_SIZE(%rdi), %ymm8
> > > -	VPCMPEQ %ymm8, %ymm0, %ymm1
> > > -	VPCMPEQ %ymm8, %ymm9, %ymm2
> > > -	vpor %ymm1, %ymm2, %ymm1
> > > -	vpmovmskb %ymm1, %eax
> > > -	testl %eax, %eax
> > > -	jnz L(first_vec_x1)
> > > -
> > > -	vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
> > > -	VPCMPEQ %ymm8, %ymm0, %ymm1
> > > -	VPCMPEQ %ymm8, %ymm9, %ymm2
> > > -	vpor %ymm1, %ymm2, %ymm1
> > > -	vpmovmskb %ymm1, %eax
> > > -	testl %eax, %eax
> > > -	jnz L(first_vec_x2)
> > > -
> > > -	vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > -	VPCMPEQ %ymm8, %ymm0, %ymm1
> > > -	VPCMPEQ %ymm8, %ymm9, %ymm2
> > > -	vpor %ymm1, %ymm2, %ymm1
> > > -	vpmovmskb %ymm1, %eax
> > > -	testl %eax, %eax
> > > -	jz L(prep_loop_4x)
> > > +	addq %rdi, %rax
> > > +	VZEROUPPER_RETURN
> > >
> > > +	/* .p2align 5 helps keep performance more consistent if ENTRY()
> > > +	   alignment % 32 was either 16 or 0. As well this makes the
> > > +	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
> > > +	   easier. */
> > > +	.p2align 5
> > > +L(first_vec_x4):
> > >  	tzcntl %eax, %eax
> > > -	leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > +	addq $(VEC_SIZE * 3 + 1), %rdi
> > >  # ifndef USE_AS_STRCHRNUL
> > > -	cmp (%rax), %CHAR_REG
> > > -	cmovne %rdx, %rax
> > > +	/* Found CHAR or the null byte. */
> > > +	cmp (%rdi, %rax), %CHAR_REG
> > > +	jne L(zero)
> > >  # endif
> > > +	addq %rdi, %rax
> > >  	VZEROUPPER_RETURN
> > >
> > > -	.p2align 4
> > > -L(first_vec_x0):
> > > -	tzcntl %eax, %eax
> > > -	/* Found CHAR or the null byte. */
> > > -	addq %rdi, %rax
> > >  # ifndef USE_AS_STRCHRNUL
> > > -	cmp (%rax), %CHAR_REG
> > > -	cmovne %rdx, %rax
> > > -# endif
> > > +L(zero):
> > > +	xorl %eax, %eax
> > >  	VZEROUPPER_RETURN
> > > +# endif
> > > +
> > >
> > >  	.p2align 4
> > >  L(first_vec_x1):
> > >  	tzcntl %eax, %eax
> > > -	leaq VEC_SIZE(%rdi, %rax), %rax
> > > +	incq %rdi
> > >  # ifndef USE_AS_STRCHRNUL
> > > -	cmp (%rax), %CHAR_REG
> > > -	cmovne %rdx, %rax
> > > +	/* Found CHAR or the null byte. */
> > > +	cmp (%rdi, %rax), %CHAR_REG
> > > +	jne L(zero)
> > >  # endif
> > > +	addq %rdi, %rax
> > >  	VZEROUPPER_RETURN
> > >
> > >  	.p2align 4
> > >  L(first_vec_x2):
> > >  	tzcntl %eax, %eax
> > > +	addq $(VEC_SIZE + 1), %rdi
> > > +# ifndef USE_AS_STRCHRNUL
> > >  	/* Found CHAR or the null byte. */
> > > -	leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > +	cmp (%rdi, %rax), %CHAR_REG
> > > +	jne L(zero)
> > > +# endif
> > > +	addq %rdi, %rax
> > > +	VZEROUPPER_RETURN
> > > +
> > > +	.p2align 4
> > > +L(first_vec_x3):
> > > +	tzcntl %eax, %eax
> > > +	addq $(VEC_SIZE * 2 + 1), %rdi
> > >  # ifndef USE_AS_STRCHRNUL
> > > -	cmp (%rax), %CHAR_REG
> > > -	cmovne %rdx, %rax
> > > +	/* Found CHAR or the null byte. */
> > > +	cmp (%rdi, %rax), %CHAR_REG
> > > +	jne L(zero)
> > >  # endif
> > > +	addq %rdi, %rax
> > >  	VZEROUPPER_RETURN
> > >
> > > -L(prep_loop_4x):
> > > -	/* Align data to 4 * VEC_SIZE. */
> > > -	andq $-(VEC_SIZE * 4), %rdi
> > > +	.p2align 4
> > > +L(aligned_more):
> > > +	/* Align data to VEC_SIZE - 1. This is the same number of
> > > +	   instructions as using andq -VEC_SIZE but saves 4 bytes of code on
> > > +	   x4 check. */
> > > +	orq $(VEC_SIZE - 1), %rdi
> > > +L(cross_page_continue):
> > > +	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
> > > +	   data is only aligned to VEC_SIZE. */
> > > +	vmovdqa 1(%rdi), %ymm8
> > > +	VPCMPEQ %ymm8, %ymm0, %ymm1
> > > +	VPCMPEQ %ymm8, %ymm9, %ymm2
> > > +	vpor %ymm1, %ymm2, %ymm1
> > > +	vpmovmskb %ymm1, %eax
> >
> > Use a space, not tab, after vpmovmskb.
>
> Fixed.
>
> > > +	testl %eax, %eax
> > > +	jnz L(first_vec_x1)
> > >
> > > +	vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
> > > +	VPCMPEQ %ymm8, %ymm0, %ymm1
> > > +	VPCMPEQ %ymm8, %ymm9, %ymm2
> > > +	vpor %ymm1, %ymm2, %ymm1
> > > +	vpmovmskb %ymm1, %eax
> >
> > Use a space, not tab, after vpmovmskb.
>
> Fixed.
>
> > > +	testl %eax, %eax
> > > +	jnz L(first_vec_x2)
> > > +
> > > +	vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
> > > +	VPCMPEQ %ymm8, %ymm0, %ymm1
> > > +	VPCMPEQ %ymm8, %ymm9, %ymm2
> > > +	vpor %ymm1, %ymm2, %ymm1
> > > +	vpmovmskb %ymm1, %eax
> >
> > Use a space, not tab, after vpmovmskb.
>
> Fixed.
>
> > > +	testl %eax, %eax
> > > +	jnz L(first_vec_x3)
> > > +
> > > +	vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
> > > +	VPCMPEQ %ymm8, %ymm0, %ymm1
> > > +	VPCMPEQ %ymm8, %ymm9, %ymm2
> > > +	vpor %ymm1, %ymm2, %ymm1
> > > +	vpmovmskb %ymm1, %eax
> >
> > Use a space, not tab, after vpmovmskb.
>
> Fixed.
>
> > > +	testl %eax, %eax
> > > +	jnz L(first_vec_x4)
> > > +	/* Align data to VEC_SIZE * 4 - 1. */
> > > +	addq $(VEC_SIZE * 4 + 1), %rdi
> > > +	andq $-(VEC_SIZE * 4), %rdi
> > >  	.p2align 4
> > >  L(loop_4x_vec):
> > >  	/* Compare 4 * VEC at a time forward. */
> > > -	vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > > -	vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > > -	vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > > -	vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > > +	vmovdqa (%rdi), %ymm5
> > > +	vmovdqa (VEC_SIZE)(%rdi), %ymm6
> > > +	vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > > +	vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > >
> > >  	/* Leaves only CHARS matching esi as 0. */
> > >  	vpxor %ymm5, %ymm0, %ymm1
> > > @@ -190,62 +202,102 @@ L(loop_4x_vec):
> > >  	VPMINU %ymm1, %ymm2, %ymm5
> > >  	VPMINU %ymm3, %ymm4, %ymm6
> > >
> > > -	VPMINU %ymm5, %ymm6, %ymm5
> > > +	VPMINU %ymm5, %ymm6, %ymm6
> > >
> > > -	VPCMPEQ %ymm5, %ymm9, %ymm5
> > > -	vpmovmskb %ymm5, %eax
> > > +	VPCMPEQ %ymm6, %ymm9, %ymm6
> > > +	vpmovmskb %ymm6, %ecx
> >
> > Use a space, not tab, after vpmovmskb.
>
> Fixed.
>
> > > +	subq $-(VEC_SIZE * 4), %rdi
> > > +	testl %ecx, %ecx
> > > +	jz L(loop_4x_vec)
> > >
> > > -	addq $(VEC_SIZE * 4), %rdi
> > > -	testl %eax, %eax
> > > -	jz L(loop_4x_vec)
> > >
> > > -	VPCMPEQ %ymm1, %ymm9, %ymm1
> > > -	vpmovmskb %ymm1, %eax
> > > +	VPCMPEQ %ymm1, %ymm9, %ymm1
> > > +	vpmovmskb %ymm1, %eax
> >
> > Use a space, not tab, after vpmovmskb.
>
> Fixed.
>
> > >  	testl %eax, %eax
> > > -	jnz L(first_vec_x0)
> > > +	jnz L(last_vec_x0)
> > > +
> > >
> > > -	VPCMPEQ %ymm2, %ymm9, %ymm2
> > > -	vpmovmskb %ymm2, %eax
> > > +	VPCMPEQ %ymm5, %ymm9, %ymm2
> > > +	vpmovmskb %ymm2, %eax
> >
> > Use a space, not tab, after vpmovmskb.
>
> Fixed.
>
> > >  	testl %eax, %eax
> > > -	jnz L(first_vec_x1)
> > > +	jnz L(last_vec_x1)
> > > +
> > > +	VPCMPEQ %ymm3, %ymm9, %ymm3
> > > +	vpmovmskb %ymm3, %eax
> > > +	/* rcx has combined result from all 4 VEC. It will only be used if
> > > +	   the first 3 other VEC all did not contain a match. */
> > > +	salq $32, %rcx
> > > +	orq %rcx, %rax
> > > +	tzcntq %rax, %rax
> > > +	subq $(VEC_SIZE * 2), %rdi
> > > +# ifndef USE_AS_STRCHRNUL
> > > +	/* Found CHAR or the null byte. */
> > > +	cmp (%rdi, %rax), %CHAR_REG
> > > +	jne L(zero_end)
> > > +# endif
> > > +	addq %rdi, %rax
> > > +	VZEROUPPER_RETURN
> > > +
> > >
> > > -	VPCMPEQ %ymm3, %ymm9, %ymm3
> > > -	VPCMPEQ %ymm4, %ymm9, %ymm4
> > > -	vpmovmskb %ymm3, %ecx
> > > -	vpmovmskb %ymm4, %eax
> > > -	salq $32, %rax
> > > -	orq %rcx, %rax
> > > -	tzcntq %rax, %rax
> > > -	leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > +	.p2align 4
> > > +L(last_vec_x0):
> > > +	tzcntl %eax, %eax
> > > +	addq $-(VEC_SIZE * 4), %rdi
> > >  # ifndef USE_AS_STRCHRNUL
> > > -	cmp (%rax), %CHAR_REG
> > > -	cmovne %rdx, %rax
> > > +	/* Found CHAR or the null byte. */
> > > +	cmp (%rdi, %rax), %CHAR_REG
> > > +	jne L(zero_end)
> > >  # endif
> > > +	addq %rdi, %rax
> > >  	VZEROUPPER_RETURN
> > >
> > > +# ifndef USE_AS_STRCHRNUL
> > > +L(zero_end):
> > > +	xorl %eax, %eax
> > > +	VZEROUPPER_RETURN
> > > +# endif
> > > +
> > > +	.p2align 4
> > > +L(last_vec_x1):
> > > +	tzcntl %eax, %eax
> > > +	subq $(VEC_SIZE * 3), %rdi
> > > +# ifndef USE_AS_STRCHRNUL
> > > +	/* Found CHAR or the null byte. */
> > > +	cmp (%rdi, %rax), %CHAR_REG
> > > +	jne L(zero_end)
> > > +# endif
> > > +	addq %rdi, %rax
> > > +	VZEROUPPER_RETURN
> > > +
> > > +
> > >  	/* Cold case for crossing page with first load. */
> > >  	.p2align 4
> > >  L(cross_page_boundary):
> > > -	andq $-VEC_SIZE, %rdi
> > > -	andl $(VEC_SIZE - 1), %ecx
> > > -
> > > -	vmovdqa (%rdi), %ymm8
> > > -	VPCMPEQ %ymm8, %ymm0, %ymm1
> > > -	VPCMPEQ %ymm8, %ymm9, %ymm2
> > > +	movq %rdi, %rdx
> > > +	/* Align rdi to VEC_SIZE - 1. */
> > > +	orq $(VEC_SIZE - 1), %rdi
> > > +	vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
> > > +	VPCMPEQ %ymm8, %ymm0, %ymm1
> > > +	VPCMPEQ %ymm8, %ymm9, %ymm2
> > >  	vpor %ymm1, %ymm2, %ymm1
> > > -	vpmovmskb %ymm1, %eax
> > > -	/* Remove the leading bits. */
> > > -	sarxl %ecx, %eax, %eax
> > > +	vpmovmskb %ymm1, %eax
> > > +	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > > +	   so no need to manually mod edx. */
> > > +	sarxl %edx, %eax, %eax
> > >  	testl %eax, %eax
> > > -	jz L(aligned_more)
> > > +	jz L(cross_page_continue)
> > >  	tzcntl %eax, %eax
> > > -	addq %rcx, %rdi
> > > -	addq %rdi, %rax
> > >  # ifndef USE_AS_STRCHRNUL
> > > -	cmp (%rax), %CHAR_REG
> > > -	cmovne %rdx, %rax
> > > +	xorl %ecx, %ecx
> > > +	/* Found CHAR or the null byte. */
> > > +	cmp (%rdx, %rax), %CHAR_REG
> > > +	leaq (%rdx, %rax), %rax
> > > +	cmovne %rcx, %rax
> > > +# else
> > > +	addq %rdx, %rax
> > >  # endif
> > > -	VZEROUPPER_RETURN
> > > +L(return_vzeroupper):
> > > +	ZERO_UPPER_VEC_REGISTERS_RETURN
> > >
> > >  END (STRCHR)
> > >  # endif
> > > --
> > > 2.29.2
> > >
> >
> > Thanks.
> >
> > H.J.

--
H.J.
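
For reference, the entry check quoted above ("andl $(PAGE_SIZE - 1), %eax",
"cmpl $(PAGE_SIZE - VEC_SIZE), %eax", "ja L(cross_page_boundary)") decides
whether an unaligned 32-byte first load would cross into the next page, which
might be unmapped. The C sketch below restates that condition; it is not code
from the patch, the helper name load_crosses_page is purely illustrative, and
PAGE_SIZE and VEC_SIZE are assumed to have the AVX2 build values 4096 and 32.

    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE  32   /* one YMM vector */

    /* Nonzero if a VEC_SIZE-byte load starting at s would read past the
       end of the page containing s, so the plain unaligned first-vector
       load is not safe and the cold L(cross_page_boundary) path is used.  */
    static inline int
    load_crosses_page (const char *s)
    {
      return ((uintptr_t) s & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
    }

When the condition holds, the patch instead loads from the preceding
32-byte-aligned address and shifts the leading bytes out of the match mask
with sarxl, so the low bits of the original pointer can be used directly as
the shift count without masking.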