From: Noah Goldstein
Date: Fri, 23 Apr 2021 15:55:57 -0400
Subject: Re: [PATCH v3 1/2] x86: Optimize strchr-avx2.S
To: "H.J. Lu"
Cc: GNU C Library, "Carlos O'Donell"
List-Id: Libc-alpha mailing list
References: <20210421213951.404588-1-goldstein.w.n@gmail.com>
 <20210422180403.422364-1-goldstein.w.n@gmail.com>

On Fri, Apr 23, 2021 at 12:56 PM H.J. Lu wrote:
>
> On Thu, Apr 22, 2021 at 02:04:02PM -0400, Noah Goldstein wrote:
> > No bug. This commit optimizes strchr-avx2.S. The optimizations are all
> > small things such as saving an ALU in the alignment process, saving a
> > few instructions in the loop return, saving some bytes in the main
> > loop, and increasing the ILP in the return cases. test-strchr,
> > test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
> >
> > Signed-off-by: Noah Goldstein
> > ---
> >  sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++----------
> >  1 file changed, 173 insertions(+), 121 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > index 25bec38b5d..220165d2ba 100644
> > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > @@ -49,132 +49,144 @@
> >
> >  	.section SECTION(.text),"ax",@progbits
> >  ENTRY (STRCHR)
> > -	movl	%edi, %ecx
> > -# ifndef USE_AS_STRCHRNUL
> > -	xorl	%edx, %edx
> > -# endif
> > -
> >  	/* Broadcast CHAR to YMM0.  */
> >  	vmovd	%esi, %xmm0
> > +	movl	%edi, %eax
> > +	andl	$(PAGE_SIZE - 1), %eax
> > +	VPBROADCAST	%xmm0, %ymm0
> >  	vpxor	%xmm9, %xmm9, %xmm9
> > -	VPBROADCAST	%xmm0, %ymm0
> >
> >  	/* Check if we cross page boundary with one vector load.  */
> > -	andl	$(PAGE_SIZE - 1), %ecx
> > -	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
> > -	ja	L(cross_page_boundary)
> > +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> > +	ja	L(cross_page_boundary)
> >
> >  	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
> >  	   null byte.  */
> >  	vmovdqu	(%rdi), %ymm8
> > -	VPCMPEQ %ymm8, %ymm0, %ymm1
> > -	VPCMPEQ %ymm8, %ymm9, %ymm2
> > +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > +	VPCMPEQ	%ymm8, %ymm9, %ymm2
> >  	vpor	%ymm1, %ymm2, %ymm1
> > -	vpmovmskb %ymm1, %eax
> > +	vpmovmskb	%ymm1, %eax
>
> This change isn't needed.

Fixed. For the future, what is the rule for when to have a tab after the
instructions vs. just a space?

> >  	testl	%eax, %eax
> > -	jz	L(more_vecs)
> > +	jz	L(aligned_more)
> >  	tzcntl	%eax, %eax
> > -	/* Found CHAR or the null byte.  */
> > -	addq	%rdi, %rax
> >  # ifndef USE_AS_STRCHRNUL
> > -	cmp	(%rax), %CHAR_REG
> > -	cmovne	%rdx, %rax
> > +	/* Found CHAR or the null byte.  */
> > +	cmp	(%rdi, %rax), %CHAR_REG
> > +	jne	L(zero)
> >  # endif
> > -L(return_vzeroupper):
> > -	ZERO_UPPER_VEC_REGISTERS_RETURN
> > -
> > -	.p2align 4
> > -L(more_vecs):
> > -	/* Align data for aligned loads in the loop.  */
> > -	andq	$-VEC_SIZE, %rdi
> > -L(aligned_more):
> > -
> > -	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > -	   since data is only aligned to VEC_SIZE.  */
> > -	vmovdqa	VEC_SIZE(%rdi), %ymm8
> > -	addq	$VEC_SIZE, %rdi
> > -	VPCMPEQ %ymm8, %ymm0, %ymm1
> > -	VPCMPEQ %ymm8, %ymm9, %ymm2
> > -	vpor	%ymm1, %ymm2, %ymm1
> > -	vpmovmskb %ymm1, %eax
> > -	testl	%eax, %eax
> > -	jnz	L(first_vec_x0)
> > -
> > -	vmovdqa	VEC_SIZE(%rdi), %ymm8
> > -	VPCMPEQ %ymm8, %ymm0, %ymm1
> > -	VPCMPEQ %ymm8, %ymm9, %ymm2
> > -	vpor	%ymm1, %ymm2, %ymm1
> > -	vpmovmskb %ymm1, %eax
> > -	testl	%eax, %eax
> > -	jnz	L(first_vec_x1)
> > -
> > -	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
> > -	VPCMPEQ %ymm8, %ymm0, %ymm1
> > -	VPCMPEQ %ymm8, %ymm9, %ymm2
> > -	vpor	%ymm1, %ymm2, %ymm1
> > -	vpmovmskb %ymm1, %eax
> > -	testl	%eax, %eax
> > -	jnz	L(first_vec_x2)
> > -
> > -	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
> > -	VPCMPEQ %ymm8, %ymm0, %ymm1
> > -	VPCMPEQ %ymm8, %ymm9, %ymm2
> > -	vpor	%ymm1, %ymm2, %ymm1
> > -	vpmovmskb %ymm1, %eax
> > -	testl	%eax, %eax
> > -	jz	L(prep_loop_4x)
> > +	addq	%rdi, %rax
> > +	VZEROUPPER_RETURN
> >
> > +	/* .p2align 5 helps keep performance more consistent if ENTRY()
> > +	   alignment % 32 was either 16 or 0.  As well this makes the
> > +	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
> > +	   easier.  */
> > +	.p2align 5
> > +L(first_vec_x4):
> >  	tzcntl	%eax, %eax
> > -	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
> > +	addq	$(VEC_SIZE * 3 + 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> > -	cmp	(%rax), %CHAR_REG
> > -	cmovne	%rdx, %rax
> > +	/* Found CHAR or the null byte.  */
> > +	cmp	(%rdi, %rax), %CHAR_REG
> > +	jne	L(zero)
> >  # endif
> > +	addq	%rdi, %rax
> >  	VZEROUPPER_RETURN
> >
> > -	.p2align 4
> > -L(first_vec_x0):
> > -	tzcntl	%eax, %eax
> > -	/* Found CHAR or the null byte.  */
> > -	addq	%rdi, %rax
> >  # ifndef USE_AS_STRCHRNUL
> > -	cmp	(%rax), %CHAR_REG
> > -	cmovne	%rdx, %rax
> > -# endif
> > +L(zero):
> > +	xorl	%eax, %eax
> >  	VZEROUPPER_RETURN
> > +# endif
> > +
> >
> >  	.p2align 4
> >  L(first_vec_x1):
> >  	tzcntl	%eax, %eax
> > -	leaq	VEC_SIZE(%rdi, %rax), %rax
> > +	incq	%rdi
> >  # ifndef USE_AS_STRCHRNUL
> > -	cmp	(%rax), %CHAR_REG
> > -	cmovne	%rdx, %rax
> > +	/* Found CHAR or the null byte.  */
> > +	cmp	(%rdi, %rax), %CHAR_REG
> > +	jne	L(zero)
> >  # endif
> > +	addq	%rdi, %rax
> >  	VZEROUPPER_RETURN
> >
> >  	.p2align 4
> >  L(first_vec_x2):
> >  	tzcntl	%eax, %eax
> > +	addq	$(VEC_SIZE + 1), %rdi
> > +# ifndef USE_AS_STRCHRNUL
> >  	/* Found CHAR or the null byte.  */
> > -	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
> > +	cmp	(%rdi, %rax), %CHAR_REG
> > +	jne	L(zero)
> > +# endif
> > +	addq	%rdi, %rax
> > +	VZEROUPPER_RETURN
> > +
> > +	.p2align 4
> > +L(first_vec_x3):
> > +	tzcntl	%eax, %eax
> > +	addq	$(VEC_SIZE * 2 + 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> > -	cmp	(%rax), %CHAR_REG
> > -	cmovne	%rdx, %rax
> > +	/* Found CHAR or the null byte.  */
> > +	cmp	(%rdi, %rax), %CHAR_REG
> > +	jne	L(zero)
> >  # endif
> > +	addq	%rdi, %rax
> >  	VZEROUPPER_RETURN
> >
> > -L(prep_loop_4x):
> > -	/* Align data to 4 * VEC_SIZE.  */
> > -	andq	$-(VEC_SIZE * 4), %rdi
> > +	.p2align 4
> > +L(aligned_more):
> > +	/* Align data to VEC_SIZE - 1.  This is the same number of
> > +	   instructions as using andq -VEC_SIZE but saves 4 bytes of code on
> > +	   x4 check.  */
> > +	orq	$(VEC_SIZE - 1), %rdi
> > +L(cross_page_continue):
> > +	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time since
> > +	   data is only aligned to VEC_SIZE.  */
> > +	vmovdqa	1(%rdi), %ymm8
> > +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > +	VPCMPEQ	%ymm8, %ymm9, %ymm2
> > +	vpor	%ymm1, %ymm2, %ymm1
> > +	vpmovmskb	%ymm1, %eax
>
> Use a space, not tab, after vpmovmskb.

Fixed.

> > +	testl	%eax, %eax
> > +	jnz	L(first_vec_x1)
> >
> > +	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
> > +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > +	VPCMPEQ	%ymm8, %ymm9, %ymm2
> > +	vpor	%ymm1, %ymm2, %ymm1
> > +	vpmovmskb	%ymm1, %eax
>
> Use a space, not tab, after vpmovmskb.
>
Fixed.

> > +	testl	%eax, %eax
> > +	jnz	L(first_vec_x2)
> > +
> > +	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
> > +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > +	VPCMPEQ	%ymm8, %ymm9, %ymm2
> > +	vpor	%ymm1, %ymm2, %ymm1
> > +	vpmovmskb	%ymm1, %eax
>
> Use a space, not tab, after vpmovmskb.
>
Fixed.

> > +	testl	%eax, %eax
> > +	jnz	L(first_vec_x3)
> > +
> > +	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
> > +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > +	VPCMPEQ	%ymm8, %ymm9, %ymm2
> > +	vpor	%ymm1, %ymm2, %ymm1
> > +	vpmovmskb	%ymm1, %eax
>
> Use a space, not tab, after vpmovmskb.
>
Fixed.

> > +	testl	%eax, %eax
> > +	jnz	L(first_vec_x4)
> > +	/* Align data to VEC_SIZE * 4 - 1.  */
> > +	addq	$(VEC_SIZE * 4 + 1), %rdi
> > +	andq	$-(VEC_SIZE * 4), %rdi
> >  	.p2align 4
> >  L(loop_4x_vec):
> >  	/* Compare 4 * VEC at a time forward.  */
> > -	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
> > -	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
> > -	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
> > -	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
> > +	vmovdqa	(%rdi), %ymm5
> > +	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
> > +	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
> > +	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
> >
> >  	/* Leaves only CHARS matching esi as 0.  */
> >  	vpxor	%ymm5, %ymm0, %ymm1
> > @@ -190,62 +202,102 @@ L(loop_4x_vec):
> >  	VPMINU	%ymm1, %ymm2, %ymm5
> >  	VPMINU	%ymm3, %ymm4, %ymm6
> >
> > -	VPMINU	%ymm5, %ymm6, %ymm5
> > +	VPMINU	%ymm5, %ymm6, %ymm6
> >
> > -	VPCMPEQ %ymm5, %ymm9, %ymm5
> > -	vpmovmskb %ymm5, %eax
> > +	VPCMPEQ	%ymm6, %ymm9, %ymm6
> > +	vpmovmskb	%ymm6, %ecx
>
> Use a space, not tab, after vpmovmskb.
>
Fixed.

> > +	subq	$-(VEC_SIZE * 4), %rdi
> > +	testl	%ecx, %ecx
> > +	jz	L(loop_4x_vec)
> >
> > -	addq	$(VEC_SIZE * 4), %rdi
> > -	testl	%eax, %eax
> > -	jz	L(loop_4x_vec)
> >
> > -	VPCMPEQ %ymm1, %ymm9, %ymm1
> > -	vpmovmskb %ymm1, %eax
> > +	VPCMPEQ	%ymm1, %ymm9, %ymm1
> > +	vpmovmskb	%ymm1, %eax
>
> Use a space, not tab, after vpmovmskb.
>
Fixed.

> >  	testl	%eax, %eax
> > -	jnz	L(first_vec_x0)
> > +	jnz	L(last_vec_x0)
> > +
> >
> > -	VPCMPEQ %ymm2, %ymm9, %ymm2
> > -	vpmovmskb %ymm2, %eax
> > +	VPCMPEQ	%ymm5, %ymm9, %ymm2
> > +	vpmovmskb	%ymm2, %eax
>
> Use a space, not tab, after vpmovmskb.
>
Fixed.

> >  	testl	%eax, %eax
> > -	jnz	L(first_vec_x1)
> > +	jnz	L(last_vec_x1)
> > +
> > +	VPCMPEQ	%ymm3, %ymm9, %ymm3
> > +	vpmovmskb	%ymm3, %eax
> > +	/* rcx has combined result from all 4 VEC.  It will only be used if
> > +	   the first 3 other VEC all did not contain a match.  */
> > +	salq	$32, %rcx
> > +	orq	%rcx, %rax
> > +	tzcntq	%rax, %rax
> > +	subq	$(VEC_SIZE * 2), %rdi
> > +# ifndef USE_AS_STRCHRNUL
> > +	/* Found CHAR or the null byte.  */
> > +	cmp	(%rdi, %rax), %CHAR_REG
> > +	jne	L(zero_end)
> > +# endif
> > +	addq	%rdi, %rax
> > +	VZEROUPPER_RETURN
> > +
> >
> > -	VPCMPEQ %ymm3, %ymm9, %ymm3
> > -	VPCMPEQ %ymm4, %ymm9, %ymm4
> > -	vpmovmskb %ymm3, %ecx
> > -	vpmovmskb %ymm4, %eax
> > -	salq	$32, %rax
> > -	orq	%rcx, %rax
> > -	tzcntq	%rax, %rax
> > -	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
> > +	.p2align 4
> > +L(last_vec_x0):
> > +	tzcntl	%eax, %eax
> > +	addq	$-(VEC_SIZE * 4), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> > -	cmp	(%rax), %CHAR_REG
> > -	cmovne	%rdx, %rax
> > +	/* Found CHAR or the null byte.  */
> > +	cmp	(%rdi, %rax), %CHAR_REG
> > +	jne	L(zero_end)
> >  # endif
> > +	addq	%rdi, %rax
> >  	VZEROUPPER_RETURN
> >
> > +# ifndef USE_AS_STRCHRNUL
> > +L(zero_end):
> > +	xorl	%eax, %eax
> > +	VZEROUPPER_RETURN
> > +# endif
> > +
> > +	.p2align 4
> > +L(last_vec_x1):
> > +	tzcntl	%eax, %eax
> > +	subq	$(VEC_SIZE * 3), %rdi
> > +# ifndef USE_AS_STRCHRNUL
> > +	/* Found CHAR or the null byte.  */
> > +	cmp	(%rdi, %rax), %CHAR_REG
> > +	jne	L(zero_end)
> > +# endif
> > +	addq	%rdi, %rax
> > +	VZEROUPPER_RETURN
> > +
> > +
> >  	/* Cold case for crossing page with first load.  */
> >  	.p2align 4
> >  L(cross_page_boundary):
> > -	andq	$-VEC_SIZE, %rdi
> > -	andl	$(VEC_SIZE - 1), %ecx
> > -
> > -	vmovdqa	(%rdi), %ymm8
> > -	VPCMPEQ %ymm8, %ymm0, %ymm1
> > -	VPCMPEQ %ymm8, %ymm9, %ymm2
> > +	movq	%rdi, %rdx
> > +	/* Align rdi to VEC_SIZE - 1.  */
> > +	orq	$(VEC_SIZE - 1), %rdi
> > +	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
> > +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > +	VPCMPEQ	%ymm8, %ymm9, %ymm2
> >  	vpor	%ymm1, %ymm2, %ymm1
> > -	vpmovmskb %ymm1, %eax
> > -	/* Remove the leading bits.  */
> > -	sarxl	%ecx, %eax, %eax
> > +	vpmovmskb	%ymm1, %eax
> > +	/* Remove the leading bytes.  sarxl only uses bits [5:0] of COUNT
> > +	   so no need to manually mod edx.  */
> > +	sarxl	%edx, %eax, %eax
> >  	testl	%eax, %eax
> > -	jz	L(aligned_more)
> > +	jz	L(cross_page_continue)
> >  	tzcntl	%eax, %eax
> > -	addq	%rcx, %rdi
> > -	addq	%rdi, %rax
> >  # ifndef USE_AS_STRCHRNUL
> > -	cmp	(%rax), %CHAR_REG
> > -	cmovne	%rdx, %rax
> > +	xorl	%ecx, %ecx
> > +	/* Found CHAR or the null byte.  */
> > +	cmp	(%rdx, %rax), %CHAR_REG
> > +	leaq	(%rdx, %rax), %rax
> > +	cmovne	%rcx, %rax
> > +# else
> > +	addq	%rdx, %rax
> >  # endif
> > -	VZEROUPPER_RETURN
> > +L(return_vzeroupper):
> > +	ZERO_UPPER_VEC_REGISTERS_RETURN
> >
> >  END (STRCHR)
> > # endif
> > --
> > 2.29.2
> >
>
> Thanks.
>
> H.J.
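
For readers skimming the diff above, the per-vector check that every block
repeats (VPCMPEQ against CHAR, VPCMPEQ against the zeroed register, vpor,
vpmovmskb, tzcnt) boils down to the rough C/AVX2 sketch below. It is an
illustration only, not part of the patch or of glibc: the function name and
test buffer are invented for the example, and it ignores the page-cross and
alignment handling the real code performs before loading 32 bytes. The main
loop applies the same idea four vectors at a time, folding them with vpxor
and VPMINU so a single vpmovmskb per iteration decides whether to continue.

/* Illustration only -- not from the patch.  Returns the index of the first
   byte in the 32-byte block at P equal to either C or '\0', or 32 if there
   is none.  P must be readable for 32 bytes; the real code guarantees that
   via its page-cross and alignment logic.  Build with -mavx2.  */
#include <immintrin.h>
#include <stdio.h>

static unsigned int
first_match_index (const char *p, int c)
{
  __m256i chunk = _mm256_loadu_si256 ((const __m256i *) p);
  __m256i vchar = _mm256_set1_epi8 ((char) c);	/* broadcast CHAR.  */
  __m256i vzero = _mm256_setzero_si256 ();	/* the null byte.  */
  __m256i hits = _mm256_or_si256 (_mm256_cmpeq_epi8 (chunk, vchar),
				  _mm256_cmpeq_epi8 (chunk, vzero));
  unsigned int mask = (unsigned int) _mm256_movemask_epi8 (hits);
  /* tzcnt of the mask gives the byte offset of the first match.  */
  return mask ? (unsigned int) __builtin_ctz (mask) : 32;
}

int
main (void)
{
  char buf[64] = "find the letter q somewhere in this buffer";
  printf ("first match at byte %u\n", first_match_index (buf, 'q'));
  return 0;
}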