From mboxrd@z Thu Jan 1 00:00:00 1970
From: Sunil Pandey
Date: Wed, 27 Apr 2022 16:43:29 -0700
Subject: Re: [PATCH v4 1/2] x86: Refactor and improve performance of strchr-avx2.S
To: Noah Goldstein, libc-stable@sourceware.org
Cc: "H.J. Lu", GNU C Library
References: <20210203053900.4125403-1-goldstein.w.n@gmail.com>
List-Id: Libc-stable mailing list

On Mon, Feb 8, 2021 at 1:46 PM Noah Goldstein via Libc-alpha wrote:
>
> On Mon, Feb 8, 2021 at 2:48 PM Noah Goldstein wrote:
> >
> > On Mon, Feb 8, 2021 at 2:33 PM H.J. Lu wrote:
> > >
> > > On Mon, Feb 8, 2021 at 6:08 AM H.J. Lu wrote:
> > > >
> > > > On Tue, Feb 2, 2021 at 9:39 PM wrote:
> > > > >
> > > > > From: noah
> > > > >
> > > > > No bug. Just seemed the performance could be improved a bit. Observed
> > > > > and expected behavior are unchanged. Optimized body of main
> > > > > loop. Updated page cross logic and optimized accordingly. Made a few
> > > > > minor instruction selection modifications. No regressions in test
> > > > > suite. Both test-strchrnul and test-strchr passed.
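
A quick aside on the "page cross logic" mentioned above, for anyone reviewing
the backport: the old entry check branched to the slow path based on alignment
modulo 2 * VEC_SIZE, while the new check branches only when a full 32-byte
load would actually straddle a page. A minimal C sketch of the new condition
follows; PAGE_SIZE and VEC_SIZE mirror the macros in the patch, but
may_cross_page is an invented name for illustration, not code from the patch:

    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE 32

    /* Nonzero iff a VEC_SIZE-byte load starting at S would cross a page
       boundary -- what the new andl/cmpl/ja sequence computes before the
       first unaligned vector load.  */
    static inline int
    may_cross_page (const char *s)
    {
      return ((uintptr_t) s & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
    }
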
> > > > >
> > > > > Signed-off-by: noah
> > > > > ---
> > > > >  sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> > > > >  sysdeps/x86_64/multiarch/strchr.c      |   1 +
> > > > >  2 files changed, 118 insertions(+), 118 deletions(-)
> > > > >
> > > > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > > index d416558d04..8b9d78b55a 100644
> > > > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > > @@ -27,10 +27,12 @@
> > > > >  # ifdef USE_AS_WCSCHR
> > > > >  #  define VPBROADCAST vpbroadcastd
> > > > >  #  define VPCMPEQ vpcmpeqd
> > > > > +#  define VPMINU vpminud
> > > > >  #  define CHAR_REG esi
> > > > >  # else
> > > > >  #  define VPBROADCAST vpbroadcastb
> > > > >  #  define VPCMPEQ vpcmpeqb
> > > > > +#  define VPMINU vpminub
> > > > >  #  define CHAR_REG sil
> > > > >  # endif
> > > > >
> > > > > @@ -39,20 +41,26 @@
> > > > >  # endif
> > > > >
> > > > >  # define VEC_SIZE 32
> > > > > +# define PAGE_SIZE 4096
> > > > >
> > > > >  	.section .text.avx,"ax",@progbits
> > > > >  ENTRY (STRCHR)
> > > > >  	movl	%edi, %ecx
> > > > > -	/* Broadcast CHAR to YMM0.  */
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +	xorl	%edx, %edx
> > > > > +# endif
> > > > > +
> > > > > +	/* Broadcast CHAR to YMM0.  */
> > > > >  	vmovd	%esi, %xmm0
> > > > >  	vpxor	%xmm9, %xmm9, %xmm9
> > > > >  	VPBROADCAST	%xmm0, %ymm0
> > > > > -	/* Check if we may cross page boundary with one vector load.  */
> > > > > -	andl	$(2 * VEC_SIZE - 1), %ecx
> > > > > -	cmpl	$VEC_SIZE, %ecx
> > > > > -	ja	L(cros_page_boundary)
> > > > > -
> > > > > -	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
> > > > > +
> > > > > +	/* Check if we cross page boundary with one vector load.  */
> > > > > +	andl	$(PAGE_SIZE - 1), %ecx
> > > > > +	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
> > > > > +	ja	L(cross_page_boundary)
> > > > > +
> > > > > +	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
> > > > >  	   null byte.  */
> > > > >  	vmovdqu	(%rdi), %ymm8
> > > > >  	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > > > > @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> > > > >  	vpor	%ymm1, %ymm2, %ymm1
> > > > >  	vpmovmskb	%ymm1, %eax
> > > > >  	testl	%eax, %eax
> > > > > -	jnz	L(first_vec_x0)
> > > > > -
> > > > > -	/* Align data for aligned loads in the loop.  */
> > > > > -	addq	$VEC_SIZE, %rdi
> > > > > -	andl	$(VEC_SIZE - 1), %ecx
> > > > > -	andq	$-VEC_SIZE, %rdi
> > > > > -
> > > > > -	jmp	L(more_4x_vec)
> > > > > -
> > > > > -	.p2align 4
> > > > > -L(cros_page_boundary):
> > > > > -	andl	$(VEC_SIZE - 1), %ecx
> > > > > -	andq	$-VEC_SIZE, %rdi
> > > > > -	vmovdqu	(%rdi), %ymm8
> > > > > -	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > > > > -	VPCMPEQ	%ymm8, %ymm9, %ymm2
> > > > > -	vpor	%ymm1, %ymm2, %ymm1
> > > > > -	vpmovmskb	%ymm1, %eax
> > > > > -	/* Remove the leading bytes.  */
> > > > > -	sarl	%cl, %eax
> > > > > -	testl	%eax, %eax
> > > > > -	jz	L(aligned_more)
> > > > > -	/* Found CHAR or the null byte.  */
> > > > > +	jz	L(more_vecs)
> > > > >  	tzcntl	%eax, %eax
> > > > > -	addq	%rcx, %rax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > +	/* Found CHAR or the null byte.  */
> > > > >  	addq	%rdi, %rax
> > > > > -# else
> > > > > -	xorl	%edx, %edx
> > > > > -	leaq	(%rdi, %rax), %rax
> > > > > -	cmp	(%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +	cmp	(%rax), %CHAR_REG
> > > > >  	cmovne	%rdx, %rax
> > > > >  # endif
> > > > >  	VZEROUPPER
> > > > >  	ret
> > > > >
> > > > >  	.p2align 4
> > > > > +L(more_vecs):
> > > > > +	/* Align data for aligned loads in the loop.  */
> > > > > +	andq	$-VEC_SIZE, %rdi
> > > > >  L(aligned_more):
> > > > > -	addq	$VEC_SIZE, %rdi
> > > > >
> > > > > -L(more_4x_vec):
> > > > > -	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > > > -	   since data is only aligned to VEC_SIZE.  */
> > > > > -	vmovdqa	(%rdi), %ymm8
> > > > > +	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > > > +	   since data is only aligned to VEC_SIZE.  */
> > > > > +	vmovdqa	VEC_SIZE(%rdi), %ymm8
> > > > > +	addq	$VEC_SIZE, %rdi
> > > > >  	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > > > >  	VPCMPEQ	%ymm8, %ymm9, %ymm2
> > > > >  	vpor	%ymm1, %ymm2, %ymm1
> > > > > @@ -125,7 +110,7 @@ L(more_4x_vec):
> > > > >  	vpor	%ymm1, %ymm2, %ymm1
> > > > >  	vpmovmskb	%ymm1, %eax
> > > > >  	testl	%eax, %eax
> > > > > -	jnz	L(first_vec_x2)
> > > > > +	jnz	L(first_vec_x2)
> > > > >
> > > > >  	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
> > > > >  	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > > > > @@ -133,122 +118,136 @@ L(more_4x_vec):
> > > > >  	vpor	%ymm1, %ymm2, %ymm1
> > > > >  	vpmovmskb	%ymm1, %eax
> > > > >  	testl	%eax, %eax
> > > > > -	jnz	L(first_vec_x3)
> > > > > -
> > > > > -	addq	$(VEC_SIZE * 4), %rdi
> > > > > -
> > > > > -	/* Align data to 4 * VEC_SIZE.  */
> > > > > -	movq	%rdi, %rcx
> > > > > -	andl	$(4 * VEC_SIZE - 1), %ecx
> > > > > -	andq	$-(4 * VEC_SIZE), %rdi
> > > > > -
> > > > > -	.p2align 4
> > > > > -L(loop_4x_vec):
> > > > > -	/* Compare 4 * VEC at a time forward.  */
> > > > > -	vmovdqa	(%rdi), %ymm5
> > > > > -	vmovdqa	VEC_SIZE(%rdi), %ymm6
> > > > > -	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
> > > > > -	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
> > > > > -
> > > > > -	VPCMPEQ	%ymm5, %ymm0, %ymm1
> > > > > -	VPCMPEQ	%ymm6, %ymm0, %ymm2
> > > > > -	VPCMPEQ	%ymm7, %ymm0, %ymm3
> > > > > -	VPCMPEQ	%ymm8, %ymm0, %ymm4
> > > > > -
> > > > > -	VPCMPEQ	%ymm5, %ymm9, %ymm5
> > > > > -	VPCMPEQ	%ymm6, %ymm9, %ymm6
> > > > > -	VPCMPEQ	%ymm7, %ymm9, %ymm7
> > > > > -	VPCMPEQ	%ymm8, %ymm9, %ymm8
> > > > > -
> > > > > -	vpor	%ymm1, %ymm5, %ymm1
> > > > > -	vpor	%ymm2, %ymm6, %ymm2
> > > > > -	vpor	%ymm3, %ymm7, %ymm3
> > > > > -	vpor	%ymm4, %ymm8, %ymm4
> > > > > -
> > > > > -	vpor	%ymm1, %ymm2, %ymm5
> > > > > -	vpor	%ymm3, %ymm4, %ymm6
> > > > > -
> > > > > -	vpor	%ymm5, %ymm6, %ymm5
> > > > > -
> > > > > -	vpmovmskb	%ymm5, %eax
> > > > > -	testl	%eax, %eax
> > > > > -	jnz	L(4x_vec_end)
> > > > > -
> > > > > -	addq	$(VEC_SIZE * 4), %rdi
> > > > > +	jz	L(prep_loop_4x)
> > > > >
> > > > > -	jmp	L(loop_4x_vec)
> > > > > +	tzcntl	%eax, %eax
> > > > > +	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +	cmp	(%rax), %CHAR_REG
> > > > > +	cmovne	%rdx, %rax
> > > > > +# endif
> > > > > +	VZEROUPPER
> > > > > +	ret
> > > > >
> > > > >  	.p2align 4
> > > > >  L(first_vec_x0):
> > > > > -	/* Found CHAR or the null byte.  */
> > > > >  	tzcntl	%eax, %eax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > +	/* Found CHAR or the null byte.  */
> > > > >  	addq	%rdi, %rax
> > > > > -# else
> > > > > -	xorl	%edx, %edx
> > > > > -	leaq	(%rdi, %rax), %rax
> > > > > -	cmp	(%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +	cmp	(%rax), %CHAR_REG
> > > > >  	cmovne	%rdx, %rax
> > > > >  # endif
> > > > >  	VZEROUPPER
> > > > >  	ret
> > > > > -
> > > > > +
> > > > >  	.p2align 4
> > > > >  L(first_vec_x1):
> > > > >  	tzcntl	%eax, %eax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > -	addq	$VEC_SIZE, %rax
> > > > > -	addq	%rdi, %rax
> > > > > -# else
> > > > > -	xorl	%edx, %edx
> > > > >  	leaq	VEC_SIZE(%rdi, %rax), %rax
> > > > > -	cmp	(%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +	cmp	(%rax), %CHAR_REG
> > > > >  	cmovne	%rdx, %rax
> > > > >  # endif
> > > > >  	VZEROUPPER
> > > > > -	ret
> > > > > -
> > > > > +	ret
> > > > > +
> > > > >  	.p2align 4
> > > > >  L(first_vec_x2):
> > > > >  	tzcntl	%eax, %eax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > -	addq	$(VEC_SIZE * 2), %rax
> > > > > -	addq	%rdi, %rax
> > > > > -# else
> > > > > -	xorl	%edx, %edx
> > > > > +	/* Found CHAR or the null byte.  */
> > > > >  	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > > -	cmp	(%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +	cmp	(%rax), %CHAR_REG
> > > > >  	cmovne	%rdx, %rax
> > > > >  # endif
> > > > >  	VZEROUPPER
> > > > >  	ret
> > > > > +
> > > > > +L(prep_loop_4x):
> > > > > +	/* Align data to 4 * VEC_SIZE.  */
> > > > > +	andq	$-(VEC_SIZE * 4), %rdi
> > > > >
> > > > >  	.p2align 4
> > > > > -L(4x_vec_end):
> > > > > +L(loop_4x_vec):
> > > > > +	/* Compare 4 * VEC at a time forward.  */
> > > > > +	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
> > > > > +	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
> > > > > +	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
> > > > > +	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
> > > > > +
> > > > > +	/* Leaves only CHARS matching esi as 0.  */
> > > > > +	vpxor	%ymm5, %ymm0, %ymm1
> > > > > +	vpxor	%ymm6, %ymm0, %ymm2
> > > > > +	vpxor	%ymm7, %ymm0, %ymm3
> > > > > +	vpxor	%ymm8, %ymm0, %ymm4
> > > > > +
> > > > > +	VPMINU	%ymm1, %ymm5, %ymm1
> > > > > +	VPMINU	%ymm2, %ymm6, %ymm2
> > > > > +	VPMINU	%ymm3, %ymm7, %ymm3
> > > > > +	VPMINU	%ymm4, %ymm8, %ymm4
> > > > > +
> > > > > +	VPMINU	%ymm1, %ymm2, %ymm5
> > > > > +	VPMINU	%ymm3, %ymm4, %ymm6
> > > > > +
> > > > > +	VPMINU	%ymm5, %ymm6, %ymm5
> > > > > +
> > > > > +	VPCMPEQ	%ymm5, %ymm9, %ymm5
> > > > > +	vpmovmskb	%ymm5, %eax
> > > > > +
> > > > > +	addq	$(VEC_SIZE * 4), %rdi
> > > > > +	testl	%eax, %eax
> > > > > +	jz	L(loop_4x_vec)
> > > > > +
> > > > > +	VPCMPEQ	%ymm1, %ymm9, %ymm1
> > > > >  	vpmovmskb	%ymm1, %eax
> > > > >  	testl	%eax, %eax
> > > > >  	jnz	L(first_vec_x0)
> > > > > +
> > > > > +	VPCMPEQ	%ymm2, %ymm9, %ymm2
> > > > >  	vpmovmskb	%ymm2, %eax
> > > > >  	testl	%eax, %eax
> > > > >  	jnz	L(first_vec_x1)
> > > > > -	vpmovmskb	%ymm3, %eax
> > > > > -	testl	%eax, %eax
> > > > > -	jnz	L(first_vec_x2)
> > > > > +
> > > > > +	VPCMPEQ	%ymm3, %ymm9, %ymm3
> > > > > +	VPCMPEQ	%ymm4, %ymm9, %ymm4
> > > > > +	vpmovmskb	%ymm3, %ecx
> > > > >  	vpmovmskb	%ymm4, %eax
> > > > > +	salq	$32, %rax
> > > > > +	orq	%rcx, %rax
> > > > > +	tzcntq	%rax, %rax
> > > > > +	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +	cmp	(%rax), %CHAR_REG
> > > > > +	cmovne	%rdx, %rax
> > > > > +# endif
> > > > > +	VZEROUPPER
> > > > > +	ret
> > > > > +
> > > > > +	/* Cold case for crossing page with first load.  */
> > > > > +	.p2align 4
> > > > > +L(cross_page_boundary):
> > > > > +	andq	$-VEC_SIZE, %rdi
> > > > > +	andl	$(VEC_SIZE - 1), %ecx
> > > > > +
> > > > > +	vmovdqa	(%rdi), %ymm8
> > > > > +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> > > > > +	VPCMPEQ	%ymm8, %ymm9, %ymm2
> > > > > +	vpor	%ymm1, %ymm2, %ymm1
> > > > > +	vpmovmskb	%ymm1, %eax
> > > > > +	/* Remove the leading bits.  */
> > > > > +	sarxl	%ecx, %eax, %eax
> > > > >  	testl	%eax, %eax
> > > > > -L(first_vec_x3):
> > > > > +	jz	L(aligned_more)
> > > > >  	tzcntl	%eax, %eax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > -	addq	$(VEC_SIZE * 3), %rax
> > > > > +	addq	%rcx, %rdi
> > > > >  	addq	%rdi, %rax
> > > > > -# else
> > > > > -	xorl	%edx, %edx
> > > > > -	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
> > > > > -	cmp	(%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +	cmp	(%rax), %CHAR_REG
> > > > >  	cmovne	%rdx, %rax
> > > > >  # endif
> > > > >  	VZEROUPPER
> > > > >  	ret
> > > > >
> > > > >  END (STRCHR)
> > > > > -#endif
> > > > > +# endif
> > > > > diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> > > > > index 583a152794..4dfbe3b58b 100644
> > > > > --- a/sysdeps/x86_64/multiarch/strchr.c
> > > > > +++ b/sysdeps/x86_64/multiarch/strchr.c
> > > > > @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
> > > > >
> > > > >    if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> > > > >        && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > > > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > > > >        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > > >      return OPTIMIZE (avx2);
> > > > >
> > > > > --
> > > > > 2.29.2
> > > > >
> > > >
> > > > LGTM.
> > > >
> > > > Thanks.
> > > >
> > >
> > > This is the updated patch, with the extra white spaces fixed, that I am
> > > checking in.
> > >
> > > --
> > > H.J.
> >
> > Awesome! Thanks!
> >
> > N.G.
>
> Shoot, just realized this one has the old commit message that only
> references test-strchr and test-strchrnul as passing (missing the
> reference to test-wcschr and test-wcschrnul).
>
> Do you want me to send another patch with the proper commit message, or
> can you fix it on your end, or does it not really matter?
>
> N.G.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
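
P.S. For anyone evaluating the backport, the heart of the change is the new
4x main loop. The old loop needed two VPCMPEQ plus a vpor per vector to test
for CHAR and the null terminator separately; the new loop XORs each vector
with the broadcast CHAR and takes an unsigned byte minimum against the raw
data, so a single compare against zero catches both. A rough sketch of the
trick in AVX2 intrinsics, for the byte (strchr) variant; match_mask is an
invented name for illustration, not glibc code:

    #include <immintrin.h>
    #include <stdint.h>

    /* Bitmask with bit I set iff byte I of DATA equals the character
       broadcast in C_BC or is a null byte.  */
    static inline uint32_t
    match_mask (__m256i data, __m256i c_bc)
    {
      /* data ^ c_bc is zero exactly where data == c_bc.  */
      __m256i x = _mm256_xor_si256 (data, c_bc);
      /* min_epu8 (x, data) is zero exactly where data == c_bc or
         data == 0, since the minimum of two nonzero bytes is nonzero.  */
      __m256i m = _mm256_min_epu8 (x, data);
      /* Compare against zero and collapse to a 32-bit mask.  */
      __m256i z = _mm256_cmpeq_epi8 (m, _mm256_setzero_si256 ());
      return (uint32_t) _mm256_movemask_epi8 (z);
    }

In the assembly, only the combined minimum of all four vectors is compared
and tested each iteration; the per-vector VPCMPEQs run once after the loop
exits (the wcschr variant uses dword minimums via vpminud). Note also that
the rewritten page-cross path replaces sar %cl with sarx, a BMI2 instruction,
which is why the ifunc selector in strchr.c gains the BMI2 check; that is
worth keeping in mind when choosing release branches to target.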