From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-ej1-x62b.google.com (mail-ej1-x62b.google.com [IPv6:2a00:1450:4864:20::62b]) by sourceware.org (Postfix) with ESMTPS id D1A5A3858D35 for ; Sun, 30 Oct 2022 19:33:43 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org D1A5A3858D35 Authentication-Results: sourceware.org; dmarc=pass (p=none dis=none) header.from=gmail.com Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=gmail.com Received: by mail-ej1-x62b.google.com with SMTP id t25so24795060ejb.8 for ; Sun, 30 Oct 2022 12:33:43 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=cc:to:subject:message-id:date:from:in-reply-to:references :mime-version:from:to:cc:subject:date:message-id:reply-to; bh=BupwWB8N1AqoYuCSoy4c8k6Xa0b+hCMjxJ0J6WeP8DI=; b=YFDqo9EdN3jI3ZjxLXa+axyVQrTFw1GMQD1rpyqwzztB5XaBPHpKOlAMmBy0ptcYOX kbx78X1DEJxLkSPIF5//5q+j7nKghHYhse7guVl4r9RMguqM3XSPtW5RiQtE5Jbk//R3 6fStmwMEUkNWG8eyB6gHzM5Q1tBC7VR+6e6I6CdIJeY8jgHRMMO1iaZPsaN9pfO4HtAO ubG2UupOWnutmdYfSvxDGkC7wmJofUxySa4tL9RQd37JpCfoLhqLX271FyHZ3BkAkkmE c3S0M7yrjchKYxOYiCkeq+j0ltEDEj40vTU6U6KO4ym9Hl41zKcDip0Q4YoVJZbxIW7T ECAw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=cc:to:subject:message-id:date:from:in-reply-to:references :mime-version:x-gm-message-state:from:to:cc:subject:date:message-id :reply-to; bh=BupwWB8N1AqoYuCSoy4c8k6Xa0b+hCMjxJ0J6WeP8DI=; b=w0emHxqqjghmtG4BH+8dk+Cwhbmhy+ptwRFMb9aFYSXydYm3RjhZgmozLveNIpRdKe fv9rxlX1q4TP8Tq90dGSNbItkduupFjhdGsKVixZZgYp4owxJPTsrwemi6rjvUBXtKlj 8KR7/reSwS2WO/HJ76moWB9YO2vyXQgV48rXxUwOTm+kzSk5utpIhDf6mWl/LIhnA9QC xLBUU3KlilnyXVR40G07z23aZw9o2XINOZoxrF3NX5oKmNYPJKAcBMpbjW6m1J5r+eOX /DeHiqILYeBaXVo6WbJgF2N2WkFHqQ2zLyqifK6YS7nJKTQ80B5ZC5JeUsWaNqjEQ5ae oyEw== X-Gm-Message-State: ACrzQf1FLvBSpoFCO+qLozEk1q2Aei0OzyQO8LtC1xf4HG8R41K7QVmS 2CRq09mQTppPLrS97o/a8NOKbsJiOU2wdudTnaZI4aMzZf0= X-Google-Smtp-Source: 
AMsMyM6QBlzGR9cxwVoJ9GR6lpWc2QSHCWoxVncSjymKRBBhpTTp9aJLPKoe6Ym9AoWxMrxJ5pvGUkAyc5verg148EU= X-Received: by 2002:a17:906:fe46:b0:73d:939a:ec99 with SMTP id wz6-20020a170906fe4600b0073d939aec99mr9525714ejb.169.1667158422301; Sun, 30 Oct 2022 12:33:42 -0700 (PDT) MIME-Version: 1.0 References: <20221028154810.1801123-1-skpgkp2@gmail.com> In-Reply-To: From: Noah Goldstein Date: Sun, 30 Oct 2022 14:33:30 -0500 Message-ID: Subject: Re: [PATCH] x86-64: Improve evex512 version of strlen functions To: Sunil Pandey Cc: libc-alpha@sourceware.org Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-9.8 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,FREEMAIL_FROM,GIT_PATCH_0,RCVD_IN_DNSWL_NONE,SPF_HELO_NONE,SPF_PASS,TXREP autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: On Sun, Oct 30, 2022 at 2:03 PM Sunil Pandey wrote: > > On Fri, Oct 28, 2022 at 10:24 AM Noah Goldstein wrote: > > > > On Fri, Oct 28, 2022 at 12:12 PM Sunil Pandey wrote: > > > > > > On Fri, Oct 28, 2022 at 9:34 AM Noah Goldstein wrote: > > > > > > > > On Fri, Oct 28, 2022 at 10:49 AM Sunil K Pandey via Libc-alpha > > > > wrote: > > > > > > > > > > This patch improves the following functionality > > > > > - Replace VPCMP with VPCMPEQ. > > > > > - Replace page cross check logic with sall. > > > > > - Remove extra lea from align_more. > > > > > - Remove unconditional loop jump. > > > > > - Use bsf to check max length in first vector. 
> > > > > --- > > > > > sysdeps/x86_64/multiarch/strlen-evex-base.S | 91 +++++++++++++-------- > > > > > 1 file changed, 57 insertions(+), 34 deletions(-) > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S > > > > > index c832b15a48..fd6c770e6e 100644 > > > > > --- a/sysdeps/x86_64/multiarch/strlen-evex-base.S > > > > > +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S > > > > > @@ -25,12 +25,12 @@ > > > > > # include > > > > > > > > > > # ifdef USE_AS_WCSLEN > > > > > -# define VPCMP vpcmpd > > > > > +# define VPCMPEQ vpcmpeqd > > > > > # define VPTESTN vptestnmd > > > > > # define VPMINU vpminud > > > > > # define CHAR_SIZE 4 > > > > > # else > > > > > -# define VPCMP vpcmpb > > > > > +# define VPCMPEQ vpcmpeqb > > > > > # define VPTESTN vptestnmb > > > > > # define VPMINU vpminub > > > > > # define CHAR_SIZE 1 > > > > > @@ -55,20 +55,29 @@ ENTRY_P2ALIGN (STRLEN, 6) > > > > > > > > > > movl %edi, %eax > > > > > vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0) > > > > > - andl $(PAGE_SIZE - 1), %eax > > > > > - cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > > + sall $20, %eax > > > > > + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax > > > > > ja L(page_cross) > > > > > > > > > > /* Compare [w]char for null, mask bit will be set for match. */ > > > > > - VPCMP $0, (%rdi), %VMM(0), %k0 > > > > > + VPCMPEQ (%rdi), %VMM(0), %k0 > > > > > +# ifdef USE_AS_STRNLEN > > > > > + KMOV %k0, %VRCX > > > > > + /* Store max length in rax. */ > > > > > + mov %rsi, %rax > > > > > + /* If rcx is 0, rax will have max length. We can not use VRCX > > > > > + and VRAX here for evex256 because, upper 32 bits may be > > > > > + undefined for ecx and eax. 
*/ > > > > > + bsfq %rcx, %rax > > > > > + cmp $CHAR_PER_VEC, %rax > > > > > + ja L(align_more) > > > > > + cmpq %rax, %rsi > > > > > + cmovb %esi, %eax > > > > > +# else > > > > > KMOV %k0, %VRAX > > > > > test %VRAX, %VRAX > > > > > jz L(align_more) > > > > > - > > > > > bsf %VRAX, %VRAX > > > > > -# ifdef USE_AS_STRNLEN > > > > > - cmpq %rsi, %rax > > > > > - cmovnb %rsi, %rax > > > > > # endif > > > > > ret > > > > > > > > > > @@ -81,25 +90,24 @@ L(ret_max): > > > > > # endif > > > > > > > > > > L(align_more): > > > > > - leaq VEC_SIZE(%rdi), %rax > > > > > + mov %rdi, %rax > > > > > /* Align rax to VEC_SIZE. */ > > > > > andq $-VEC_SIZE, %rax > > > > > # ifdef USE_AS_STRNLEN > > > > > - movq %rax, %rdx > > > > > - subq %rdi, %rdx > > > > > + movq %rdi, %rdx > > > > > + subq %rax, %rdx > > > > > # ifdef USE_AS_WCSLEN > > > > > shr $2, %VRDX > > > > > # endif > > > > > /* At this point rdx contains [w]chars already compared. */ > > > > > - subq %rsi, %rdx > > > > > - jae L(ret_max) > > > > > - negq %rdx > > > > > + leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx > > > > > /* At this point rdx contains number of w[char] needs to go. > > > > > Now onwards rdx will keep decrementing with each compare. */ > > > > > # endif > > > > > > > > > > /* Loop unroll 4 times for 4 vector loop. 
*/ > > > > > - VPCMP $0, (%rax), %VMM(0), %k0 > > > > > + VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 > > > > > + subq $-VEC_SIZE, %rax > > > > > KMOV %k0, %VRCX > > > > > test %VRCX, %VRCX > > > > > jnz L(ret_vec_x1) > > > > > @@ -109,7 +117,7 @@ L(align_more): > > > > > jbe L(ret_max) > > > > > # endif > > > > > > > > > > - VPCMP $0, VEC_SIZE(%rax), %VMM(0), %k0 > > > > > + VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 > > > > > KMOV %k0, %VRCX > > > > > test %VRCX, %VRCX > > > > > jnz L(ret_vec_x2) > > > > > @@ -119,7 +127,7 @@ L(align_more): > > > > > jbe L(ret_max) > > > > > # endif > > > > > > > > > > - VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0 > > > > > + VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0 > > > > > KMOV %k0, %VRCX > > > > > test %VRCX, %VRCX > > > > > jnz L(ret_vec_x3) > > > > > @@ -129,7 +137,7 @@ L(align_more): > > > > > jbe L(ret_max) > > > > > # endif > > > > > > > > > > - VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0 > > > > > + VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0 > > > > > KMOV %k0, %VRCX > > > > > test %VRCX, %VRCX > > > > > jnz L(ret_vec_x4) > > > > > @@ -155,16 +163,10 @@ L(align_more): > > > > > addq %rcx, %rdx > > > > > /* Need jump as we don't want to add/subtract rdx for first > > > > > iteration of 4 x VEC_SIZE aligned loop. */ > > > > > - jmp L(loop_entry) > > > > > # endif > > > > > > > > > > .p2align 4,,11 > > > > > L(loop): > > > > > -# ifdef USE_AS_STRNLEN > > > > > - subq $(CHAR_PER_VEC * 4), %rdx > > > > > - jbe L(ret_max) > > > > > -L(loop_entry): > > > > > -# endif > > > > > /* VPMINU and VPCMP combination provide better performance as > > > > > compared to alternative combinations. 
*/ > > > > > VMOVA (VEC_SIZE * 4)(%rax), %VMM(1) > > > > > @@ -177,7 +179,18 @@ L(loop_entry): > > > > > > > > > > subq $-(VEC_SIZE * 4), %rax > > > > > KORTEST %k0, %k1 > > > > > - jz L(loop) > > > > > + > > > > > +# ifndef USE_AS_STRNLEN > > > > > + jz L(loop) > > > > > +# else > > > > > + jnz L(loopend) > > > > > + subq $(CHAR_PER_VEC * 4), %rdx > > > > > + ja L(loop) > > > > > + mov %rsi, %rax > > > > > + ret > > > > > +# endif > > > > > + > > > > > +L(loopend): > > > > > > > > > > VPTESTN %VMM(1), %VMM(1), %k2 > > > > > KMOV %k2, %VRCX > > > > > @@ -249,24 +262,34 @@ L(ret_vec_x1): > > > > > ret > > > > > > > > > > L(page_cross): > > > > > - movl %eax, %ecx > > > > > -# ifdef USE_AS_WCSLEN > > > > > + mov %rdi, %rax > > > > > + movl %edi, %ecx > > > > > andl $(VEC_SIZE - 1), %ecx > > > > > +# ifdef USE_AS_WCSLEN > > > > > sarl $2, %ecx > > > > > # endif > > > > > /* ecx contains number of w[char] to be skipped as a result > > > > > of address alignment. */ > > > > > - xorq %rdi, %rax > > > > > - VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0 > > > > > - KMOV %k0, %VRAX > > > > > + andq $-VEC_SIZE, %rax > > > > > + VPCMPEQ (%rax), %VMM(0), %k0 > > > > > + KMOV %k0, %VRDX > > > > > /* Ignore number of character for alignment adjustment. */ > > > > > - shr %cl, %VRAX > > > > > + shr %cl, %VRDX > > > > > +# ifdef USE_AS_STRNLEN > > > > > + jnz L(page_cross_end) > > > > > + movl $CHAR_PER_VEC, %eax > > > > > + sub %ecx, %eax > > > > > + cmp %rax, %rsi > > > > > + ja L(align_more) > > > > > +# else > > > > > jz L(align_more) > > > > > +# endif > > > > > > > > > > - bsf %VRAX, %VRAX > > > > > +L(page_cross_end): > > > > > + bsf %VRDX, %VRAX > > > > > # ifdef USE_AS_STRNLEN > > > > > cmpq %rsi, %rax > > > > > - cmovnb %rsi, %rax > > > > > + cmovnb %esi, %eax > > > > > # endif > > > > > ret > > > > > > > > > > -- > > > > > 2.36.1 > > > > > > > > > > > > > Can you post some performance numbers? > > > > > > SKX perf data attached. 
> > > > Can you add numbers comparing this version to the previous version? > > Here you go. This change is okay by me.