References: <20221014164008.1325863-1-goldstein.w.n@gmail.com> <20221015030030.204172-1-goldstein.w.n@gmail.com> <20221015030030.204172-6-goldstein.w.n@gmail.com>
In-Reply-To: <20221015030030.204172-6-goldstein.w.n@gmail.com>
From: Sunil Pandey
Date: Fri, 14 Oct 2022 20:48:36 -0700
Subject: Re: [PATCH v10 6/6] x86: Update strlen-evex-base to use new reg/vec macros.
To: Noah Goldstein
Cc: libc-alpha@sourceware.org

On Fri, Oct 14, 2022 at 8:01 PM Noah Goldstein via Libc-alpha wrote:
>
> To avoid duplicate the VMM / GPR / mask insn macros in all incoming
> evex512 files use the macros defined in 'reg-macros.h' and
> '{vec}-macros.h'
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/strlen-evex-base.S | 116 +++++++-------------
>  sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
>  2 files changed, 44 insertions(+), 76 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> index 418e9f8411..c832b15a48 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> @@ -36,42 +36,10 @@
>  # define CHAR_SIZE 1
>  # endif
>
> -# define XMM0 xmm16
>  # define PAGE_SIZE 4096
>  # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
>
> -# if VEC_SIZE == 64
> -# define KMOV kmovq
> -# define KORTEST kortestq
> -# define RAX rax
> -# define RCX rcx
> -# define RDX rdx
> -# define SHR shrq
> -# define TEXTSUFFIX evex512
> -# define VMM0 zmm16
> -# define VMM1 zmm17
> -# define VMM2 zmm18
> -# define VMM3 zmm19
> -# define VMM4 zmm20
> -# define VMOVA vmovdqa64
> -# elif VEC_SIZE == 32
> -/* Currently Unused. */
> -# define KMOV kmovd
> -# define KORTEST kortestd
> -# define RAX eax
> -# define RCX ecx
> -# define RDX edx
> -# define SHR shrl
> -# define TEXTSUFFIX evex256
> -# define VMM0 ymm16
> -# define VMM1 ymm17
> -# define VMM2 ymm18
> -# define VMM3 ymm19
> -# define VMM4 ymm20
> -# define VMOVA vmovdqa32
> -# endif
> -
> - .section .text.TEXTSUFFIX, "ax", @progbits
> + .section SECTION(.text),"ax",@progbits
>  /* Aligning entry point to 64 byte, provides better performance for
>  one vector length string. */
>  ENTRY_P2ALIGN (STRLEN, 6)
> @@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
>  # endif
>
>  movl %edi, %eax
> - vpxorq %XMM0, %XMM0, %XMM0
> + vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
>  andl $(PAGE_SIZE - 1), %eax
>  cmpl $(PAGE_SIZE - VEC_SIZE), %eax
>  ja L(page_cross)
>
>  /* Compare [w]char for null, mask bit will be set for match. */
> - VPCMP $0, (%rdi), %VMM0, %k0
> - KMOV %k0, %RAX
> - test %RAX, %RAX
> + VPCMP $0, (%rdi), %VMM(0), %k0
> + KMOV %k0, %VRAX
> + test %VRAX, %VRAX
>  jz L(align_more)
>
> - bsf %RAX, %RAX
> + bsf %VRAX, %VRAX
>  # ifdef USE_AS_STRNLEN
>  cmpq %rsi, %rax
>  cmovnb %rsi, %rax
> @@ -120,7 +88,7 @@ L(align_more):
>  movq %rax, %rdx
>  subq %rdi, %rdx
>  # ifdef USE_AS_WCSLEN
> - SHR $2, %RDX
> + shr $2, %VRDX
>  # endif
>  /* At this point rdx contains [w]chars already compared. */
>  subq %rsi, %rdx
> @@ -131,9 +99,9 @@ L(align_more):
>  # endif
>
>  /* Loop unroll 4 times for 4 vector loop. */
> - VPCMP $0, (%rax), %VMM0, %k0
> - KMOV %k0, %RCX
> - test %RCX, %RCX
> + VPCMP $0, (%rax), %VMM(0), %k0
> + KMOV %k0, %VRCX
> + test %VRCX, %VRCX
>  jnz L(ret_vec_x1)
>
>  # ifdef USE_AS_STRNLEN
> @@ -141,9 +109,9 @@ L(align_more):
>  jbe L(ret_max)
>  # endif
>
> - VPCMP $0, VEC_SIZE(%rax), %VMM0, %k0
> - KMOV %k0, %RCX
> - test %RCX, %RCX
> + VPCMP $0, VEC_SIZE(%rax), %VMM(0), %k0
> + KMOV %k0, %VRCX
> + test %VRCX, %VRCX
>  jnz L(ret_vec_x2)
>
>  # ifdef USE_AS_STRNLEN
> @@ -151,9 +119,9 @@ L(align_more):
>  jbe L(ret_max)
>  # endif
>
> - VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
> - KMOV %k0, %RCX
> - test %RCX, %RCX
> + VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> + KMOV %k0, %VRCX
> + test %VRCX, %VRCX
>  jnz L(ret_vec_x3)
>
>  # ifdef USE_AS_STRNLEN
> @@ -161,9 +129,9 @@ L(align_more):
>  jbe L(ret_max)
>  # endif
>
> - VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
> - KMOV %k0, %RCX
> - test %RCX, %RCX
> + VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> + KMOV %k0, %VRCX
> + test %VRCX, %VRCX
>  jnz L(ret_vec_x4)
>
>  # ifdef USE_AS_STRNLEN
> @@ -179,7 +147,7 @@ L(align_more):
>  # ifdef USE_AS_STRNLEN
>  subq %rax, %rcx
>  # ifdef USE_AS_WCSLEN
> - SHR $2, %RCX
> + shr $2, %VRCX
>  # endif
>  /* rcx contains number of [w]char will be recompared due to
>  alignment fixes. rdx must be incremented by rcx to offset
> @@ -199,42 +167,42 @@ L(loop_entry):
>  # endif
>  /* VPMINU and VPCMP combination provide better performance as
>  compared to alternative combinations. */
> - VMOVA (VEC_SIZE * 4)(%rax), %VMM1
> - VPMINU (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
> - VMOVA (VEC_SIZE * 6)(%rax), %VMM3
> - VPMINU (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
> + VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
> + VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
> + VMOVA (VEC_SIZE * 6)(%rax), %VMM(3)
> + VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
>
> - VPTESTN %VMM2, %VMM2, %k0
> - VPTESTN %VMM4, %VMM4, %k1
> + VPTESTN %VMM(2), %VMM(2), %k0
> + VPTESTN %VMM(4), %VMM(4), %k1
>
>  subq $-(VEC_SIZE * 4), %rax
>  KORTEST %k0, %k1
>  jz L(loop)
>
> - VPTESTN %VMM1, %VMM1, %k2
> - KMOV %k2, %RCX
> - test %RCX, %RCX
> + VPTESTN %VMM(1), %VMM(1), %k2
> + KMOV %k2, %VRCX
> + test %VRCX, %VRCX
>  jnz L(ret_vec_x1)
>
> - KMOV %k0, %RCX
> + KMOV %k0, %VRCX
>  /* At this point, if k0 is non zero, null char must be in the
>  second vector. */
> - test %RCX, %RCX
> + test %VRCX, %VRCX
>  jnz L(ret_vec_x2)
>
> - VPTESTN %VMM3, %VMM3, %k3
> - KMOV %k3, %RCX
> - test %RCX, %RCX
> + VPTESTN %VMM(3), %VMM(3), %k3
> + KMOV %k3, %VRCX
> + test %VRCX, %VRCX
>  jnz L(ret_vec_x3)
>  /* At this point null [w]char must be in the fourth vector so no
>  need to check. */
> - KMOV %k1, %RCX
> + KMOV %k1, %VRCX
>
>  /* Fourth, third, second vector terminating are pretty much
>  same, implemented this way to avoid branching and reuse code
>  from pre loop exit condition. */
>  L(ret_vec_x4):
> - bsf %RCX, %RCX
> + bsf %VRCX, %VRCX
>  subq %rdi, %rax
>  # ifdef USE_AS_WCSLEN
>  subq $-(VEC_SIZE * 3), %rax
> @@ -250,7 +218,7 @@ L(ret_vec_x4):
>  ret
>
>  L(ret_vec_x3):
> - bsf %RCX, %RCX
> + bsf %VRCX, %VRCX
>  subq %rdi, %rax
>  # ifdef USE_AS_WCSLEN
>  subq $-(VEC_SIZE * 2), %rax
> @@ -268,7 +236,7 @@ L(ret_vec_x3):
>  L(ret_vec_x2):
>  subq $-VEC_SIZE, %rax
>  L(ret_vec_x1):
> - bsf %RCX, %RCX
> + bsf %VRCX, %VRCX
>  subq %rdi, %rax
>  # ifdef USE_AS_WCSLEN
>  shrq $2, %rax
> @@ -289,13 +257,13 @@ L(page_cross):
>  /* ecx contains number of w[char] to be skipped as a result
>  of address alignment. */
>  xorq %rdi, %rax
> - VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
> - KMOV %k0, %RAX
> + VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
> + KMOV %k0, %VRAX
>  /* Ignore number of character for alignment adjustment. */
> - SHR %cl, %RAX
> + shr %cl, %VRAX
>  jz L(align_more)
>
> - bsf %RAX, %RAX
> + bsf %VRAX, %VRAX
>  # ifdef USE_AS_STRNLEN
>  cmpq %rsi, %rax
>  cmovnb %rsi, %rax
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> index 116f8981c8..10c3415c8a 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex512.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> @@ -2,6 +2,6 @@
>  # define STRLEN __strlen_evex512
>  #endif
>
> -#define VEC_SIZE 64
> -
> +#include "x86-evex512-vecs.h"
> +#include "reg-macros.h"
>  #include "strlen-evex-base.S"
> --
> 2.34.1
>

LGTM --Sunil
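
As context for the conversion quoted above: the idea is that one parameterised macro layer, selected by VEC_SIZE, replaces the per-file register tables the patch deletes. The sketch below is illustrative only; the expansions are assumptions and not the actual contents of glibc's reg-macros.h or x86-evex512-vecs.h (which, for example, map VMM(0) onto the EVEX-only zmm16 bank rather than zmm0).

    #if VEC_SIZE == 64
    # define VMM(i)     zmm##i      /* assumption: real header starts at zmm16 */
    # define VRAX       rax         /* 64-bit mask word -> full-width GPR */
    # define VRCX       rcx
    # define KMOV       kmovq
    # define KORTEST    kortestq
    #elif VEC_SIZE == 32
    # define VMM(i)     ymm##i
    # define VRAX       eax         /* 32-bit mask word -> 32-bit GPR */
    # define VRCX       ecx
    # define KMOV       kmovd
    # define KORTEST    kortestd
    #endif

            /* One source line now assembles to either "kmovq %k0, %rax"
               (VEC_SIZE == 64) or "kmovd %k0, %eax" (VEC_SIZE == 32).  */
            KMOV    %k0, %VRAX

Centralising that table is what lets strlen-evex-base.S drop its own "# if VEC_SIZE == 64 / == 32" block while still assembling for either vector width.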