From: "H.J. Lu"
Date: Sun, 26 Dec 2021 09:05:04 -0800
Subject: Re: [PATCH v1 2/2] x86: Optimize L(less_vec) case in memcmpeq-evex.S
To: Noah Goldstein
Cc: GNU C Library, "Carlos O'Donell"
In-Reply-To: <20211225032257.2887327-2-goldstein.w.n@gmail.com>
References: <20211225032257.2887327-1-goldstein.w.n@gmail.com>
 <20211225032257.2887327-2-goldstein.w.n@gmail.com>

On Fri, Dec 24, 2021 at 7:23 PM Noah Goldstein wrote:
>
> No bug.
> Optimizations are twofold.
>
> 1) Replace the page-cross and 0/1-size checks with masked load
>    instructions in L(less_vec). In applications this reduces branch
>    misses in the hot [0, 32] case.
> 2) Change control flow so that the L(less_vec) case gets the fall
>    through.
>
> Change 2) helps copies in the [0, 32] size range but comes at the cost
> of copies in the [33, 64] size range. From profiles of GCC and
> Python3, 94%+ and 99%+ of calls are in the [0, 32] range, so this
> appears to be the right tradeoff.
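For anyone reading along who is less familiar with the EVEX masking idiom, here is a rough C-intrinsics sketch of the same idea for the [0, 32]-byte case. It is illustrative only, not the code in this patch: the function name is made up, it zero-masks both loads instead of masking the compare against a memory operand as the assembly does, and it assumes a compiler with AVX512VL/AVX512BW and BMI2 enabled (e.g. -mavx512vl -mavx512bw -mbmi2).

#include <immintrin.h>
#include <stddef.h>

/* Rough sketch (not the patch itself): compare the first len bytes of
   s1 and s2 for len in [0, 32].  Returns 0 iff they match.  */
static unsigned int
memcmpeq_le32_sketch (const void *s1, const void *s2, size_t len)
{
  /* Mask with the low len bits set; bzhi leaves all 32 bits set when
     len == 32, so the whole vector is valid in that case.  */
  __mmask32 k = (__mmask32) _bzhi_u32 (0xffffffffU, (unsigned int) len);

  /* Zero-masking loads only access the bytes selected by k, so a page
     boundary just past len bytes cannot fault and len == 0 needs no
     special case.  */
  __m256i v1 = _mm256_maskz_loadu_epi8 (k, s1);
  __m256i v2 = _mm256_maskz_loadu_epi8 (k, s2);

  /* Any set bit in the result marks a mismatching byte position.  */
  return (unsigned int) _mm256_cmpneq_epi8_mask (v1, v2);
}

Because the load and the compare only touch the bytes selected by the mask, both the zero-length check and the page-cross check of the old L(less_vec) path go away, which is where the branch-miss savings described above come from.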
> ---
>  sysdeps/x86_64/multiarch/memcmpeq-evex.S | 170 ++++++-----------------
>  1 file changed, 43 insertions(+), 127 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> index f27e732036..b5e1edbdff 100644
> --- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> +++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> @@ -39,6 +39,7 @@
>  # define MEMCMPEQ __memcmpeq_evex
>  # endif
>
> +# define VMOVU_MASK vmovdqu8
>  # define VMOVU vmovdqu64
>  # define VPCMP vpcmpub
>  # define VPTEST vptestmb
> @@ -62,12 +63,39 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
>         movl %edx, %edx
>  # endif
>         cmp $VEC_SIZE, %RDX_LP
> -       jb L(less_vec)
> +       /* Fall through for [0, VEC_SIZE] as its the hottest.  */
> +       ja L(more_1x_vec)
> +
> +       /* Create mask of bytes that are guranteed to be valid because
> +          of length (edx). Using masked movs allows us to skip checks for
> +          page crosses/zero size.  */
> +       movl $-1, %ecx
> +       bzhil %edx, %ecx, %ecx
> +       kmovd %ecx, %k2
> +
> +       /* Use masked loads as VEC_SIZE could page cross where length
> +          (edx) would not.  */
> +       VMOVU_MASK (%rsi), %YMM2{%k2}
> +       VPCMP $4,(%rdi), %YMM2, %k1{%k2}
> +       kmovd %k1, %eax
> +       ret
>
> -       /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +
> +L(last_1x_vec):
> +       VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
> +       VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
> +       kmovd %k1, %eax
> +L(return_neq0):
> +       ret
> +
> +
> +
> +       .p2align 4
> +L(more_1x_vec):
> +       /* From VEC + 1 to 2 * VEC.  */
>         VMOVU (%rsi), %YMM1
>         /* Use compare not equals to directly check for mismatch.  */
> -       VPCMP $4, (%rdi), %YMM1, %k1
> +       VPCMP $4,(%rdi), %YMM1, %k1
>         kmovd %k1, %eax
>         testl %eax, %eax
>         jnz L(return_neq0)
> @@ -88,13 +116,13 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
>
>         /* Check third and fourth VEC no matter what.  */
>         VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
> -       VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
> +       VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
>         kmovd %k1, %eax
>         testl %eax, %eax
>         jnz L(return_neq0)
>
>         VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
> -       VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
> +       VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
>         kmovd %k1, %eax
>         testl %eax, %eax
>         jnz L(return_neq0)
> @@ -132,66 +160,6 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
>         /* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
>         VPTEST %YMM4, %YMM4, %k1
>         kmovd %k1, %eax
> -L(return_neq0):
> -       ret
> -
> -       /* Fits in padding needed to .p2align 5 L(less_vec).  */
> -L(last_1x_vec):
> -       VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
> -       VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
> -       kmovd %k1, %eax
> -       ret
> -
> -       /* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32
> -          byte aligned.  */
> -       .p2align 5
> -L(less_vec):
> -       /* Check if one or less char. This is necessary for size = 0 but
> -          is also faster for size = 1.  */
> -       cmpl $1, %edx
> -       jbe L(one_or_less)
> -
> -       /* Check if loading one VEC from either s1 or s2 could cause a
> -          page cross. This can have false positives but is by far the
> -          fastest method.  */
> -       movl %edi, %eax
> -       orl %esi, %eax
> -       andl $(PAGE_SIZE - 1), %eax
> -       cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> -       jg L(page_cross_less_vec)
> -
> -       /* No page cross possible.  */
> -       VMOVU (%rsi), %YMM2
> -       VPCMP $4, (%rdi), %YMM2, %k1
> -       kmovd %k1, %eax
> -       /* Result will be zero if s1 and s2 match. Otherwise first set
> -          bit will be first mismatch.  */
> -       bzhil %edx, %eax, %eax
> -       ret
> -
> -       /* Relatively cold but placing close to L(less_vec) for 2 byte
> -          jump encoding.  */
> -       .p2align 4
> -L(one_or_less):
> -       jb L(zero)
> -       movzbl (%rsi), %ecx
> -       movzbl (%rdi), %eax
> -       subl %ecx, %eax
> -       /* No ymm register was touched.  */
> -       ret
> -       /* Within the same 16 byte block is L(one_or_less).  */
> -L(zero):
> -       xorl %eax, %eax
> -       ret
> -
> -       .p2align 4
> -L(last_2x_vec):
> -       VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
> -       vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
> -       VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
> -       vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
> -       VPTEST %YMM2, %YMM2, %k1
> -       kmovd %k1, %eax
>         ret
>
>         .p2align 4
> @@ -211,7 +179,7 @@ L(loop_4x_vec):
>         vpxorq (%rdi), %YMM1, %YMM1
>
>         VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
> -       vpternlogd $0xde, (VEC_SIZE)(%rdi), %YMM1, %YMM2
> +       vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2
>
>         VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
>         vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> @@ -238,7 +206,7 @@ L(loop_4x_vec):
>         VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
>         /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
>            oring with YMM4. Result is stored in YMM4.  */
> -       vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
> +       vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
>         cmpl $(VEC_SIZE * 2), %edi
>         jae L(8x_last_2x_vec)
>
> @@ -256,68 +224,16 @@ L(8x_last_2x_vec):
>  L(return_neq2):
>         ret
>
> -       /* Relatively cold case as page cross are unexpected.  */
> -       .p2align 4
> -L(page_cross_less_vec):
> -       cmpl $16, %edx
> -       jae L(between_16_31)
> -       cmpl $8, %edx
> -       ja L(between_9_15)
> -       cmpl $4, %edx
> -       jb L(between_2_3)
> -       /* From 4 to 8 bytes.  No branch when size == 4.  */
> -       movl (%rdi), %eax
> -       subl (%rsi), %eax
> -       movl -4(%rdi, %rdx), %ecx
> -       movl -4(%rsi, %rdx), %edi
> -       subl %edi, %ecx
> -       orl %ecx, %eax
> -       ret
> -
> -       .p2align 4,, 8
> -L(between_16_31):
> -       /* From 16 to 31 bytes.  No branch when size == 16.  */
> -
> -       /* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe.
> -        */
> -       vmovdqu (%rsi), %xmm1
> -       vpcmpeqb (%rdi), %xmm1, %xmm1
> -       vmovdqu -16(%rsi, %rdx), %xmm2
> -       vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2
> -       vpand %xmm1, %xmm2, %xmm2
> -       vpmovmskb %xmm2, %eax
> -       notw %ax
> -       /* No ymm register was touched.  */
> -       ret
> -
>         .p2align 4,, 8
> -L(between_9_15):
> -       /* From 9 to 15 bytes.  */
> -       movq (%rdi), %rax
> -       subq (%rsi), %rax
> -       movq -8(%rdi, %rdx), %rcx
> -       movq -8(%rsi, %rdx), %rdi
> -       subq %rdi, %rcx
> -       orq %rcx, %rax
> -       /* edx is guranteed to be a non-zero int.  */
> -       cmovnz %edx, %eax
> -       ret
> -
> -       /* Don't align. This is cold and aligning here will cause code
> -          to spill into next cache line.  */
> -L(between_2_3):
> -       /* From 2 to 3 bytes.  No branch when size == 2.  */
> -       movzwl (%rdi), %eax
> -       movzwl (%rsi), %ecx
> -       subl %ecx, %eax
> -       movzbl -1(%rdi, %rdx), %ecx
> -       /* All machines that support evex will insert a "merging uop"
> -          avoiding any serious partial register stalls.  */
> -       subb -1(%rsi, %rdx), %cl
> -       orl %ecx, %eax
> -       /* No ymm register was touched.  */
> +L(last_2x_vec):
> +       VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
> +       vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
> +       VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
> +       vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
> +       VPTEST %YMM2, %YMM2, %k1
> +       kmovd %k1, %eax
>         ret
>
> -       /* 4 Bytes from next cache line.  */
> +       /* 1 Bytes from next cache line.  */
>  END (MEMCMPEQ)
>  #endif
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu

Thanks.

--
H.J.