From: "H.J. Lu"
Date: Sun, 26 Dec 2021 09:05:04 -0800
Subject: Re: [PATCH v1 2/2] x86: Optimize L(less_vec) case in memcmpeq-evex.S
To: Noah Goldstein
Cc: GNU C Library, "Carlos O'Donell"
In-Reply-To: <20211225032257.2887327-2-goldstein.w.n@gmail.com>
References: <20211225032257.2887327-1-goldstein.w.n@gmail.com>
 <20211225032257.2887327-2-goldstein.w.n@gmail.com>

On Fri, Dec 24, 2021 at 7:23 PM Noah Goldstein wrote:
>
> No bug.
> Optimizations are twofold.
>
> 1) Replace the page-cross and 0/1-size checks with masked load
>    instructions in L(less_vec). In applications this reduces branch
>    misses in the hot [0, 32] case.
> 2) Change control flow so that the L(less_vec) case gets the fall
>    through.
>
> Change 2) helps copies in the [0, 32] size range but comes at the cost
> of copies in the [33, 64] size range. From profiles of GCC and
> Python3, 94%+ and 99%+ of calls are in the [0, 32] range, so this
> appears to be the right tradeoff.
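For anyone reading along who is less familiar with the EVEX masking idiom, here is a rough C-intrinsics sketch of the same idea for the [0, 32]-byte case. It is illustrative only, not the code in this patch: the function name is made up, it zero-masks both loads instead of masking the compare against a memory operand as the assembly does, and it assumes a compiler with AVX512VL/AVX512BW and BMI2 enabled (e.g. -mavx512vl -mavx512bw -mbmi2).

#include <immintrin.h>
#include <stddef.h>

/* Rough sketch (not the patch itself): compare the first len bytes of
   s1 and s2 for len in [0, 32].  Returns 0 iff they match.  */
static unsigned int
memcmpeq_le32_sketch (const void *s1, const void *s2, size_t len)
{
  /* Mask with the low len bits set; bzhi leaves all 32 bits set when
     len == 32, so the whole vector is valid in that case.  */
  __mmask32 k = (__mmask32) _bzhi_u32 (0xffffffffU, (unsigned int) len);

  /* Zero-masking loads only access the bytes selected by k, so a page
     boundary just past len bytes cannot fault and len == 0 needs no
     special case.  */
  __m256i v1 = _mm256_maskz_loadu_epi8 (k, s1);
  __m256i v2 = _mm256_maskz_loadu_epi8 (k, s2);

  /* Any set bit in the result marks a mismatching byte position.  */
  return (unsigned int) _mm256_cmpneq_epi8_mask (v1, v2);
}

Because the load and the compare only touch the bytes selected by the mask, both the zero-length check and the page-cross check of the old L(less_vec) path go away, which is where the branch-miss savings described above come from.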
> ---
>  sysdeps/x86_64/multiarch/memcmpeq-evex.S | 170 ++++++-----------------
>  1 file changed, 43 insertions(+), 127 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> index f27e732036..b5e1edbdff 100644
> --- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> +++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> @@ -39,6 +39,7 @@
>  # define MEMCMPEQ __memcmpeq_evex
>  # endif
>
> +# define VMOVU_MASK vmovdqu8
>  # define VMOVU vmovdqu64
>  # define VPCMP vpcmpub
>  # define VPTEST vptestmb
> @@ -62,12 +63,39 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
>         movl %edx, %edx
>  # endif
>         cmp $VEC_SIZE, %RDX_LP
> -       jb L(less_vec)
> +       /* Fall through for [0, VEC_SIZE] as its the hottest.  */
> +       ja L(more_1x_vec)
> +
> +       /* Create mask of bytes that are guranteed to be valid because
> +          of length (edx). Using masked movs allows us to skip checks for
> +          page crosses/zero size.  */
> +       movl $-1, %ecx
> +       bzhil %edx, %ecx, %ecx
> +       kmovd %ecx, %k2
> +
> +       /* Use masked loads as VEC_SIZE could page cross where length
> +          (edx) would not.  */
> +       VMOVU_MASK (%rsi), %YMM2{%k2}
> +       VPCMP $4,(%rdi), %YMM2, %k1{%k2}
> +       kmovd %k1, %eax
> +       ret
>
> -       /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +
> +L(last_1x_vec):
> +       VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
> +       VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
> +       kmovd %k1, %eax
> +L(return_neq0):
> +       ret
> +
> +
> +
> +       .p2align 4
> +L(more_1x_vec):
> +       /* From VEC + 1 to 2 * VEC.  */
>         VMOVU (%rsi), %YMM1
>         /* Use compare not equals to directly check for mismatch.  */
> -       VPCMP $4, (%rdi), %YMM1, %k1
> +       VPCMP $4,(%rdi), %YMM1, %k1
>         kmovd %k1, %eax
>         testl %eax, %eax
>         jnz L(return_neq0)
> @@ -88,13 +116,13 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
>
>         /* Check third and fourth VEC no matter what.  */
>         VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
> -       VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
> +       VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
>         kmovd %k1, %eax
>         testl %eax, %eax
>         jnz L(return_neq0)
>
>         VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
> -       VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
> +       VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
>         kmovd %k1, %eax
>         testl %eax, %eax
>         jnz L(return_neq0)
> @@ -132,66 +160,6 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
>         /* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
>         VPTEST %YMM4, %YMM4, %k1
>         kmovd %k1, %eax
> -L(return_neq0):
> -       ret
> -
> -       /* Fits in padding needed to .p2align 5 L(less_vec).  */
> -L(last_1x_vec):
> -       VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
> -       VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
> -       kmovd %k1, %eax
> -       ret
> -
> -       /* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32
> -          byte aligned.  */
> -       .p2align 5
> -L(less_vec):
> -       /* Check if one or less char. This is necessary for size = 0 but
> -          is also faster for size = 1.  */
> -       cmpl $1, %edx
> -       jbe L(one_or_less)
> -
> -       /* Check if loading one VEC from either s1 or s2 could cause a
> -          page cross. This can have false positives but is by far the
> -          fastest method.  */
> -       movl %edi, %eax
> -       orl %esi, %eax
> -       andl $(PAGE_SIZE - 1), %eax
> -       cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> -       jg L(page_cross_less_vec)
> -
> -       /* No page cross possible.  */
> -       VMOVU (%rsi), %YMM2
> -       VPCMP $4, (%rdi), %YMM2, %k1
> -       kmovd %k1, %eax
> -       /* Result will be zero if s1 and s2 match. Otherwise first set
> -          bit will be first mismatch.  */
> -       bzhil %edx, %eax, %eax
> -       ret
> -
> -       /* Relatively cold but placing close to L(less_vec) for 2 byte
> -          jump encoding.  */
> -       .p2align 4
> -L(one_or_less):
> -       jb L(zero)
> -       movzbl (%rsi), %ecx
> -       movzbl (%rdi), %eax
> -       subl %ecx, %eax
> -       /* No ymm register was touched.  */
> -       ret
> -       /* Within the same 16 byte block is L(one_or_less).  */
> -L(zero):
> -       xorl %eax, %eax
> -       ret
> -
> -       .p2align 4
> -L(last_2x_vec):
> -       VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
> -       vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
> -       VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
> -       vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
> -       VPTEST %YMM2, %YMM2, %k1
> -       kmovd %k1, %eax
>         ret
>
>         .p2align 4
> @@ -211,7 +179,7 @@ L(loop_4x_vec):
>         vpxorq (%rdi), %YMM1, %YMM1
>
>         VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
> -       vpternlogd $0xde, (VEC_SIZE)(%rdi), %YMM1, %YMM2
> +       vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2
>
>         VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
>         vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> @@ -238,7 +206,7 @@ L(loop_4x_vec):
>         VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
>         /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
>            oring with YMM4. Result is stored in YMM4.  */
> -       vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
> +       vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
>         cmpl $(VEC_SIZE * 2), %edi
>         jae L(8x_last_2x_vec)
>
> @@ -256,68 +224,16 @@ L(8x_last_2x_vec):
>  L(return_neq2):
>         ret
>
> -       /* Relatively cold case as page cross are unexpected.  */
> -       .p2align 4
> -L(page_cross_less_vec):
> -       cmpl $16, %edx
> -       jae L(between_16_31)
> -       cmpl $8, %edx
> -       ja L(between_9_15)
> -       cmpl $4, %edx
> -       jb L(between_2_3)
> -       /* From 4 to 8 bytes.  No branch when size == 4.  */
> -       movl (%rdi), %eax
> -       subl (%rsi), %eax
> -       movl -4(%rdi, %rdx), %ecx
> -       movl -4(%rsi, %rdx), %edi
> -       subl %edi, %ecx
> -       orl %ecx, %eax
> -       ret
> -
> -       .p2align 4,, 8
> -L(between_16_31):
> -       /* From 16 to 31 bytes.  No branch when size == 16.  */
> -
> -       /* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe.
> -        */
> -       vmovdqu (%rsi), %xmm1
> -       vpcmpeqb (%rdi), %xmm1, %xmm1
> -       vmovdqu -16(%rsi, %rdx), %xmm2
> -       vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2
> -       vpand %xmm1, %xmm2, %xmm2
> -       vpmovmskb %xmm2, %eax
> -       notw %ax
> -       /* No ymm register was touched.  */
> -       ret
> -
>         .p2align 4,, 8
> -L(between_9_15):
> -       /* From 9 to 15 bytes.  */
> -       movq (%rdi), %rax
> -       subq (%rsi), %rax
> -       movq -8(%rdi, %rdx), %rcx
> -       movq -8(%rsi, %rdx), %rdi
> -       subq %rdi, %rcx
> -       orq %rcx, %rax
> -       /* edx is guranteed to be a non-zero int.  */
> -       cmovnz %edx, %eax
> -       ret
> -
> -       /* Don't align. This is cold and aligning here will cause code
> -          to spill into next cache line.  */
> -L(between_2_3):
> -       /* From 2 to 3 bytes.  No branch when size == 2.  */
> -       movzwl (%rdi), %eax
> -       movzwl (%rsi), %ecx
> -       subl %ecx, %eax
> -       movzbl -1(%rdi, %rdx), %ecx
> -       /* All machines that support evex will insert a "merging uop"
> -          avoiding any serious partial register stalls.  */
> -       subb -1(%rsi, %rdx), %cl
> -       orl %ecx, %eax
> -       /* No ymm register was touched.  */
> +L(last_2x_vec):
> +       VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
> +       vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
> +       VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
> +       vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
> +       VPTEST %YMM2, %YMM2, %k1
> +       kmovd %k1, %eax
>         ret
>
> -       /* 4 Bytes from next cache line.  */
> +       /* 1 Bytes from next cache line.  */
>  END (MEMCMPEQ)
>  #endif
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu

Thanks.

--
H.J.