From: Danila Kutenin
Date: Wed, 6 Jul 2022 08:42:52 +0100
Subject: Re: [PATCH] aarch64: Optimize string functions with shrn instruction
To: libc-alpha@sourceware.org
Cc: Szabolcs.Nagy@arm.com

Friendly ping

On Mon, Jun 27, 2022 at 5:12 PM Danila Kutenin wrote:

> We found that the string functions were using AND+ADDP to compute the
> nibble/syndrome mask, but there is a simpler alternative:
> `SHRN dst.8b, src.8h, 4` (shift each 16-bit element right by 4 and
> narrow to one byte), which has the same latency as ADDP on all SIMD
> ARMv8 targets. There are similar opportunities in memcmp, but those
> are left for another patch.
>
> We see 10-20% savings for small and mid-size cases (<= 128 bytes),
> which are the primary cases for general workloads.
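For readers who want to see the trick outside of assembly, here is a rough
C-intrinsics sketch of the nibble-mask reduction the patch switches to.
This is an illustration only, not part of the patch: the helper name
nibble_mask is hypothetical and little-endian byte order is assumed.

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical helper, illustration only: build the 64-bit nibble mask
   from a 16-byte comparison result whose lanes are 0x00 or 0xff (as
   produced by CMEQ).  This is the C equivalent of
   "shrn vend.8b, vhas_chr.8h, 4" followed by "fmov synd, dend".  */
static inline uint64_t
nibble_mask (uint8x16_t cmp)
{
  /* View the vector as eight 16-bit lanes, shift each right by 4 and
     narrow to 8 bits: every result byte keeps the high nibble of an
     even input byte and the low nibble of the odd byte after it, so
     each input byte contributes exactly 4 bits, in string order.  */
  uint8x8_t narrowed = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (narrowed), 0);
}

The old sequence reached the same 4-bits-per-byte result with a repeating
0xf00f mask (AND) plus a pairwise add (ADDP); SHRN needs no mask register,
which is why the patch can drop vrepmask and the wtmp setup throughout.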
> ---
>  sysdeps/aarch64/memchr.S    | 25 +++++++++----------------
>  sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
>  sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
>  sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
>  sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
>  sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
>  6 files changed, 59 insertions(+), 102 deletions(-)
>
> diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> index b060eee97d..2053a977b6 100644
> --- a/sysdeps/aarch64/memchr.S
> +++ b/sysdeps/aarch64/memchr.S
> @@ -41,24 +41,21 @@
>  #define synd        x5
>  #define shift       x6
>  #define tmp         x7
> -#define wtmp        w7
>
>  #define vrepchr     v0
>  #define qdata       q1
>  #define vdata       v1
>  #define vhas_chr    v2
> -#define vrepmask    v3
> -#define vend        v4
> -#define dend        d4
> +#define vend        v3
> +#define dend        d3
>
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (MEMCHR)
>          PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (MEMCHR)
>          cbz     cntin, L(nomatch)
>          ld1     {vdata.16b}, [src]
>          dup     vrepchr.16b, chrin
> -        mov     wtmp, 0xf00f
> -        dup     vrepmask.8h, wtmp
>          cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
>          lsl     shift, srcin, 2
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          lsr     synd, synd, shift
>          cbz     synd, L(start_loop)
> @@ -111,8 +105,7 @@ L(loop32_2):
>          fmov    synd, dend
>          cbz     synd, L(loop32)
>  L(end):
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          add     tmp, srcin, cntin
>          sub     cntrem, tmp, src
> diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> index e0efbad91c..5179320720 100644
> --- a/sysdeps/aarch64/memrchr.S
> +++ b/sysdeps/aarch64/memrchr.S
> @@ -37,7 +37,6 @@
>  #define synd        x5
>  #define shift       x6
>  #define tmp         x7
> -#define wtmp        w7
>  #define end         x8
>  #define endm1       x9
>
> @@ -45,18 +44,16 @@
>  #define qdata       q1
>  #define vdata       v1
>  #define vhas_chr    v2
> -#define vrepmask    v3
> -#define vend        v4
> -#define dend        d4
> +#define vend        v3
> +#define dend        d3
>
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (__memrchr)
>          PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (__memrchr)
>          cbz     cntin, L(nomatch)
>          ld1     {vdata.16b}, [src]
>          dup     vrepchr.16b, chrin
> -        mov     wtmp, 0xf00f
> -        dup     vrepmask.8h, wtmp
>          cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
>          neg     shift, end, lsl 2
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          lsl     synd, synd, shift
>          cbz     synd, L(start_loop)
> @@ -109,8 +103,7 @@ L(loop32_2):
>          fmov    synd, dend
>          cbz     synd, L(loop32)
>  L(end):
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    synd, dend
>
>          add     tmp, src, 15
> diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
> index 442726fd49..ee154ab74b 100644
> --- a/sysdeps/aarch64/strchrnul.S
> +++ b/sysdeps/aarch64/strchrnul.S
> @@ -33,38 +33,32 @@
>  #define src         x2
>  #define tmp1        x1
>  #define tmp2        x3
> -#define tmp2w       w3
>
>  #define vrepchr     v0
>  #define vdata       v1
>  #define qdata       q1
>  #define vhas_nul    v2
>  #define vhas_chr    v3
> -#define vrepmask    v4
> -#define vend        v5
> -#define dend        d5
> +#define vend        v4
> +#define dend        d4
>
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (__strchrnul)
>          PTR_ARG (0)
>          bic     src, srcin, 15
>          dup     vrepchr.16b, chrin
>          ld1     {vdata.16b}, [src]
> -        mov     tmp2w, 0xf00f
> -        dup     vrepmask.8h, tmp2w
>          cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
>          cmhs    vhas_chr.16b, vhas_chr.16b, vdata.16b
>          lsl     tmp2, srcin, 2
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    tmp1, dend
>          lsr     tmp1, tmp1, tmp2    /* Mask padding bits.  */
>          cbz     tmp1, L(loop)
> @@ -83,8 +77,7 @@ L(loop):
>          fmov    tmp1, dend
>          cbz     tmp1, L(loop)
>
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    tmp1, dend
>  #ifndef __AARCH64EB__
>          rbit    tmp1, tmp1
> diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> index da53170ece..78d27b4aa6 100644
> --- a/sysdeps/aarch64/strcpy.S
> +++ b/sysdeps/aarch64/strcpy.S
> @@ -40,7 +40,6 @@
>  #define len         x4
>  #define synd        x4
>  #define tmp         x5
> -#define wtmp        w5
>  #define shift       x5
>  #define data1       x6
>  #define dataw1      w6
> @@ -50,9 +49,8 @@
>  #define dataq       q0
>  #define vdata       v0
>  #define vhas_nul    v1
> -#define vrepmask    v2
> -#define vend        v3
> -#define dend        d3
> +#define vend        v2
> +#define dend        d2
>  #define dataq2      q1
>
>  #ifdef BUILD_STPCPY
>  # define IFSTPCPY(X,...) X,__VA_ARGS__
>  #else
>  # define IFSTPCPY(X,...)
>  #endif
>
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (STRCPY)
>          PTR_ARG (0)
>          PTR_ARG (1)
>          bic     src, srcin, 15
> -        mov     wtmp, 0xf00f
>          ld1     {vdata.16b}, [src]
> -        dup     vrepmask.8h, wtmp
>          cmeq    vhas_nul.16b, vdata.16b, 0
>          lsl     shift, srcin, 2
> -        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -        addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> +        shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          lsr     synd, synd, shift
>          cbnz    synd, L(tail)
>
>          ldr     dataq, [src, 16]!
>          cmeq    vhas_nul.16b, vdata.16b, 0
> -        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -        addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> +        shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          cbz     synd, L(start_loop)
>
> @@ -162,8 +155,7 @@ L(loop):
>          fmov    synd, dend
>          cbz     synd, L(loop)
>
> -        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -        addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
>          fmov    synd, dend
>  #ifndef __AARCH64EB__
>          rbit    synd, synd
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index a2310871c2..3a5d088407 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -34,35 +34,29 @@
>  #define src         x1
>  #define synd        x2
>  #define tmp         x3
> -#define wtmp        w3
>  #define shift       x4
>
>  #define data        q0
>  #define vdata       v0
>  #define vhas_nul    v1
> -#define vrepmask    v2
> -#define vend        v3
> -#define dend        d3
> +#define vend        v2
> +#define dend        d2
>
>  /* Core algorithm:
>
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting trailing zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (STRLEN)
>          PTR_ARG (0)
>          bic     src, srcin, 15
> -        mov     wtmp, 0xf00f
>          ld1     {vdata.16b}, [src]
> -        dup     vrepmask.8h, wtmp
>          cmeq    vhas_nul.16b, vdata.16b, 0
>          lsl     shift, srcin, 2
> -        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -        addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          lsr     synd, synd, shift
>          cbz     synd, L(loop)
> @@ -80,8 +74,7 @@ L(loop):
>          fmov    synd, dend
>          cbz     synd, L(loop)
>
> -        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -        addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
>          sub     result, src, srcin
>          fmov    synd, dend
>  #ifndef __AARCH64EB__
> diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> index 0dbecb0ce9..282bddc9aa 100644
> --- a/sysdeps/aarch64/strnlen.S
> +++ b/sysdeps/aarch64/strnlen.S
> @@ -33,39 +33,33 @@
>  #define src         x2
>  #define synd        x3
>  #define shift       x4
> -#define wtmp        w4
>  #define tmp         x4
>  #define cntrem      x5
>
>  #define qdata       q0
>  #define vdata       v0
>  #define vhas_chr    v1
> -#define vrepmask    v2
> -#define vend        v3
> -#define dend        d3
> +#define vend        v2
> +#define dend        d2
>
>  /*
>     Core algorithm:
>
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting trailing zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (__strnlen)
>          PTR_ARG (0)
>          SIZE_ARG (1)
>          bic     src, srcin, 15
> -        mov     wtmp, 0xf00f
>          cbz     cntin, L(nomatch)
>          ld1     {vdata.16b}, [src], 16
> -        dup     vrepmask.8h, wtmp
>          cmeq    vhas_chr.16b, vdata.16b, 0
>          lsl     shift, srcin, 2
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          lsr     synd, synd, shift
>          cbz     synd, L(start_loop)
> @@ -103,8 +97,7 @@ L(loop32_2):
>          cbz     synd, L(loop32)
>
>  L(end):
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          sub     src, src, 16
>          mov     synd, vend.d[0]
>          sub     result, src, srcin
> --
> 2.37.0.rc0.161.g10f37bed90-goog
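To make the new per-chunk control flow concrete, here is a one-chunk
memchr-style search in C intrinsics mirroring the CMEQ + SHRN + FMOV +
count-zeros sequence from the diff above. This is a minimal sketch, not
the glibc code: the function name is hypothetical, S is assumed 16-byte
aligned, and only the little-endian path is shown; the real code also
handles alignment, the length limit, and big-endian (via RBIT+CLZ).

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical one-chunk search, illustration only: return a pointer
   to the first byte equal to C within the 16 aligned bytes at S, or
   NULL if there is no match in this chunk.  */
static const char *
memchr_chunk (const char *s, char c)
{
  uint8x16_t vdata = vld1q_u8 ((const uint8_t *) s);
  uint8x16_t vhas_chr = vceqq_u8 (vdata, vdupq_n_u8 ((uint8_t) c)); /* cmeq */
  uint8x8_t vend = vshrn_n_u16 (vreinterpretq_u16_u8 (vhas_chr), 4); /* shrn */
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (vend), 0);     /* fmov */
  if (synd == 0)
    return NULL;
  /* Four syndrome bits per byte: the position of the lowest set bit,
     divided by 4, is the index of the first matching byte.  */
  return s + (__builtin_ctzll (synd) >> 2);
}

A zero syndrome lets the scan continue with just a compare and branch per
chunk; only on a hit does the code pay for the bit-twiddling that recovers
the byte index.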