From: Danila Kutenin
Date: Wed, 6 Jul 2022 08:42:52 +0100
Subject: Re: [PATCH] aarch64: Optimize string functions with shrn instruction
To: libc-alpha@sourceware.org
Cc: Szabolcs.Nagy@arm.com

Friendly ping

On Mon, Jun 27, 2022 at 5:12 PM Danila Kutenin wrote:

> We found that the string functions were using AND+ADDP to compute the
> nibble/syndrome mask, but there is a simpler alternative:
> `SHRN dst.8b, src.8h, 4` (shift each 16-bit element right by 4 and
> narrow to one byte), which has the same latency as ADDP on all SIMD
> ARMv8 targets. There are similar opportunities in memcmp, but those
> are left for another patch.
>
> We see 10-20% savings for small and mid-size cases (<= 128 bytes),
> which are the primary cases for general workloads.
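For readers who want to see the trick outside of assembly, here is a rough
C-intrinsics sketch of the nibble-mask reduction the patch switches to.
This is an illustration only, not part of the patch: the helper name
nibble_mask is hypothetical and little-endian byte order is assumed.

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical helper, illustration only: build the 64-bit nibble mask
   from a 16-byte comparison result whose lanes are 0x00 or 0xff (as
   produced by CMEQ).  This is the C equivalent of
   "shrn vend.8b, vhas_chr.8h, 4" followed by "fmov synd, dend".  */
static inline uint64_t
nibble_mask (uint8x16_t cmp)
{
  /* View the vector as eight 16-bit lanes, shift each right by 4 and
     narrow to 8 bits: every result byte keeps the high nibble of an
     even input byte and the low nibble of the odd byte after it, so
     each input byte contributes exactly 4 bits, in string order.  */
  uint8x8_t narrowed = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (narrowed), 0);
}

The old sequence reached the same 4-bits-per-byte result with a repeating
0xf00f mask (AND) plus a pairwise add (ADDP); SHRN needs no mask register,
which is why the patch can drop vrepmask and the wtmp setup throughout.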
> ---
>  sysdeps/aarch64/memchr.S    | 25 +++++++++----------------
>  sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
>  sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
>  sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
>  sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
>  sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
>  6 files changed, 59 insertions(+), 102 deletions(-)
>
> diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> index b060eee97d..2053a977b6 100644
> --- a/sysdeps/aarch64/memchr.S
> +++ b/sysdeps/aarch64/memchr.S
> @@ -41,24 +41,21 @@
>  #define synd        x5
>  #define shift       x6
>  #define tmp         x7
> -#define wtmp        w7
>
>  #define vrepchr     v0
>  #define qdata       q1
>  #define vdata       v1
>  #define vhas_chr    v2
> -#define vrepmask    v3
> -#define vend        v4
> -#define dend        d4
> +#define vend        v3
> +#define dend        d3
>
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (MEMCHR)
>          PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (MEMCHR)
>          cbz     cntin, L(nomatch)
>          ld1     {vdata.16b}, [src]
>          dup     vrepchr.16b, chrin
> -        mov     wtmp, 0xf00f
> -        dup     vrepmask.8h, wtmp
>          cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
>          lsl     shift, srcin, 2
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          lsr     synd, synd, shift
>          cbz     synd, L(start_loop)
> @@ -111,8 +105,7 @@ L(loop32_2):
>          fmov    synd, dend
>          cbz     synd, L(loop32)
>  L(end):
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          add     tmp, srcin, cntin
>          sub     cntrem, tmp, src
> diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> index e0efbad91c..5179320720 100644
> --- a/sysdeps/aarch64/memrchr.S
> +++ b/sysdeps/aarch64/memrchr.S
> @@ -37,7 +37,6 @@
>  #define synd        x5
>  #define shift       x6
>  #define tmp         x7
> -#define wtmp        w7
>  #define end         x8
>  #define endm1       x9
>
> @@ -45,18 +44,16 @@
>  #define qdata       q1
>  #define vdata       v1
>  #define vhas_chr    v2
> -#define vrepmask    v3
> -#define vend        v4
> -#define dend        d4
> +#define vend        v3
> +#define dend        d3
>
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (__memrchr)
>          PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (__memrchr)
>          cbz     cntin, L(nomatch)
>          ld1     {vdata.16b}, [src]
>          dup     vrepchr.16b, chrin
> -        mov     wtmp, 0xf00f
> -        dup     vrepmask.8h, wtmp
>          cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
>          neg     shift, end, lsl 2
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          lsl     synd, synd, shift
>          cbz     synd, L(start_loop)
> @@ -109,8 +103,7 @@ L(loop32_2):
>          fmov    synd, dend
>          cbz     synd, L(loop32)
>  L(end):
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    synd, dend
>
>          add     tmp, src, 15
> diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
> index 442726fd49..ee154ab74b 100644
> --- a/sysdeps/aarch64/strchrnul.S
> +++ b/sysdeps/aarch64/strchrnul.S
> @@ -33,38 +33,32 @@
>  #define src         x2
>  #define tmp1        x1
>  #define tmp2        x3
> -#define tmp2w       w3
>
>  #define vrepchr     v0
>  #define vdata       v1
>  #define qdata       q1
>  #define vhas_nul    v2
>  #define vhas_chr    v3
> -#define vrepmask    v4
> -#define vend        v5
> -#define dend        d5
> +#define vend        v4
> +#define dend        d4
>
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (__strchrnul)
>          PTR_ARG (0)
>          bic     src, srcin, 15
>          dup     vrepchr.16b, chrin
>          ld1     {vdata.16b}, [src]
> -        mov     tmp2w, 0xf00f
> -        dup     vrepmask.8h, tmp2w
>          cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
>          cmhs    vhas_chr.16b, vhas_chr.16b, vdata.16b
>          lsl     tmp2, srcin, 2
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    tmp1, dend
>          lsr     tmp1, tmp1, tmp2    /* Mask padding bits.  */
>          cbz     tmp1, L(loop)
> @@ -83,8 +77,7 @@ L(loop):
>          fmov    tmp1, dend
>          cbz     tmp1, L(loop)
>
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    tmp1, dend
>  #ifndef __AARCH64EB__
>          rbit    tmp1, tmp1
> diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> index da53170ece..78d27b4aa6 100644
> --- a/sysdeps/aarch64/strcpy.S
> +++ b/sysdeps/aarch64/strcpy.S
> @@ -40,7 +40,6 @@
>  #define len         x4
>  #define synd        x4
>  #define tmp         x5
> -#define wtmp        w5
>  #define shift       x5
>  #define data1       x6
>  #define dataw1      w6
> @@ -50,9 +49,8 @@
>  #define dataq       q0
>  #define vdata       v0
>  #define vhas_nul    v1
> -#define vrepmask    v2
> -#define vend        v3
> -#define dend        d3
> +#define vend        v2
> +#define dend        d2
>  #define dataq2      q1
>
>  #ifdef BUILD_STPCPY
>  # define IFSTPCPY(X,...) X,__VA_ARGS__
>  #else
>  # define IFSTPCPY(X,...)
>  #endif
>
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (STRCPY)
>          PTR_ARG (0)
>          PTR_ARG (1)
>          bic     src, srcin, 15
> -        mov     wtmp, 0xf00f
>          ld1     {vdata.16b}, [src]
> -        dup     vrepmask.8h, wtmp
>          cmeq    vhas_nul.16b, vdata.16b, 0
>          lsl     shift, srcin, 2
> -        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -        addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> +        shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          lsr     synd, synd, shift
>          cbnz    synd, L(tail)
>
>          ldr     dataq, [src, 16]!
>          cmeq    vhas_nul.16b, vdata.16b, 0
> -        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -        addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> +        shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          cbz     synd, L(start_loop)
>
> @@ -162,8 +155,7 @@ L(loop):
>          fmov    synd, dend
>          cbz     synd, L(loop)
>
> -        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -        addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
>          fmov    synd, dend
>  #ifndef __AARCH64EB__
>          rbit    synd, synd
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index a2310871c2..3a5d088407 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -34,35 +34,29 @@
>  #define src         x1
>  #define synd        x2
>  #define tmp         x3
> -#define wtmp        w3
>  #define shift       x4
>
>  #define data        q0
>  #define vdata       v0
>  #define vhas_nul    v1
> -#define vrepmask    v2
> -#define vend        v3
> -#define dend        d3
> +#define vend        v2
> +#define dend        d2
>
>  /* Core algorithm:
>
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting trailing zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (STRLEN)
>          PTR_ARG (0)
>          bic     src, srcin, 15
> -        mov     wtmp, 0xf00f
>          ld1     {vdata.16b}, [src]
> -        dup     vrepmask.8h, wtmp
>          cmeq    vhas_nul.16b, vdata.16b, 0
>          lsl     shift, srcin, 2
> -        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -        addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          lsr     synd, synd, shift
>          cbz     synd, L(loop)
> @@ -80,8 +74,7 @@ L(loop):
>          fmov    synd, dend
>          cbz     synd, L(loop)
>
> -        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -        addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
>          sub     result, src, srcin
>          fmov    synd, dend
>  #ifndef __AARCH64EB__
> diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> index 0dbecb0ce9..282bddc9aa 100644
> --- a/sysdeps/aarch64/strnlen.S
> +++ b/sysdeps/aarch64/strnlen.S
> @@ -33,39 +33,33 @@
>  #define src         x2
>  #define synd        x3
>  #define shift       x4
> -#define wtmp        w4
>  #define tmp         x4
>  #define cntrem      x5
>
>  #define qdata       q0
>  #define vdata       v0
>  #define vhas_chr    v1
> -#define vrepmask    v2
> -#define vend        v3
> -#define dend        d3
> +#define vend        v2
> +#define dend        d2
>
>  /*
>     Core algorithm:
>
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting trailing zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (__strnlen)
>          PTR_ARG (0)
>          SIZE_ARG (1)
>          bic     src, srcin, 15
> -        mov     wtmp, 0xf00f
>          cbz     cntin, L(nomatch)
>          ld1     {vdata.16b}, [src], 16
> -        dup     vrepmask.8h, wtmp
>          cmeq    vhas_chr.16b, vdata.16b, 0
>          lsl     shift, srcin, 2
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          fmov    synd, dend
>          lsr     synd, synd, shift
>          cbz     synd, L(start_loop)
> @@ -103,8 +97,7 @@ L(loop32_2):
>          cbz     synd, L(loop32)
>
>  L(end):
> -        and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -        addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
> +        shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
>          sub     src, src, 16
>          mov     synd, vend.d[0]
>          sub     result, src, srcin
> --
> 2.37.0.rc0.161.g10f37bed90-goog
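To make the new per-chunk control flow concrete, here is a one-chunk
memchr-style search in C intrinsics mirroring the CMEQ + SHRN + FMOV +
count-zeros sequence from the diff above. This is a minimal sketch, not
the glibc code: the function name is hypothetical, S is assumed 16-byte
aligned, and only the little-endian path is shown; the real code also
handles alignment, the length limit, and big-endian (via RBIT+CLZ).

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical one-chunk search, illustration only: return a pointer
   to the first byte equal to C within the 16 aligned bytes at S, or
   NULL if there is no match in this chunk.  */
static const char *
memchr_chunk (const char *s, char c)
{
  uint8x16_t vdata = vld1q_u8 ((const uint8_t *) s);
  uint8x16_t vhas_chr = vceqq_u8 (vdata, vdupq_n_u8 ((uint8_t) c)); /* cmeq */
  uint8x8_t vend = vshrn_n_u16 (vreinterpretq_u16_u8 (vhas_chr), 4); /* shrn */
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (vend), 0);     /* fmov */
  if (synd == 0)
    return NULL;
  /* Four syndrome bits per byte: the position of the lowest set bit,
     divided by 4, is the index of the first matching byte.  */
  return s + (__builtin_ctzll (synd) >> 2);
}

A zero syndrome lets the scan continue with just a compare and branch per
chunk; only on a hit does the code pay for the bit-twiddling that recovers
the byte index.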