From mboxrd@z Thu Jan 1 00:00:00 1970
Subject: Re: [PATCH] AArch64: Improve strlen_asimd performance (bug 25824)
To: Wilco Dijkstra, 'GNU C Library', Szabolcs Nagy
From: Carlos O'Donell
Organization: Red Hat
Message-ID: <6b81d85c-9683-2b38-710a-d4bbe3c76198@redhat.com>
Date: Thu, 16 Jul 2020 11:31:04 -0400
List-Id: Libc-alpha mailing list

On 7/16/20 9:00 AM, Wilco Dijkstra wrote:
> Optimize strlen using a mix of scalar and SIMD code. On modern micro
> architectures large strings are 2.6 times faster than existing strlen_asimd
> and 35% faster than the new MTE version of strlen.
>
> On a random strlen benchmark using small sizes the speedup is 7% vs
> strlen_asimd and 40% vs the MTE strlen. This fixes the main strlen
> regressions on Cortex-A53 and other cores with a simple Neon unit
> (see https://sourceware.org/pipermail/libc-alpha/2020-June/114641.html )
>
> Rename __strlen_generic to __strlen_mte, and select strlen_asimd when
> MTE is not enabled (this is waiting on support for a HWCAP_MTE bit
> which can hopefully be added soon).
>
> This fixes big-endian bug 25824. Passes GLIBC regression tests.
>
> I'd like this for 2.32 to fix the bug and avoid any regressions.

The copyright/description changes don't look correct. Overall this is OK
for 2.32, but Szabolcs should review this also.

> ---
>
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index a65c554bf3a60ccbed6b519bbbc46aabdf5b6025..4377df0735287c210efd661188f9e6e3923c8003 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -4,5 +4,5 @@ sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
> memcpy_new \
> memset_generic memset_falkor memset_emag memset_kunpeng \
> memchr_generic memchr_nosimd \
> - strlen_generic strlen_asimd
> + strlen_mte strlen_asimd
> endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index c1df2a012ae17f45f26bd45fc98fe45b2f4d9eb1..1e22fdf8726bf4cd92aed09401b2772f514bf3dc 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -62,7 +62,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> IFUNC_IMPL (i, name, strlen,
> IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
> - IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
> + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_mte))
>
> return i;
> }
> diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
> index 99f2cf2cde54fd1158383d097ba51edc1377f55b..7c0352dd878086708ac785807bc4d210b85e528f 100644
> --- a/sysdeps/aarch64/multiarch/strlen.c
> +++ b/sysdeps/aarch64/multiarch/strlen.c
> @@ -26,17 +26,15 @@
> # include
> # include
>
> -#define USE_ASIMD_STRLEN() IS_FALKOR (midr)
> +/* This should check HWCAP_MTE when it is available. */
> +#define MTE_ENABLED() (false)

OK.

>
> extern __typeof (__redirect_strlen) __strlen;
>
> -extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
> +extern __typeof (__redirect_strlen) __strlen_mte attribute_hidden;
> extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
>
> -libc_ifunc (__strlen,
> - (USE_ASIMD_STRLEN () || IS_KUNPENG920 (midr)
> - ? __strlen_asimd
> - :__strlen_generic));
> +libc_ifunc (__strlen, (MTE_ENABLED () ? __strlen_mte : __strlen_asimd));

OK.

>
> # undef strlen
> strong_alias (__strlen, strlen);
> diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
> index 236a2c96a6eb5f02b0f0847d230857f0aee87fbe..076a905dceae501d85c1ab59a2250d8305c718f2 100644
> --- a/sysdeps/aarch64/multiarch/strlen_asimd.S
> +++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
> @@ -1,5 +1,4 @@
> -/* Strlen implementation that uses ASIMD instructions for load and NULL checks.
> - Copyright (C) 2018-2020 Free Software Foundation, Inc.

Why are you removing the first line description?

> +/* Copyright (C) 2020 Free Software Foundation, Inc.

This needs justification.

>
> This file is part of the GNU C Library.
>
> @@ -20,80 +19,90 @@
> #include
>
> /* Assumptions:
> + *
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> + * Not MTE compatible.
> + */
> +
> +#define srcin x0
> +#define len x0
> +
> +#define src x1
> +#define data1 x2
> +#define data2 x3
> +#define has_nul1 x4
> +#define has_nul2 x5
> +#define tmp1 x4
> +#define tmp2 x5
> +#define tmp3 x6
> +#define tmp4 x7
> +#define zeroones x8
> +
> +#define maskv v0
> +#define maskd d0
> +#define dataq1 q1
> +#define dataq2 q2
> +#define datav1 v1
> +#define datav2 v2
> +#define tmp x2
> +#define tmpw w2
> +#define synd x3
> +#define shift x4
> +
> +/* For the first 32 bytes, NUL detection works on the principle that
> + (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
> + byte is zero, and can be done in parallel across the entire word. */
>
> - ARMv8-a, AArch64, ASIMD, unaligned accesses, min page size 4k. */
> +#define REP8_01 0x0101010101010101
> +#define REP8_7f 0x7f7f7f7f7f7f7f7f
>
> /* To test the page crossing code path more thoroughly, compile with
> -DTEST_PAGE_CROSS - this will force all calls through the slower
> entry path. This option is not intended for production use. */
>
> -/* Arguments and results. */
> -#define srcin x0
> -#define len x0
> -
> -/* Locals and temporaries. */
> -#define src x1
> -#define data1 x2
> -#define data2 x3
> -#define has_nul1 x4
> -#define has_nul2 x5
> -#define tmp1 x4
> -#define tmp2 x5
> -#define tmp3 x6
> -#define tmp4 x7
> -#define zeroones x8
> -#define dataq q2
> -#define datav v2
> -#define datab2 b3
> -#define dataq2 q3
> -#define datav2 v3
> -
> -#define REP8_01 0x0101010101010101
> -#define REP8_7f 0x7f7f7f7f7f7f7f7f
> -
> #ifdef TEST_PAGE_CROSS
> -# define MIN_PAGE_SIZE 16
> +# define MIN_PAGE_SIZE 32
> #else
> # define MIN_PAGE_SIZE 4096
> #endif
>
> - /* Since strings are short on average, we check the first 16 bytes
> - of the string for a NUL character. In order to do an unaligned load
> - safely we have to do a page cross check first. If there is a NUL
> - byte we calculate the length from the 2 8-byte words using
> - conditional select to reduce branch mispredictions (it is unlikely
> - strlen_asimd will be repeatedly called on strings with the same
> - length).
> -
> - If the string is longer than 16 bytes, we align src so don't need
> - further page cross checks, and process 16 bytes per iteration.
> -
> - If the page cross check fails, we read 16 bytes from an aligned
> - address, remove any characters before the string, and continue
> - in the main loop using aligned loads. Since strings crossing a
> - page in the first 16 bytes are rare (probability of
> - 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
> -
> - AArch64 systems have a minimum page size of 4k. We don't bother
> - checking for larger page sizes - the cost of setting up the correct
> - page size is just not worth the extra gain from a small reduction in
> - the cases taking the slow path. Note that we only care about
> - whether the first fetch, which may be misaligned, crosses a page
> - boundary. */
> -
> -ENTRY_ALIGN (__strlen_asimd, 6)
> - DELOUSE (0)
> - DELOUSE (1)
> +/* Core algorithm:
> +
> + Since strings are short on average, we check the first 32 bytes of the
> + string for a NUL character without aligning the string. In order to use
> + unaligned loads safely we must do a page cross check first.
> +
> + If there is a NUL byte we calculate the length from the 2 8-byte words
> + using conditional select to reduce branch mispredictions (it is unlikely
> + strlen will be repeatedly called on strings with the same length).
> +
> + If the string is longer than 32 bytes, align src so we don't need further
> + page cross checks, and process 32 bytes per iteration using a fast SIMD
> + loop.
> +
> + If the page cross check fails, we read 32 bytes from an aligned address,
> + and ignore any characters before the string. If it contains a NUL
> + character, return the length, if not, continue in the main loop. */
> +
> +ENTRY (__strlen_asimd)
> + DELOUSE (0)
> +
> and tmp1, srcin, MIN_PAGE_SIZE - 1
> - mov zeroones, REP8_01
> - cmp tmp1, MIN_PAGE_SIZE - 16
> - b.gt L(page_cross)
> + cmp tmp1, MIN_PAGE_SIZE - 32
> + b.hi L(page_cross)
> +
> + /* Look for a NUL byte in the first 16 bytes. */
> ldp data1, data2, [srcin]
> + mov zeroones, REP8_01
> +
> #ifdef __AARCH64EB__
> + /* For big-endian, carry propagation (if the final byte in the
> + string is 0x01) means we cannot use has_nul1/2 directly.
> + Since we expect strings to be small and early-exit,
> + byte-swap the data now so has_null1/2 will be correct. */
> rev data1, data1
> rev data2, data2
> #endif
> -
> sub tmp1, data1, zeroones
> orr tmp2, data1, REP8_7f
> sub tmp3, data2, zeroones
> @@ -101,78 +110,105 @@ ENTRY_ALIGN (__strlen_asimd, 6)
> bics has_nul1, tmp1, tmp2
> bic has_nul2, tmp3, tmp4
> ccmp has_nul2, 0, 0, eq
> - beq L(main_loop_entry)
> + b.eq L(bytes16_31)
> +
> + /* Find the exact offset of the first NUL byte in the first 16 bytes
> + from the string start. Enter with C = has_nul1 == 0. */
> csel has_nul1, has_nul1, has_nul2, cc
> mov len, 8
> rev has_nul1, has_nul1
> - clz tmp1, has_nul1
> csel len, xzr, len, cc
> + clz tmp1, has_nul1
> add len, len, tmp1, lsr 3
> ret
>
> -L(main_loop_entry):
> - bic src, srcin, 15
> - sub src, src, 16
> -
> -L(main_loop):
> - ldr dataq, [src, 32]!
> -L(page_cross_entry):
> - /* Get the minimum value and keep going if it is not zero. */
> - uminv datab2, datav.16b
> - mov tmp1, datav2.d[0]
> - cbz tmp1, L(tail)
> - ldr dataq, [src, 16]
> - uminv datab2, datav.16b
> - mov tmp1, datav2.d[0]
> - cbnz tmp1, L(main_loop)
> - add src, src, 16
> -
> -L(tail):
> + .p2align 3
> + /* Look for a NUL byte at offset 16..31 in the string. */
> +L(bytes16_31):
> + ldp data1, data2, [srcin, 16]
> #ifdef __AARCH64EB__
> - rev64 datav.16b, datav.16b
> -#endif
> - /* Set te NULL byte as 0xff and the rest as 0x00, move the data into a
> - pair of scalars and then compute the length from the earliest NULL
> - byte. */
> - cmeq datav.16b, datav.16b, #0
> - mov data1, datav.d[0]
> - mov data2, datav.d[1]
> - cmp data1, 0
> - csel data1, data1, data2, ne
> - sub len, src, srcin
> rev data1, data1
> - add tmp2, len, 8
> - clz tmp1, data1
> - csel len, len, tmp2, ne
> + rev data2, data2
> +#endif
> + sub tmp1, data1, zeroones
> + orr tmp2, data1, REP8_7f
> + sub tmp3, data2, zeroones
> + orr tmp4, data2, REP8_7f
> + bics has_nul1, tmp1, tmp2
> + bic has_nul2, tmp3, tmp4
> + ccmp has_nul2, 0, 0, eq
> + b.eq L(loop_entry)
> +
> + /* Find the exact offset of the first NUL byte at offset 16..31 from
> + the string start. Enter with C = has_nul1 == 0. */
> + csel has_nul1, has_nul1, has_nul2, cc
> + mov len, 24
> + rev has_nul1, has_nul1
> + mov tmp3, 16
> + clz tmp1, has_nul1
> + csel len, tmp3, len, cc
> add len, len, tmp1, lsr 3
> ret
>
> - /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
> - srcin to 0xff, so we ignore any NUL bytes before the string.
> - Then continue in the aligned loop. */
> -L(page_cross):
> - mov tmp3, 63
> - bic src, srcin, 15
> - and tmp1, srcin, 7
> - ands tmp2, srcin, 8
> - ldr dataq, [src]
> - lsl tmp1, tmp1, 3
> - csel tmp2, tmp2, tmp1, eq
> - csel tmp1, tmp1, tmp3, eq
> - mov tmp4, -1
> +L(loop_entry):
> + bic src, srcin, 31
> +
> + .p2align 5
> +L(loop):
> + ldp dataq1, dataq2, [src, 32]!
> + uminp maskv.16b, datav1.16b, datav2.16b
> + uminp maskv.16b, maskv.16b, maskv.16b
> + cmeq maskv.8b, maskv.8b, 0
> + fmov synd, maskd
> + cbz synd, L(loop)
> +
> + /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
> + cmeq maskv.16b, datav1.16b, 0
> + sub len, src, srcin
> + tst synd, 0xffffffff
> + b.ne 1f
> + cmeq maskv.16b, datav2.16b, 0
> + add len, len, 16
> +1:
> + /* Generate a bitmask and compute correct byte offset. */
> #ifdef __AARCH64EB__
> - /* Big-endian. Early bytes are at MSB. */
> - lsr tmp1, tmp4, tmp1
> - lsr tmp2, tmp4, tmp2
> + bic maskv.8h, 0xf0
> #else
> - /* Little-endian. Early bytes are at LSB. */
> - lsl tmp1, tmp4, tmp1
> - lsl tmp2, tmp4, tmp2
> + bic maskv.8h, 0x0f, lsl 8
> +#endif
> + umaxp maskv.16b, maskv.16b, maskv.16b
> + fmov synd, maskd
> +#ifndef __AARCH64EB__
> + rbit synd, synd
> #endif
> - mov datav2.d[0], tmp1
> - mov datav2.d[1], tmp2
> - orn datav.16b, datav.16b, datav2.16b
> - b L(page_cross_entry)
> + clz tmp, synd
> + add len, len, tmp, lsr 2
> + ret
> +
> + .p2align 4
> +
> +L(page_cross):
> + bic src, srcin, 31
> + mov tmpw, 0x0c03
> + movk tmpw, 0xc030, lsl 16
> + ld1 {datav1.16b, datav2.16b}, [src]
> + dup maskv.4s, tmpw
> + cmeq datav1.16b, datav1.16b, 0
> + cmeq datav2.16b, datav2.16b, 0
> + and datav1.16b, datav1.16b, maskv.16b
> + and datav2.16b, datav2.16b, maskv.16b
> + addp maskv.16b, datav1.16b, datav2.16b
> + addp maskv.16b, maskv.16b, maskv.16b
> + fmov synd, maskd
> + lsl shift, srcin, 1
> + lsr synd, synd, shift
> + cbz synd, L(loop)
> +
> + rbit synd, synd
> + clz len, synd
> + lsr len, len, 1
> + ret
> +
> END (__strlen_asimd)
> weak_alias (__strlen_asimd, strlen_asimd)
> libc_hidden_builtin_def (strlen_asimd)
> diff --git a/sysdeps/aarch64/multiarch/strlen_generic.S b/sysdeps/aarch64/multiarch/strlen_mte.S
> similarity index 88%
> rename from sysdeps/aarch64/multiarch/strlen_generic.S
> rename to sysdeps/aarch64/multiarch/strlen_mte.S
> index 61d3f72c9985bdd103d5e4c68337fed4a55511be..b8daa54dd89afbd99a6338cef45f49a25defaa26 100644
> --- a/sysdeps/aarch64/multiarch/strlen_generic.S
> +++ b/sysdeps/aarch64/multiarch/strlen_mte.S

OK. Rename.

> @@ -17,14 +17,14 @@
> . */
>
> /* The actual strlen code is in ../strlen.S. If we are building libc this file
> - defines __strlen_generic. Otherwise the include of ../strlen.S will define
> + defines __strlen_mte. Otherwise the include of ../strlen.S will define

OK.

> the normal __strlen entry points. */
>
> #include
>
> #if IS_IN (libc)
>
> -# define STRLEN __strlen_generic
> +# define STRLEN __strlen_mte
>
> /* Do not hide the generic version of strlen, we use it internally. */
> # undef libc_hidden_builtin_def
> @@ -32,7 +32,7 @@
>
> # ifdef SHARED
> /* It doesn't make sense to send libc-internal strlen calls through a PLT. */
> - .globl __GI_strlen; __GI_strlen = __strlen_generic
> + .globl __GI_strlen; __GI_strlen = __strlen_mte
> # endif
> #endif
>
>

-- 
Cheers,
Carlos.