Re: [PATCH] AArch64: Improve strnlen performance

public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed

From: Szabolcs Nagy <Szabolcs.Nagy@arm.com>
To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Cc: 'GNU C Library' <libc-alpha@sourceware.org>
Subject: Re: [PATCH] AArch64: Improve strnlen performance
Date: Wed, 30 Jun 2021 09:41:46 +0100	[thread overview]
Message-ID: <20210630084145.GD14854@arm.com> (raw)
In-Reply-To: <VE1PR08MB5599E4DA23BB432BDE1ABDEB83089@VE1PR08MB5599.eurprd08.prod.outlook.com>

The 06/23/2021 16:22, Wilco Dijkstra wrote:
> 
> Optimize strnlen by avoiding UMINV which is slow on most cores. On Neoverse N1
> large strings are 1.8x faster than the current version, and bench-strnlen is 50%
> faster overall. This version is MTE compatible.
> 
> Passes GLIBC regress, OK for commit?

This is ok to commit, thanks.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>


> diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> index 2b57575c55cc41a5c6aa813af216c6e34f6cb7b0..37e9eed4120750f4e03d563938438b8c5384f75d 100644
> --- a/sysdeps/aarch64/strnlen.S
> +++ b/sysdeps/aarch64/strnlen.S
> @@ -22,197 +22,105 @@
> 
>  /* Assumptions:
>   *
> - * ARMv8-a, AArch64
> + * ARMv8-a, AArch64, Advanced SIMD.
> + * MTE compatible.
>   */
> 
> -/* Arguments and results.  */
>  #define srcin          x0
> -#define len            x0
> -#define limit          x1
> +#define cntin          x1
> +#define result         x0
> 
> -/* Locals and temporaries.  */
>  #define src            x2
> -#define data1          x3
> -#define data2          x4
> -#define data2a         x5
> -#define has_nul1       x6
> -#define has_nul2       x7
> -#define tmp1           x8
> -#define tmp2           x9
> -#define tmp3           x10
> -#define tmp4           x11
> -#define zeroones       x12
> -#define pos            x13
> -#define limit_wd       x14
> -
> -#define dataq          q2
> -#define datav          v2
> -#define datab2         b3
> -#define dataq2         q3
> -#define datav2         v3
> -#define REP8_01 0x0101010101010101
> -#define REP8_7f 0x7f7f7f7f7f7f7f7f
> -#define REP8_80 0x8080808080808080
> -
> -ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
> +#define synd           x3
> +#define        shift           x4
> +#define wtmp           w4
> +#define tmp            x4
> +#define cntrem         x5
> +
> +#define qdata          q0
> +#define vdata          v0
> +#define vhas_chr       v1
> +#define vrepmask       v2
> +#define vend           v3
> +#define dend           d3
> +
> +/*
> +   Core algorithm:
> +
> +   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> +   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> +   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> +   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> +   bits in the syndrome reflect the order in which things occur in the original
> +   string, counting trailing zeros identifies exactly which byte matched.  */
> +
> +ENTRY (__strnlen)
>         PTR_ARG (0)
>         SIZE_ARG (1)
> -       cbz     limit, L(hit_limit)
> -       mov     zeroones, #REP8_01
> -       bic     src, srcin, #15
> -       ands    tmp1, srcin, #15
> -       b.ne    L(misaligned)
> -       /* Calculate the number of full and partial words -1.  */
> -       sub     limit_wd, limit, #1     /* Limit != 0, so no underflow.  */
> -       lsr     limit_wd, limit_wd, #4  /* Convert to Qwords.  */
> -
> -       /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> -          (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> -          can be done in parallel across the entire word.  */
> -       /* The inner loop deals with two Dwords at a time.  This has a
> -          slightly higher start-up cost, but we should win quite quickly,
> -          especially on cores with a high number of issue slots per
> -          cycle, as we get much better parallelism out of the operations.  */
> -
> -       /* Start of critial section -- keep to one 64Byte cache line.  */
> -
> -       ldp     data1, data2, [src], #16
> -L(realigned):
> -       sub     tmp1, data1, zeroones
> -       orr     tmp2, data1, #REP8_7f
> -       sub     tmp3, data2, zeroones
> -       orr     tmp4, data2, #REP8_7f
> -       bic     has_nul1, tmp1, tmp2
> -       bic     has_nul2, tmp3, tmp4
> -       subs    limit_wd, limit_wd, #1
> -       orr     tmp1, has_nul1, has_nul2
> -       ccmp    tmp1, #0, #0, pl        /* NZCV = 0000  */
> -       b.eq    L(loop)
> -       /* End of critical section -- keep to one 64Byte cache line.  */
> -
> -       orr     tmp1, has_nul1, has_nul2
> -       cbz     tmp1, L(hit_limit)      /* No null in final Qword.  */
> -
> -       /* We know there's a null in the final Qword.  The easiest thing
> -          to do now is work out the length of the string and return
> -          MIN (len, limit).  */
> -
> -       sub     len, src, srcin
> -       cbz     has_nul1, L(nul_in_data2)
> -#ifdef __AARCH64EB__
> -       mov     data2, data1
> +       bic     src, srcin, 15
> +       mov     wtmp, 0xf00f
> +       cbz     cntin, L(nomatch)
> +       ld1     {vdata.16b}, [src], 16
> +       dup     vrepmask.8h, wtmp
> +       cmeq    vhas_chr.16b, vdata.16b, 0
> +       lsl     shift, srcin, 2
> +       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> +       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +       fmov    synd, dend
> +       lsr     synd, synd, shift
> +       cbz     synd, L(start_loop)
> +L(finish):
> +       rbit    synd, synd
> +       clz     synd, synd
> +       lsr     result, synd, 2
> +       cmp     cntin, result
> +       csel    result, cntin, result, ls
> +       ret
> +
> +L(start_loop):
> +       sub     tmp, src, srcin
> +       subs    cntrem, cntin, tmp
> +       b.ls    L(nomatch)
> +
> +       /* Make sure that it won't overread by a 16-byte chunk */
> +       add     tmp, cntrem, 15
> +       tbnz    tmp, 4, L(loop32_2)
> +
> +       .p2align 5
> +L(loop32):
> +       ldr     qdata, [src], 16
> +       cmeq    vhas_chr.16b, vdata.16b, 0
> +       umaxp   vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +       fmov    synd, dend
> +       cbnz    synd, L(end)
> +L(loop32_2):
> +       ldr     qdata, [src], 16
> +       subs    cntrem, cntrem, 32
> +       cmeq    vhas_chr.16b, vdata.16b, 0
> +       b.ls    L(end)
> +       umaxp   vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +       fmov    synd, dend
> +       cbz     synd, L(loop32)
> +
> +L(end):
> +       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> +       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +       sub     src, src, 16
> +       mov     synd, vend.d[0]
> +       sub     result, src, srcin
> +#ifndef __AARCH64EB__
> +       rbit    synd, synd
>  #endif
> -       sub     len, len, #8
> -       mov     has_nul2, has_nul1
> -L(nul_in_data2):
> -#ifdef __AARCH64EB__
> -       /* For big-endian, carry propagation (if the final byte in the
> -          string is 0x01) means we cannot use has_nul directly.  The
> -          easiest way to get the correct byte is to byte-swap the data
> -          and calculate the syndrome a second time.  */
> -       rev     data2, data2
> -       sub     tmp1, data2, zeroones
> -       orr     tmp2, data2, #REP8_7f
> -       bic     has_nul2, tmp1, tmp2
> -#endif
> -       sub     len, len, #8
> -       rev     has_nul2, has_nul2
> -       clz     pos, has_nul2
> -       add     len, len, pos, lsr #3           /* Bits to bytes.  */
> -       cmp     len, limit
> -       csel    len, len, limit, ls             /* Return the lower value.  */
> -       RET
> -
> -L(loop):
> -       ldr     dataq, [src], #16
> -       uminv   datab2, datav.16b
> -       mov     tmp1, datav2.d[0]
> -       subs    limit_wd, limit_wd, #1
> -       ccmp    tmp1, #0, #4, pl        /* NZCV = 0000  */
> -       b.eq    L(loop_end)
> -       ldr     dataq, [src], #16
> -       uminv   datab2, datav.16b
> -       mov     tmp1, datav2.d[0]
> -       subs    limit_wd, limit_wd, #1
> -       ccmp    tmp1, #0, #4, pl        /* NZCV = 0000  */
> -       b.ne    L(loop)
> -L(loop_end):
> -       /* End of critical section -- keep to one 64Byte cache line.  */
> -
> -       cbnz    tmp1, L(hit_limit)      /* No null in final Qword.  */
> -
> -       /* We know there's a null in the final Qword.  The easiest thing
> -          to do now is work out the length of the string and return
> -          MIN (len, limit).  */
> -
> -#ifdef __AARCH64EB__
> -       rev64   datav.16b, datav.16b
> -#endif
> -       /* Set te NULL byte as 0xff and the rest as 0x00, move the data into a
> -          pair of scalars and then compute the length from the earliest NULL
> -          byte.  */
> -
> -       cmeq    datav.16b, datav.16b, #0
> -#ifdef __AARCH64EB__
> -       mov     data1, datav.d[1]
> -       mov     data2, datav.d[0]
> -#else
> -       mov     data1, datav.d[0]
> -       mov     data2, datav.d[1]
> -#endif
> -       cmp     data1, 0
> -       csel    data1, data1, data2, ne
> -       sub     len, src, srcin
> -       sub     len, len, #16
> -       rev     data1, data1
> -       add     tmp2, len, 8
> -       clz     tmp1, data1
> -       csel    len, len, tmp2, ne
> -       add     len, len, tmp1, lsr 3
> -       cmp     len, limit
> -       csel    len, len, limit, ls             /* Return the lower value.  */
> -       RET
> -
> -L(misaligned):
> -       /* Deal with a partial first word.
> -          We're doing two things in parallel here;
> -          1) Calculate the number of words (but avoiding overflow if
> -             limit is near ULONG_MAX) - to do this we need to work out
> -             limit + tmp1 - 1 as a 65-bit value before shifting it;
> -          2) Load and mask the initial data words - we force the bytes
> -             before the ones we are interested in to 0xff - this ensures
> -             early bytes will not hit any zero detection.  */
> -       sub     limit_wd, limit, #1
> -       neg     tmp4, tmp1
> -       cmp     tmp1, #8
> -
> -       and     tmp3, limit_wd, #15
> -       lsr     limit_wd, limit_wd, #4
> -       mov     tmp2, #~0
> -
> -       ldp     data1, data2, [src], #16
> -       lsl     tmp4, tmp4, #3          /* Bytes beyond alignment -> bits.  */
> -       add     tmp3, tmp3, tmp1
> -
> -#ifdef __AARCH64EB__
> -       /* Big-endian.  Early bytes are at MSB.  */
> -       lsl     tmp2, tmp2, tmp4        /* Shift (tmp1 & 63).  */
> -#else
> -       /* Little-endian.  Early bytes are at LSB.  */
> -       lsr     tmp2, tmp2, tmp4        /* Shift (tmp1 & 63).  */
> -#endif
> -       add     limit_wd, limit_wd, tmp3, lsr #4
> -
> -       orr     data1, data1, tmp2
> -       orr     data2a, data2, tmp2
> +       clz     synd, synd
> +       add     result, result, synd, lsr 2
> +       cmp     cntin, result
> +       csel    result, cntin, result, ls
> +       ret
> 
> -       csinv   data1, data1, xzr, le
> -       csel    data2, data2, data2a, le
> -       b       L(realigned)
> +L(nomatch):
> +       mov     result, cntin
> +       ret
> 
> -L(hit_limit):
> -       mov     len, limit
> -       RET
>  END (__strnlen)
>  libc_hidden_def (__strnlen)
>  weak_alias (__strnlen, strnlen)
>

     prev parent reply	other threads:[~2021-06-30  8:42 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-06-23 15:22 Wilco Dijkstra
2021-06-30  8:41 ` Szabolcs Nagy [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210630084145.GD14854@arm.com \
    --to=szabolcs.nagy@arm.com \
    --cc=Wilco.Dijkstra@arm.com \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).