public inbox for libc-alpha@sourceware.org
From: Szabolcs Nagy <Szabolcs.Nagy@arm.com>
To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Cc: 'GNU C Library' <libc-alpha@sourceware.org>
Subject: Re: [PATCH] AArch64: Add SVE memcpy
Date: Tue, 7 Jun 2022 14:55:06 +0100
Message-ID: <Yp9YusSE55Uz1OUB@arm.com>
In-Reply-To: <DB6PR0801MB1879E66ED9048B425C61981D83DF9@DB6PR0801MB1879.eurprd08.prod.outlook.com>

On 06/01/2022 17:03, Wilco Dijkstra wrote:
> Hi Szabolcs,
> 
> > i'd expect MAX_IFUNC to be updated whenever we add a new memcpy.
> 
> Yes, this macro is quite broken: the value 7 should be correct, but I've set it to 8 for now. The correct approach is to check against 'max' inside IFUNC_IMPL_ADD.
> 
> Here is v2 (which also improves the readability of the ifunc selectors):
> 
> Add an initial SVE memcpy implementation.  Copies of up to 32 bytes use SVE
> vectors, which improves the random memcpy benchmark significantly.  Also clean
> up the memcpy and memmove ifunc selectors.
> 

thanks, this looks good.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
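
A few side notes for the archive below, nothing blocking.

On the MAX_IFUNC point: the "check against 'max' inside IFUNC_IMPL_ADD"
idea could look roughly like the standalone sketch below.  This is only
an illustration with made-up names (struct impl_entry, IMPL_ADD), not
the actual macro or struct in ifunc-impl-list.h; the point is that the
capacity check moves into the add step, so a forgotten MAX_IFUNC bump
fails loudly instead of overflowing the array.

#include <assert.h>
#include <stddef.h>

/* Hypothetical stand-in for struct libc_ifunc_impl.  */
struct impl_entry { const char *name; void *fn; int usable; };

/* Bounds-checked add: the capacity is supplied by the caller, so adding
   one implementation too many trips the assert instead of writing past
   the end of the array.  */
#define IMPL_ADD(array, i, max, name_, fn_, usable_) \
  do \
    { \
      assert ((i) < (max)); \
      (array)[(i)].name = (name_); \
      (array)[(i)].fn = (void *) (fn_); \
      (array)[(i)].usable = (usable_); \
      (i)++; \
    } \
  while (0)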

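For readers less familiar with SVE, the small-copy path (whilelo plus
predicated ld1b/st1b for sizes up to 32 bytes) corresponds roughly to
the ACLE intrinsics sketch below.  This is my own illustration, not
part of the patch, and it folds the VL == 16 and VL >= 32 cases into
one unconditional two-vector form; build with SVE enabled, e.g.
-march=armv8.2-a+sve.

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

/* Copy n <= 32 bytes with predicated loads/stores and no branch on the
   exact size.  The second predicate is all-false whenever n fits in a
   single vector, so those accesses touch no memory.  */
static void
copy_le32_sve (uint8_t *dst, const uint8_t *src, size_t n)
{
  uint64_t vl = svcntb ();                 /* Vector length in bytes.  */
  svbool_t p0 = svwhilelt_b8_u64 (0, n);   /* Lanes for bytes 0 .. n-1.  */
  svbool_t p1 = svwhilelt_b8_u64 (vl, n);  /* Lanes for bytes vl .. n-1.  */
  svuint8_t a = svld1_u8 (p0, src);
  svuint8_t b = svld1_u8 (p1, src + vl);
  svst1_u8 (p0, dst, a);
  svst1_u8 (p1, dst + vl, b);
}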

> ---
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index 7500cf1e9369443daed1c5a938903dcff9d0a45e..490aaa0faf1e74fb7014e2ed6b574d92b8b83637 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,6 +1,6 @@
>  ifeq ($(subdir),string)
>  sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
> -                  memcpy_falkor memcpy_a64fx \
> +                  memcpy_falkor memcpy_a64fx memcpy_sve \
>                    memset_generic memset_falkor memset_emag memset_kunpeng \
>                    memset_a64fx \
>                    memchr_generic memchr_nosimd \
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 483fbbdb523c9a5b697a9b7f63153d2cefb482ac..f6c6d008da285122cf3549efa5adf05d5e098680 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -25,7 +25,7 @@
>  #include <stdio.h>
> 
>  /* Maximum number of IFUNC implementations.  */
> -#define MAX_IFUNC      7
> +#define MAX_IFUNC      8
> 
>  size_t
>  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> @@ -45,6 +45,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
>  #if HAVE_AARCH64_SVE_ASM
>               IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
> +             IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve)
>  #endif
>               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
>    IFUNC_IMPL (i, name, memmove,
> @@ -54,6 +55,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
>  #if HAVE_AARCH64_SVE_ASM
>               IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
> +             IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve)
>  #endif
>               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
>    IFUNC_IMPL (i, name, memset,
> diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> index a476dd548fd04fddcd0b39cc3263866769bb309a..0486213f08db8ab08a903c4b9e6bdd19df6dccec 100644
> --- a/sysdeps/aarch64/multiarch/memcpy.c
> +++ b/sysdeps/aarch64/multiarch/memcpy.c
> @@ -33,27 +33,38 @@ extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
> -# if HAVE_AARCH64_SVE_ASM
>  extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
> -# endif
> -
> -libc_ifunc (__libc_memcpy,
> -            (IS_THUNDERX (midr)
> -            ? __memcpy_thunderx
> -            : (IS_FALKOR (midr) || IS_PHECDA (midr)
> -               ? __memcpy_falkor
> -               : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
> -                  ? __memcpy_thunderx2
> -                  : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
> -                     || IS_NEOVERSE_V1 (midr)
> -                     ? __memcpy_simd
> -# if HAVE_AARCH64_SVE_ASM
> -                    : (IS_A64FX (midr) && sve
> -                       ? __memcpy_a64fx
> -                       : __memcpy_generic))))));
> -# else
> -                    : __memcpy_generic)))));
> -# endif
> +extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden;
> +
> +static inline __typeof (__redirect_memcpy) *
> +select_memcpy_ifunc (void)
> +{
> +  INIT_ARCH ();
> +
> +  if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
> +    return __memcpy_simd;
> +
> +  if (sve && HAVE_AARCH64_SVE_ASM)
> +    {
> +      if (IS_A64FX (midr))
> +       return __memcpy_a64fx;
> +      return __memcpy_sve;
> +    }
> +
> +  if (IS_THUNDERX (midr))
> +    return __memcpy_thunderx;
> +
> +  if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
> +    return __memcpy_thunderx2;
> +
> +  if (IS_FALKOR (midr) || IS_PHECDA (midr))
> +    return __memcpy_falkor;
> +
> +  return __memcpy_generic;
> +}
> +
> +libc_ifunc (__libc_memcpy, select_memcpy_ifunc ());
> +
>  # undef memcpy
>  strong_alias (__libc_memcpy, memcpy);
>  #endif
> diff --git a/sysdeps/aarch64/multiarch/memcpy_sve.S b/sysdeps/aarch64/multiarch/memcpy_sve.S
> new file mode 100644
> index 0000000000000000000000000000000000000000..a70907ec5595114e4da35c54e932279c22e53cc6
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memcpy_sve.S
> @@ -0,0 +1,218 @@
> +/* Optimized memcpy for SVE.
> +   Copyright (C) 2021-2022 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
> + *
> + */
> +
> +#define dstin  x0
> +#define src    x1
> +#define count  x2
> +#define dst    x3
> +#define srcend x4
> +#define dstend x5
> +#define tmp1   x6
> +#define vlen   x6
> +
> +#define A_q    q0
> +#define B_q    q1
> +#define C_q    q2
> +#define D_q    q3
> +#define E_q    q4
> +#define F_q    q5
> +#define G_q    q6
> +#define H_q    q7
> +
> +/* This implementation supports both memcpy and memmove and shares most code.
> +   It uses unaligned accesses and branchless sequences to keep the code small
> +   and simple while improving performance.
> +
> +   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
> +   copies of up to 128 bytes, and large copies.  The overhead of the overlap
> +   check in memmove is negligible since it is only required for large copies.
> +
> +   Large copies use a software pipelined loop processing 64 bytes per iteration.
> +   The source pointer is 16-byte aligned to minimize unaligned accesses.
> +   The loop tail is handled by always copying 64 bytes from the end.
> +*/
> +
> +#if HAVE_AARCH64_SVE_ASM
> +
> +       .arch armv8.2-a+sve
> +
> +ENTRY (__memcpy_sve)
> +       PTR_ARG (0)
> +       PTR_ARG (1)
> +       SIZE_ARG (2)
> +
> +       cmp     count, 128
> +       b.hi    L(copy_long)
> +       cmp     count, 32
> +       b.hi    L(copy32_128)
> +
> +       whilelo p0.b, xzr, count
> +       cntb    vlen
> +       tbnz    vlen, 4, L(vlen128)
> +       ld1b    z0.b, p0/z, [src]
> +       st1b    z0.b, p0, [dstin]
> +       ret
> +
> +       /* Medium copies: 33..128 bytes.  */
> +L(copy32_128):
> +       add     srcend, src, count
> +       add     dstend, dstin, count
> +       ldp     A_q, B_q, [src]
> +       ldp     C_q, D_q, [srcend, -32]
> +       cmp     count, 64
> +       b.hi    L(copy128)
> +       stp     A_q, B_q, [dstin]
> +       stp     C_q, D_q, [dstend, -32]
> +       ret
> +
> +       /* Copy 65..128 bytes.  */
> +L(copy128):
> +       ldp     E_q, F_q, [src, 32]
> +       cmp     count, 96
> +       b.ls    L(copy96)
> +       ldp     G_q, H_q, [srcend, -64]
> +       stp     G_q, H_q, [dstend, -64]
> +L(copy96):
> +       stp     A_q, B_q, [dstin]
> +       stp     E_q, F_q, [dstin, 32]
> +       stp     C_q, D_q, [dstend, -32]
> +       ret
> +
> +L(vlen128):
> +       whilelo p1.b, vlen, count
> +       ld1b    z0.b, p0/z, [src, 0, mul vl]
> +       ld1b    z1.b, p1/z, [src, 1, mul vl]
> +       st1b    z0.b, p0, [dstin, 0, mul vl]
> +       st1b    z1.b, p1, [dstin, 1, mul vl]
> +       ret
> +
> +       .p2align 4
> +       /* Copy more than 128 bytes.  */
> +L(copy_long):
> +       add     srcend, src, count
> +       add     dstend, dstin, count
> +
> +       /* Copy 16 bytes and then align src to 16-byte alignment.  */
> +       ldr     D_q, [src]
> +       and     tmp1, src, 15
> +       bic     src, src, 15
> +       sub     dst, dstin, tmp1
> +       add     count, count, tmp1      /* Count is now 16 too large.  */
> +       ldp     A_q, B_q, [src, 16]
> +       str     D_q, [dstin]
> +       ldp     C_q, D_q, [src, 48]
> +       subs    count, count, 128 + 16  /* Test and readjust count.  */
> +       b.ls    L(copy64_from_end)
> +L(loop64):
> +       stp     A_q, B_q, [dst, 16]
> +       ldp     A_q, B_q, [src, 80]
> +       stp     C_q, D_q, [dst, 48]
> +       ldp     C_q, D_q, [src, 112]
> +       add     src, src, 64
> +       add     dst, dst, 64
> +       subs    count, count, 64
> +       b.hi    L(loop64)
> +
> +       /* Write the last iteration and copy 64 bytes from the end.  */
> +L(copy64_from_end):
> +       ldp     E_q, F_q, [srcend, -64]
> +       stp     A_q, B_q, [dst, 16]
> +       ldp     A_q, B_q, [srcend, -32]
> +       stp     C_q, D_q, [dst, 48]
> +       stp     E_q, F_q, [dstend, -64]
> +       stp     A_q, B_q, [dstend, -32]
> +       ret
> +
> +END (__memcpy_sve)
> +libc_hidden_builtin_def (__memcpy_sve)
> +
> +
> +ENTRY (__memmove_sve)
> +       PTR_ARG (0)
> +       PTR_ARG (1)
> +       SIZE_ARG (2)
> +
> +       cmp     count, 128
> +       b.hi    L(move_long)
> +       cmp     count, 32
> +       b.hi    L(copy32_128)
> +
> +       whilelo p0.b, xzr, count
> +       cntb    vlen
> +       tbnz    vlen, 4, L(vlen128)
> +       ld1b    z0.b, p0/z, [src]
> +       st1b    z0.b, p0, [dstin]
> +       ret
> +
> +       .p2align 4
> +L(move_long):
> +       add     srcend, src, count
> +       add     dstend, dstin, count
> +       /* Only use backward copy if there is an overlap.  */
> +       sub     tmp1, dstin, src
> +       cbz     tmp1, L(return)
> +       cmp     tmp1, count
> +       b.hs    L(copy_long)
> +
> +       /* Large backwards copy for overlapping copies.
> +          Copy 16 bytes and then align srcend to 16-byte alignment.  */
> +       ldr     D_q, [srcend, -16]
> +       and     tmp1, srcend, 15
> +       bic     srcend, srcend, 15
> +       sub     count, count, tmp1
> +       ldp     A_q, B_q, [srcend, -32]
> +       str     D_q, [dstend, -16]
> +       ldp     C_q, D_q, [srcend, -64]
> +       sub     dstend, dstend, tmp1
> +       subs    count, count, 128
> +       b.ls    L(copy64_from_start)
> +
> +L(loop64_backwards):
> +       str     B_q, [dstend, -16]
> +       str     A_q, [dstend, -32]
> +       ldp     A_q, B_q, [srcend, -96]
> +       str     D_q, [dstend, -48]
> +       str     C_q, [dstend, -64]!
> +       ldp     C_q, D_q, [srcend, -128]
> +       sub     srcend, srcend, 64
> +       subs    count, count, 64
> +       b.hi    L(loop64_backwards)
> +
> +       /* Write the last iteration and copy 64 bytes from the start.  */
> +L(copy64_from_start):
> +       ldp     E_q, F_q, [src, 32]
> +       stp     A_q, B_q, [dstend, -32]
> +       ldp     A_q, B_q, [src]
> +       stp     C_q, D_q, [dstend, -64]
> +       stp     E_q, F_q, [dstin, 32]
> +       stp     A_q, B_q, [dstin]
> +L(return):
> +       ret
> +
> +END (__memmove_sve)
> +libc_hidden_builtin_def (__memmove_sve)
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
> index 4f7d7eedfd943d1b6a54e90dc973406b30cc8db9..261996ecc4b36bdaf3c50598d910e9c7dcace08b 100644
> --- a/sysdeps/aarch64/multiarch/memmove.c
> +++ b/sysdeps/aarch64/multiarch/memmove.c
> @@ -33,27 +33,38 @@ extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
> -# if HAVE_AARCH64_SVE_ASM
>  extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
> -# endif
> -
> -libc_ifunc (__libc_memmove,
> -            (IS_THUNDERX (midr)
> -            ? __memmove_thunderx
> -            : (IS_FALKOR (midr) || IS_PHECDA (midr)
> -               ? __memmove_falkor
> -               : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
> -                  ? __memmove_thunderx2
> -                  : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
> -                     || IS_NEOVERSE_V1 (midr)
> -                     ? __memmove_simd
> -# if HAVE_AARCH64_SVE_ASM
> -                    : (IS_A64FX (midr) && sve
> -                       ? __memmove_a64fx
> -                       : __memmove_generic))))));
> -# else
> -                    : __memmove_generic)))));
> -# endif
> +extern __typeof (__redirect_memmove) __memmove_sve attribute_hidden;
> +
> +static inline __typeof (__redirect_memmove) *
> +select_memmove_ifunc (void)
> +{
> +  INIT_ARCH ();
> +
> +  if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
> +    return __memmove_simd;
> +
> +  if (sve && HAVE_AARCH64_SVE_ASM)
> +    {
> +      if (IS_A64FX (midr))
> +       return __memmove_a64fx;
> +      return __memmove_sve;
> +    }
> +
> +  if (IS_THUNDERX (midr))
> +    return __memmove_thunderx;
> +
> +  if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
> +    return __memmove_thunderx2;
> +
> +  if (IS_FALKOR (midr) || IS_PHECDA (midr))
> +    return __memmove_falkor;
> +
> +  return __memmove_generic;
> +}
> +
> +libc_ifunc (__libc_memmove, select_memmove_ifunc ());
> +
>  # undef memmove
>  strong_alias (__libc_memmove, memmove);
>  #endif
> 
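
A closing note for anyone reading the L(copy_long)/L(copy64_from_end)
structure above: the "always copy the last 64 bytes from the end" tail
trick is easier to see in C.  A rough sketch of just that idea (mine,
not the patch; it assumes non-overlapping buffers and n >= 64):

#include <stdint.h>
#include <string.h>

static void
copy_ge64 (uint8_t *dst, const uint8_t *src, size_t n)
{
  size_t i = 0;
  /* Copy full 64-byte blocks while more than 64 bytes remain.  */
  for (; n - i > 64; i += 64)
    memcpy (dst + i, src + i, 64);
  /* Tail: copy the final 64 bytes from the end; this may partly overlap
     bytes the loop already wrote, which is harmless when src and dst do
     not overlap.  */
  memcpy (dst + n - 64, src + n - 64, 64);
}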

Thread overview: 4+ messages in thread
2022-05-25 16:04 Wilco Dijkstra
2022-05-30 12:08 ` Szabolcs Nagy
2022-06-01 16:03   ` Wilco Dijkstra
2022-06-07 13:55     ` Szabolcs Nagy [this message]
