Attached is a set of benchmarks of the new code versus the existing
memcpy implementation on a Cortex-A15 platform.

On 15 April 2013 10:56, Will Newton wrote:
>
> Add a high performance memcpy routine optimized for Cortex-A15 with
> variants for use in the presence of NEON and VFP hardware, selected
> at runtime using indirect function support.
>
> This was tested on armv7l-unknown-linux-gnueabihf. One new testsuite
> failure was introduced (elf/ifuncmain5picstatic) which was caused by
> a bug in ld. A fix for that ld issue has been submitted here:
>
> http://sourceware.org/ml/binutils/2013-04/msg00143.html
>
> ports/ChangeLog.arm:
>
> 2013-04-15  Will Newton
>
>         * sysdeps/arm/armv7/multiarch/Makefile: New file.
>         * sysdeps/arm/armv7/multiarch/ifunc-impl-list.c: Likewise.
>         * sysdeps/arm/armv7/multiarch/memcpy.S: Likewise.
>         * sysdeps/arm/armv7/multiarch/memcpy_impl.S: Likewise.
>         * sysdeps/arm/armv7/multiarch/memcpy_neon.S: Likewise.
>         * sysdeps/arm/armv7/multiarch/memcpy_vfp.S: Likewise.
>
> Signed-off-by: Will Newton
> ---
>  ports/sysdeps/arm/armv7/multiarch/Makefile         |   3 +
>  .../sysdeps/arm/armv7/multiarch/ifunc-impl-list.c  |  46 ++
>  ports/sysdeps/arm/armv7/multiarch/memcpy.S         |  96 ++++
>  ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S    | 600 +++++++++++++++++++++
>  ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S    |   3 +
>  ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S     |   3 +
>  6 files changed, 751 insertions(+)
>  create mode 100644 ports/sysdeps/arm/armv7/multiarch/Makefile
>  create mode 100644 ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
>  create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy.S
>  create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
>  create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S
>  create mode 100644 ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
>
> diff --git a/ports/sysdeps/arm/armv7/multiarch/Makefile b/ports/sysdeps/arm/armv7/multiarch/Makefile
> new file mode 100644
> index 0000000..e834cc9
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/Makefile
> @@ -0,0 +1,3 @@
> +ifeq ($(subdir),string)
> +sysdep_routines += memcpy_neon memcpy_vfp
> +endif
> diff --git a/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
> new file mode 100644
> index 0000000..176288b
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
> @@ -0,0 +1,46 @@
> +/* Enumerate available IFUNC implementations of a function.  arm version.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include
> +#include
> +#include
> +#include
> +#include
> +#include
> +
> +/* Fill ARRAY of MAX elements with IFUNC implementations for function
> +   NAME and return the number of valid entries.  */
> +
> +size_t
> +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> +                        size_t max)
> +{
> +  size_t i = 0;
> +  int hwcap;
> +
> +  hwcap = GLRO(dl_hwcap);
> +
> +  IFUNC_IMPL (i, name, memcpy,
> +              IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_ARM_NEON,
> +                              __memcpy_neon)
> +              IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_ARM_VFPv3,
> +                              __memcpy_vfp)
> +              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
> +
> +  return i;
> +}
> diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy.S b/ports/sysdeps/arm/armv7/multiarch/memcpy.S
> new file mode 100644
> index 0000000..a9e2faf
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/memcpy.S
> @@ -0,0 +1,96 @@
> +/* Multiple versions of memcpy
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include
> +#include
> +#include
> +
> +#if !defined NOT_IN_libc
> +        .text
> +ENTRY(memcpy)
> +        .type memcpy, %gnu_indirect_function
> +# ifdef PIC
> +        ldr a3, 1f
> +0:      add a3, pc, a3
> +# endif
> +
> +        tst a1, #HWCAP_ARM_NEON
> +        beq .Lno_neon
> +# ifdef PIC
> +        ldr a4, .Lmemcpy_neon
> +        ldr r0, [a3, a4]
> +# else
> +        ldr r0, .Lmemcpy_neon
> +# endif
> +        b .Lreturn
> +.Lno_neon:
> +
> +        tst a1, #HWCAP_ARM_VFP
> +        beq .Lno_vfp
> +# ifdef PIC
> +        ldr a4, .Lmemcpy_vfp
> +        ldr r0, [a3, a4]
> +# else
> +        ldr r0, .Lmemcpy_vfp
> +# endif
> +        b .Lreturn
> +.Lno_vfp:
> +# ifdef PIC
> +        ldr a4, .Lmemcpy_arm
> +        ldr r0, [a3, a4]
> +# else
> +        ldr r0, .Lmemcpy_arm
> +# endif
> +
> +.Lreturn:
> +        DO_RET(lr)
> +
> +# ifdef PIC
> +1:      .long _GLOBAL_OFFSET_TABLE_ - 0b - PC_OFS
> +.Lmemcpy_neon:
> +        .long C_SYMBOL_NAME(__memcpy_neon)(GOT)
> +.Lmemcpy_vfp:
> +        .long C_SYMBOL_NAME(__memcpy_vfp)(GOT)
> +.Lmemcpy_arm:
> +        .long C_SYMBOL_NAME(__memcpy_arm)(GOT)
> +# else
> +.Lmemcpy_neon:
> +        .long C_SYMBOL_NAME(__memcpy_neon)
> +.Lmemcpy_vfp:
> +        .long C_SYMBOL_NAME(__memcpy_vfp)
> +.Lmemcpy_arm:
> +        .long C_SYMBOL_NAME(__memcpy_arm)
> +# endif
> +
> +END(memcpy)
> +
> +libc_hidden_builtin_def (memcpy)
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +#undef weak_alias
> +#define weak_alias(x, y)
> +#undef libc_hidden_def
> +#define libc_hidden_def(name)
> +
> +#define memcpy __memcpy_arm
> +
> +#endif
> +
> +#include "memcpy_impl.S"
> diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
> new file mode 100644
> index 0000000..2c466d25
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
> @@ -0,0 +1,600 @@
> +/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.
> +
> +   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
> +   of VFP or NEON when built with the appropriate flags.
> +
> +   Assumptions:
> +
> +    ARMv6 (ARMv7-a if using Neon)
> +    ARM state
> +    Unaligned accesses
> +    LDRD/STRD support unaligned word accesses
> +
> + */
> +
> +#include
> +
> +        .syntax unified
> +        /* This implementation requires ARM state.  */
> +        .arm
> +
> +#ifdef MEMCPY_NEON
> +
> +        .fpu    neon
> +        .arch   armv7-a
> +# define FRAME_SIZE 4
> +# define USE_VFP
> +# define USE_NEON
> +
> +#elif defined (MEMCPY_VFP)
> +
> +        .arch   armv6
> +        .fpu    vfpv2
> +# define FRAME_SIZE 32
> +# define USE_VFP
> +
> +#else
> +        .arch   armv6
> +# define FRAME_SIZE 32
> +
> +#endif
> +
> +/* Old versions of GAS incorrectly implement the NEON align semantics.  */
> +#ifdef BROKEN_ASM_NEON_ALIGN
> +#define ALIGN(addr, align) addr,:align
> +#else
> +#define ALIGN(addr, align) addr:align
> +#endif
> +
> +#define PC_OFFSET 8     /* PC pipeline compensation.  */
> +#define INSN_SIZE 4
> +
> +/* Call parameters.  */
> +#define dstin r0
> +#define src r1
> +#define count r2
> +
> +/* Locals.  */
> +#define tmp1 r3
> +#define dst ip
> +#define tmp2 r10
> +
> +#ifndef USE_NEON
> +/* For bulk copies using GP registers.  */
> +#define A_l r2  /* Call-clobbered.  */
> +#define A_h r3  /* Call-clobbered.  */
> +#define B_l r4
> +#define B_h r5
> +#define C_l r6
> +#define C_h r7
> +#define D_l r8
> +#define D_h r9
> +#endif
> +
> +/* Number of lines ahead to pre-fetch data.  If you change this the code
> +   below will need adjustment to compensate.  */
> +
> +#define prefetch_lines 5
> +
> +#ifdef USE_VFP
> +        .macro cpy_line_vfp vreg, base
> +        vstr    \vreg, [dst, #\base]
> +        vldr    \vreg, [src, #\base]
> +        vstr    d0, [dst, #\base + 8]
> +        vldr    d0, [src, #\base + 8]
> +        vstr    d1, [dst, #\base + 16]
> +        vldr    d1, [src, #\base + 16]
> +        vstr    d2, [dst, #\base + 24]
> +        vldr    d2, [src, #\base + 24]
> +        vstr    \vreg, [dst, #\base + 32]
> +        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
> +        vstr    d0, [dst, #\base + 40]
> +        vldr    d0, [src, #\base + 40]
> +        vstr    d1, [dst, #\base + 48]
> +        vldr    d1, [src, #\base + 48]
> +        vstr    d2, [dst, #\base + 56]
> +        vldr    d2, [src, #\base + 56]
> +        .endm
> +
> +        .macro cpy_tail_vfp vreg, base
> +        vstr    \vreg, [dst, #\base]
> +        vldr    \vreg, [src, #\base]
> +        vstr    d0, [dst, #\base + 8]
> +        vldr    d0, [src, #\base + 8]
> +        vstr    d1, [dst, #\base + 16]
> +        vldr    d1, [src, #\base + 16]
> +        vstr    d2, [dst, #\base + 24]
> +        vldr    d2, [src, #\base + 24]
> +        vstr    \vreg, [dst, #\base + 32]
> +        vstr    d0, [dst, #\base + 40]
> +        vldr    d0, [src, #\base + 40]
> +        vstr    d1, [dst, #\base + 48]
> +        vldr    d1, [src, #\base + 48]
> +        vstr    d2, [dst, #\base + 56]
> +        vldr    d2, [src, #\base + 56]
> +        .endm
> +#endif
> +
> +        .p2align 6
> +ENTRY(memcpy)
> +
> +        mov     dst, dstin      /* Preserve dstin, we need to return it.  */
> +        cmp     count, #64
> +        bge     .Lcpy_not_short
> +        /* Deal with small copies quickly by dropping straight into the
> +           exit block.  */
> +
> +.Ltail63unaligned:
> +#ifdef USE_NEON
> +        and     tmp1, count, #0x38
> +        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
> +        add     pc, pc, tmp1
> +        vld1.8  {d0}, [src]!    /* 14 words to go.  */
> +        vst1.8  {d0}, [dst]!
> +        vld1.8  {d0}, [src]!    /* 12 words to go.  */
> +        vst1.8  {d0}, [dst]!
> +        vld1.8  {d0}, [src]!    /* 10 words to go.  */
> +        vst1.8  {d0}, [dst]!
> +        vld1.8  {d0}, [src]!    /* 8 words to go.  */
> +        vst1.8  {d0}, [dst]!
> +        vld1.8  {d0}, [src]!    /* 6 words to go.  */
> +        vst1.8  {d0}, [dst]!
> +        vld1.8  {d0}, [src]!    /* 4 words to go.  */
> +        vst1.8  {d0}, [dst]!
> +        vld1.8  {d0}, [src]!    /* 2 words to go.  */
> +        vst1.8  {d0}, [dst]!
> +
> +        tst     count, #4
> +        ldrne   tmp1, [src], #4
> +        strne   tmp1, [dst], #4
> +#else
> +        /* Copy up to 15 full words of data.  May not be aligned.  */
> +        /* Cannot use VFP for unaligned data.  */
> +        and     tmp1, count, #0x3c
> +        add     dst, dst, tmp1
> +        add     src, src, tmp1
> +        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
> +        /* Jump directly into the sequence below at the correct offset.  */
> +        add     pc, pc, tmp1, lsl #1
> +
> +        ldr     tmp1, [src, #-60]       /* 15 words to go.  */
> +        str     tmp1, [dst, #-60]
> +
> +        ldr     tmp1, [src, #-56]       /* 14 words to go.  */
> +        str     tmp1, [dst, #-56]
> +        ldr     tmp1, [src, #-52]
> +        str     tmp1, [dst, #-52]
> +
> +        ldr     tmp1, [src, #-48]       /* 12 words to go.  */
> +        str     tmp1, [dst, #-48]
> +        ldr     tmp1, [src, #-44]
> +        str     tmp1, [dst, #-44]
> +
> +        ldr     tmp1, [src, #-40]       /* 10 words to go.  */
> +        str     tmp1, [dst, #-40]
> +        ldr     tmp1, [src, #-36]
> +        str     tmp1, [dst, #-36]
> +
> +        ldr     tmp1, [src, #-32]       /* 8 words to go.  */
> +        str     tmp1, [dst, #-32]
> +        ldr     tmp1, [src, #-28]
> +        str     tmp1, [dst, #-28]
> +
> +        ldr     tmp1, [src, #-24]       /* 6 words to go.  */
> +        str     tmp1, [dst, #-24]
> +        ldr     tmp1, [src, #-20]
> +        str     tmp1, [dst, #-20]
> +
> +        ldr     tmp1, [src, #-16]       /* 4 words to go.  */
> +        str     tmp1, [dst, #-16]
> +        ldr     tmp1, [src, #-12]
> +        str     tmp1, [dst, #-12]
> +
> +        ldr     tmp1, [src, #-8]        /* 2 words to go.  */
> +        str     tmp1, [dst, #-8]
> +        ldr     tmp1, [src, #-4]
> +        str     tmp1, [dst, #-4]
> +#endif
> +
> +        lsls    count, count, #31
> +        ldrhcs  tmp1, [src], #2
> +        ldrbne  src, [src]      /* Src is dead, use as a scratch.  */
> +        strhcs  tmp1, [dst], #2
> +        strbne  src, [dst]
> +        bx      lr
> +
> +.Lcpy_not_short:
> +        /* At least 64 bytes to copy, but don't know the alignment yet.  */
> +        str     tmp2, [sp, #-FRAME_SIZE]!
> +        and     tmp2, src, #3
> +        and     tmp1, dst, #3
> +        cmp     tmp1, tmp2
> +        bne     .Lcpy_notaligned
> +
> +#ifdef USE_VFP
> +        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
> +           that the FP pipeline is much better at streaming loads and
> +           stores.  This is outside the critical loop.  */
> +        vmov.f32        s0, s0
> +#endif
> +
> +        /* SRC and DST have the same mutual 32-bit alignment, but we may
> +           still need to pre-copy some bytes to get to natural alignment.
> +           We bring DST into full 64-bit alignment.  */
> +        lsls    tmp2, dst, #29
> +        beq     1f
> +        rsbs    tmp2, tmp2, #0
> +        sub     count, count, tmp2, lsr #29
> +        ldrmi   tmp1, [src], #4
> +        strmi   tmp1, [dst], #4
> +        lsls    tmp2, tmp2, #2
> +        ldrhcs  tmp1, [src], #2
> +        ldrbne  tmp2, [src], #1
> +        strhcs  tmp1, [dst], #2
> +        strbne  tmp2, [dst], #1
> +
> +1:
> +        subs    tmp2, count, #64        /* Use tmp2 for count.  */
> +        blt     .Ltail63aligned
> +
> +        cmp     tmp2, #512
> +        bge     .Lcpy_body_long
> +
> +.Lcpy_body_medium:      /* Count in tmp2.  */
> +#ifdef USE_VFP
> +1:
> +        vldr    d0, [src, #0]
> +        subs    tmp2, tmp2, #64
> +        vldr    d1, [src, #8]
> +        vstr    d0, [dst, #0]
> +        vldr    d0, [src, #16]
> +        vstr    d1, [dst, #8]
> +        vldr    d1, [src, #24]
> +        vstr    d0, [dst, #16]
> +        vldr    d0, [src, #32]
> +        vstr    d1, [dst, #24]
> +        vldr    d1, [src, #40]
> +        vstr    d0, [dst, #32]
> +        vldr    d0, [src, #48]
> +        vstr    d1, [dst, #40]
> +        vldr    d1, [src, #56]
> +        vstr    d0, [dst, #48]
> +        add     src, src, #64
> +        vstr    d1, [dst, #56]
> +        add     dst, dst, #64
> +        bge     1b
> +        tst     tmp2, #0x3f
> +        beq     .Ldone
> +
> +.Ltail63aligned:        /* Count in tmp2.  */
> +        and     tmp1, tmp2, #0x38
> +        add     dst, dst, tmp1
> +        add     src, src, tmp1
> +        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
> +        add     pc, pc, tmp1
> +
> +        vldr    d0, [src, #-56] /* 14 words to go.  */
> +        vstr    d0, [dst, #-56]
> +        vldr    d0, [src, #-48] /* 12 words to go.  */
> +        vstr    d0, [dst, #-48]
> +        vldr    d0, [src, #-40] /* 10 words to go.  */
> +        vstr    d0, [dst, #-40]
> +        vldr    d0, [src, #-32] /* 8 words to go.  */
> +        vstr    d0, [dst, #-32]
> +        vldr    d0, [src, #-24] /* 6 words to go.  */
> +        vstr    d0, [dst, #-24]
> +        vldr    d0, [src, #-16] /* 4 words to go.  */
> +        vstr    d0, [dst, #-16]
> +        vldr    d0, [src, #-8]  /* 2 words to go.  */
> +        vstr    d0, [dst, #-8]
> +#else
> +        sub     src, src, #8
> +        sub     dst, dst, #8
> +1:
> +        ldrd    A_l, A_h, [src, #8]
> +        strd    A_l, A_h, [dst, #8]
> +        ldrd    A_l, A_h, [src, #16]
> +        strd    A_l, A_h, [dst, #16]
> +        ldrd    A_l, A_h, [src, #24]
> +        strd    A_l, A_h, [dst, #24]
> +        ldrd    A_l, A_h, [src, #32]
> +        strd    A_l, A_h, [dst, #32]
> +        ldrd    A_l, A_h, [src, #40]
> +        strd    A_l, A_h, [dst, #40]
> +        ldrd    A_l, A_h, [src, #48]
> +        strd    A_l, A_h, [dst, #48]
> +        ldrd    A_l, A_h, [src, #56]
> +        strd    A_l, A_h, [dst, #56]
> +        ldrd    A_l, A_h, [src, #64]!
> +        strd    A_l, A_h, [dst, #64]!
> +        subs    tmp2, tmp2, #64
> +        bge     1b
> +        tst     tmp2, #0x3f
> +        bne     1f
> +        ldr     tmp2, [sp], #FRAME_SIZE
> +        bx      lr
> +1:
> +        add     src, src, #8
> +        add     dst, dst, #8
> +
> +.Ltail63aligned:        /* Count in tmp2.  */
> +        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
> +           we know that the src and dest are 32-bit aligned so we can use
> +           LDRD/STRD to improve efficiency.  */
> +        /* TMP2 is now negative, but we don't care about that.  The bottom
> +           six bits still tell us how many bytes are left to copy.  */
> +
> +        and     tmp1, tmp2, #0x38
> +        add     dst, dst, tmp1
> +        add     src, src, tmp1
> +        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
> +        add     pc, pc, tmp1
> +        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go.  */
> +        strd    A_l, A_h, [dst, #-56]
> +        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go.  */
> +        strd    A_l, A_h, [dst, #-48]
> +        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go.  */
> +        strd    A_l, A_h, [dst, #-40]
> +        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go.  */
> +        strd    A_l, A_h, [dst, #-32]
> +        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go.  */
> +        strd    A_l, A_h, [dst, #-24]
> +        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go.  */
> +        strd    A_l, A_h, [dst, #-16]
> +        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go.  */
> +        strd    A_l, A_h, [dst, #-8]
> +
> +#endif
> +        tst     tmp2, #4
> +        ldrne   tmp1, [src], #4
> +        strne   tmp1, [dst], #4
> +        lsls    tmp2, tmp2, #31 /* Count (tmp2) now dead.  */
> +        ldrhcs  tmp1, [src], #2
> +        ldrbne  tmp2, [src]
> +        strhcs  tmp1, [dst], #2
> +        strbne  tmp2, [dst]
> +
> +.Ldone:
> +        ldr     tmp2, [sp], #FRAME_SIZE
> +        bx      lr
> +
> +.Lcpy_body_long:        /* Count in tmp2.  */
> +
> +        /* Long copy.  We know that there's at least (prefetch_lines * 64)
> +           bytes to go.  */
> +#ifdef USE_VFP
> +        /* Don't use PLD.  Instead, read some data in advance of the current
> +           copy position into a register.  This should act like a PLD
> +           operation but we won't have to repeat the transfer.  */
> +
> +        vldr    d3, [src, #0]
> +        vldr    d4, [src, #64]
> +        vldr    d5, [src, #128]
> +        vldr    d6, [src, #192]
> +        vldr    d7, [src, #256]
> +
> +        vldr    d0, [src, #8]
> +        vldr    d1, [src, #16]
> +        vldr    d2, [src, #24]
> +        add     src, src, #32
> +
> +        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
> +        blt     2f
> +1:
> +        cpy_line_vfp    d3, 0
> +        cpy_line_vfp    d4, 64
> +        cpy_line_vfp    d5, 128
> +        add     dst, dst, #3 * 64
> +        add     src, src, #3 * 64
> +        cpy_line_vfp    d6, 0
> +        cpy_line_vfp    d7, 64
> +        add     dst, dst, #2 * 64
> +        add     src, src, #2 * 64
> +        subs    tmp2, tmp2, #prefetch_lines * 64
> +        bge     1b
> +
> +2:
> +        cpy_tail_vfp    d3, 0
> +        cpy_tail_vfp    d4, 64
> +        cpy_tail_vfp    d5, 128
> +        add     src, src, #3 * 64
> +        add     dst, dst, #3 * 64
> +        cpy_tail_vfp    d6, 0
> +        vstr    d7, [dst, #64]
> +        vldr    d7, [src, #64]
> +        vstr    d0, [dst, #64 + 8]
> +        vldr    d0, [src, #64 + 8]
> +        vstr    d1, [dst, #64 + 16]
> +        vldr    d1, [src, #64 + 16]
> +        vstr    d2, [dst, #64 + 24]
> +        vldr    d2, [src, #64 + 24]
> +        vstr    d7, [dst, #64 + 32]
> +        add     src, src, #96
> +        vstr    d0, [dst, #64 + 40]
> +        vstr    d1, [dst, #64 + 48]
> +        vstr    d2, [dst, #64 + 56]
> +        add     dst, dst, #128
> +        add     tmp2, tmp2, #prefetch_lines * 64
> +        b       .Lcpy_body_medium
> +#else
> +        /* Long copy.  Use an SMS style loop to maximize the I/O
> +           bandwidth of the core.  We don't have enough spare registers
> +           to synthesise prefetching, so use PLD operations.  */
> +        /* Pre-bias src and dst.  */
> +        sub     src, src, #8
> +        sub     dst, dst, #8
> +        pld     [src, #8]
> +        pld     [src, #72]
> +        subs    tmp2, tmp2, #64
> +        pld     [src, #136]
> +        ldrd    A_l, A_h, [src, #8]
> +        strd    B_l, B_h, [sp, #8]
> +        ldrd    B_l, B_h, [src, #16]
> +        strd    C_l, C_h, [sp, #16]
> +        ldrd    C_l, C_h, [src, #24]
> +        strd    D_l, D_h, [sp, #24]
> +        pld     [src, #200]
> +        ldrd    D_l, D_h, [src, #32]!
> +        b       1f
> +        .p2align 6
> +2:
> +        pld     [src, #232]
> +        strd    A_l, A_h, [dst, #40]
> +        ldrd    A_l, A_h, [src, #40]
> +        strd    B_l, B_h, [dst, #48]
> +        ldrd    B_l, B_h, [src, #48]
> +        strd    C_l, C_h, [dst, #56]
> +        ldrd    C_l, C_h, [src, #56]
> +        strd    D_l, D_h, [dst, #64]!
> +        ldrd    D_l, D_h, [src, #64]!
> +        subs    tmp2, tmp2, #64
> +1:
> +        strd    A_l, A_h, [dst, #8]
> +        ldrd    A_l, A_h, [src, #8]
> +        strd    B_l, B_h, [dst, #16]
> +        ldrd    B_l, B_h, [src, #16]
> +        strd    C_l, C_h, [dst, #24]
> +        ldrd    C_l, C_h, [src, #24]
> +        strd    D_l, D_h, [dst, #32]
> +        ldrd    D_l, D_h, [src, #32]
> +        bcs     2b
> +        /* Save the remaining bytes and restore the callee-saved regs.  */
> +        strd    A_l, A_h, [dst, #40]
> +        add     src, src, #40
> +        strd    B_l, B_h, [dst, #48]
> +        ldrd    B_l, B_h, [sp, #8]
> +        strd    C_l, C_h, [dst, #56]
> +        ldrd    C_l, C_h, [sp, #16]
> +        strd    D_l, D_h, [dst, #64]
> +        ldrd    D_l, D_h, [sp, #24]
> +        add     dst, dst, #72
> +        tst     tmp2, #0x3f
> +        bne     .Ltail63aligned
> +        ldr     tmp2, [sp], #FRAME_SIZE
> +        bx      lr
> +#endif
> +
> +.Lcpy_notaligned:
> +        pld     [src]
> +        pld     [src, #64]
> +        /* There's at least 64 bytes to copy, but there is no mutual
> +           alignment.  */
> +        /* Bring DST to 64-bit alignment.  */
> +        lsls    tmp2, dst, #29
> +        pld     [src, #(2 * 64)]
> +        beq     1f
> +        rsbs    tmp2, tmp2, #0
> +        sub     count, count, tmp2, lsr #29
> +        ldrmi   tmp1, [src], #4
> +        strmi   tmp1, [dst], #4
> +        lsls    tmp2, tmp2, #2
> +        ldrbne  tmp1, [src], #1
> +        ldrhcs  tmp2, [src], #2
> +        strbne  tmp1, [dst], #1
> +        strhcs  tmp2, [dst], #2
> +1:
> +        pld     [src, #(3 * 64)]
> +        subs    count, count, #64
> +        ldrmi   tmp2, [sp], #FRAME_SIZE
> +        bmi     .Ltail63unaligned
> +        pld     [src, #(4 * 64)]
> +
> +#ifdef USE_NEON
> +        vld1.8  {d0-d3}, [src]!
> +        vld1.8  {d4-d7}, [src]!
> +        subs    count, count, #64
> +        bmi     2f
> +1:
> +        pld     [src, #(4 * 64)]
> +        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
> +        vld1.8  {d0-d3}, [src]!
> +        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
> +        vld1.8  {d4-d7}, [src]!
> +        subs    count, count, #64
> +        bpl     1b
> +2:
> +        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
> +        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
> +        ands    count, count, #0x3f
> +#else
> +        /* Use an SMS style loop to maximize the I/O bandwidth.  */
> +        sub     src, src, #4
> +        sub     dst, dst, #8
> +        subs    tmp2, count, #64        /* Use tmp2 for count.  */
> +        ldr     A_l, [src, #4]
> +        ldr     A_h, [src, #8]
> +        strd    B_l, B_h, [sp, #8]
> +        ldr     B_l, [src, #12]
> +        ldr     B_h, [src, #16]
> +        strd    C_l, C_h, [sp, #16]
> +        ldr     C_l, [src, #20]
> +        ldr     C_h, [src, #24]
> +        strd    D_l, D_h, [sp, #24]
> +        ldr     D_l, [src, #28]
> +        ldr     D_h, [src, #32]!
> +        b       1f
> +        .p2align 6
> +2:
> +        pld     [src, #(5 * 64) - (32 - 4)]
> +        strd    A_l, A_h, [dst, #40]
> +        ldr     A_l, [src, #36]
> +        ldr     A_h, [src, #40]
> +        strd    B_l, B_h, [dst, #48]
> +        ldr     B_l, [src, #44]
> +        ldr     B_h, [src, #48]
> +        strd    C_l, C_h, [dst, #56]
> +        ldr     C_l, [src, #52]
> +        ldr     C_h, [src, #56]
> +        strd    D_l, D_h, [dst, #64]!
> +        ldr     D_l, [src, #60]
> +        ldr     D_h, [src, #64]!
> +        subs    tmp2, tmp2, #64
> +1:
> +        strd    A_l, A_h, [dst, #8]
> +        ldr     A_l, [src, #4]
> +        ldr     A_h, [src, #8]
> +        strd    B_l, B_h, [dst, #16]
> +        ldr     B_l, [src, #12]
> +        ldr     B_h, [src, #16]
> +        strd    C_l, C_h, [dst, #24]
> +        ldr     C_l, [src, #20]
> +        ldr     C_h, [src, #24]
> +        strd    D_l, D_h, [dst, #32]
> +        ldr     D_l, [src, #28]
> +        ldr     D_h, [src, #32]
> +        bcs     2b
> +
> +        /* Save the remaining bytes and restore the callee-saved regs.  */
> +        strd    A_l, A_h, [dst, #40]
> +        add     src, src, #36
> +        strd    B_l, B_h, [dst, #48]
> +        ldrd    B_l, B_h, [sp, #8]
> +        strd    C_l, C_h, [dst, #56]
> +        ldrd    C_l, C_h, [sp, #16]
> +        strd    D_l, D_h, [dst, #64]
> +        ldrd    D_l, D_h, [sp, #24]
> +        add     dst, dst, #72
> +        ands    count, tmp2, #0x3f
> +#endif
> +        ldr     tmp2, [sp], #FRAME_SIZE
> +        bne     .Ltail63unaligned
> +        bx      lr
> +
> +END(memcpy)
> +libc_hidden_builtin_def (memcpy)
> diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S
> new file mode 100644
> index 0000000..c0ef1f8
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_neon.S
> @@ -0,0 +1,3 @@
> +#define MEMCPY_NEON
> +#define memcpy __memcpy_neon
> +#include "memcpy_impl.S"
> diff --git a/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S b/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
> new file mode 100644
> index 0000000..d21b702
> --- /dev/null
> +++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
> @@ -0,0 +1,3 @@
> +#define MEMCPY_VFP
> +#define memcpy __memcpy_vfp
> +#include "memcpy_impl.S"
> --
> 1.8.1.4
>

--
Will Newton
Toolchain Working Group, Linaro
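
P.S. For readers unfamiliar with the indirect function mechanism the
patch relies on: the hand-written resolver in memcpy.S is, in spirit,
the assembly equivalent of the C sketch below. This is an illustrative
sketch only, not part of the patch. The resolver name memcpy_resolver
is hypothetical, the HWCAP_ARM_* values are the ARM AT_HWCAP bits
defined by the Linux kernel, and GCC's ifunc attribute stands in for
the explicit %gnu_indirect_function symbol type used in the assembly
version.

/* Sketch: a C rendering of the IFUNC resolver that memcpy.S implements
   in assembly.  On ARM, the dynamic linker invokes the resolver once at
   relocation time, passing the AT_HWCAP bits (the assembly version
   receives them in a1), and binds memcpy to the returned pointer.  */
#include <stddef.h>

extern void *__memcpy_neon (void *, const void *, size_t);
extern void *__memcpy_vfp (void *, const void *, size_t);
extern void *__memcpy_arm (void *, const void *, size_t);

/* ARM AT_HWCAP bits as defined by the Linux kernel.  */
#define HWCAP_ARM_VFP  (1 << 6)
#define HWCAP_ARM_NEON (1 << 12)

void *
memcpy_resolver (unsigned long int hwcap)
{
  if (hwcap & HWCAP_ARM_NEON)
    return (void *) __memcpy_neon;  /* NEON on ARMv7 implies VFP too.  */
  if (hwcap & HWCAP_ARM_VFP)
    return (void *) __memcpy_vfp;
  return (void *) __memcpy_arm;     /* Integer-register fallback.  */
}

/* The declaration below makes GCC emit an irelative relocation for
   memcpy, so the resolver runs before the first call.  */
void *memcpy (void *, const void *, size_t)
     __attribute__ ((ifunc ("memcpy_resolver")));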