From: Raphael M Zinsly <rzinsly@linux.ibm.com>
To: "Lucas A. M. Magalhaes" <lamm@linux.ibm.com>, libc-alpha@sourceware.org
Subject: Re: [PATCH] powerpc: Optimized memcmp for power10
Date: Fri, 28 May 2021 15:37:49 -0300 [thread overview]
Message-ID: <0598966a-28f1-4e68-0158-5576bfd82805@linux.ibm.com> (raw)
In-Reply-To: <20210524170732.1739829-1-lamm@linux.ibm.com>
Hi Lucas,
The code LGTM with some formatting fixes.
Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>
On 24/05/2021 14:07, Lucas A. M. Magalhaes via Libc-alpha wrote:
> This patch was based on the __memcmp_power8 and the recent
> __strlen_power10.
>
> Improvements from __memcmp_power8:
>
> 1. Don't need alignment code.
>
> On POWER10 lxvp and lxvl do not generate alignment interrupts, so
> they are safe for use on caching-inhibited memory. Notice that the
> comparison on the main loop will wait for both VSR to be ready.
> Therefore aligning one of the input address does not improve
> performance. In order to align both registers a vperm is necessary
> which add too much overhead.
>
> 2. Uses new POWER10 instructions
>
> This code uses lxvp to decrease contention on load by loading 32 bytes
> per instruction.
> The vextractbm is used to have a smaller tail code for calculating the
> return value.
>
> 3. Performance improvement
>
> This version has around 35% better performance on average. I saw no
> performance regressions for any length or alignment.
>
> Thanks Matheus for helping me out with some details.
>
> Co-authored-by: Matheus Castanho <msc@linux.ibm.com>
> ---
> sysdeps/powerpc/powerpc64/le/power10/memcmp.S | 181 ++++++++++++++++++
> sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
> .../powerpc64/multiarch/ifunc-impl-list.c | 6 +
> .../powerpc64/multiarch/memcmp-power10.S | 26 +++
> sysdeps/powerpc/powerpc64/multiarch/memcmp.c | 6 +
> 5 files changed, 220 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/powerpc/powerpc64/le/power10/memcmp.S
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S
>
> diff --git a/sysdeps/powerpc/powerpc64/le/power10/memcmp.S b/sysdeps/powerpc/powerpc64/le/power10/memcmp.S
> new file mode 100644
> index 0000000000..0285ba32bc
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power10/memcmp.S
> @@ -0,0 +1,181 @@
> +/* Optimized memcmp implementation for POWER10.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +/* TODO: Replace macros by the actual instructions when minimum binutils becomes
> + >= 2.35. This is used to keep compatibility with older versions. */
> +#define VEXTRACTBM(rt,vrb) \
> + .long(((4)<<(32-6)) \
> + | ((rt)<<(32-11)) \
> + | ((8)<<(32-16)) \
> + | ((vrb)<<(32-21)) \
> + | 1602)
> +
> +#define LXVP(xtp,dq,ra) \
> + .long(((6)<<(32-6)) \
> + | ((((xtp)-32)>>1)<<(32-10)) \
> + | ((1)<<(32-11)) \
> + | ((ra)<<(32-16)) \
> + | dq)
> +
> +/* Compare 32 bytes. */
> +#define COMPARE_32(vr1,vr2,offset,tail_1,tail_2)\
> + LXVP(32+vr1,offset,r3); \
> + LXVP(32+vr2,offset,r4); \
> + vcmpneb. v5,vr1+1,vr2+1; \
> + bne cr6,L(tail_2); \
The cr6 is unaligned with the previous line, also the backslashes are
unaligned in the first lines.
There are similar issues in many other parts of the code.
> + vcmpneb. v4,vr1,vr2; \
> + bne cr6,L(tail_1); \
> +
> +/* Get the first different bytes of each vector register. Then subtract them and
> + return the result on r3. */
> +#define TAIL(v_res,s1,s2) \
> + vctzlsbb r7,v_res; \
> + vextubrx r8,r7,s1; \
> + vextubrx r9,r7,s2; \
> + subf r3,r9,r8; \
> + blr; \
> +
> +/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4],
> + size_t size [r5]) */
> +
> +#ifndef MEMCMP
> +# define MEMCMP memcmp
> +#endif
> + .machine power9
> +ENTRY_TOCLESS (MEMCMP, 4)
> + CALL_MCOUNT 3
> +
> + cmpldi cr6,r5,64
> + bgt cr6,L(loop_head)
> +
> +/* Compare 64 bytes. This section is used for lengths <= 64 and for the last
> + bytes for larger lengths. */
> +L(last_compare):
> + li r8,16
> +
> + sldi r9,r5,56
> + sldi r8,r8,56
> + addi r6,r3,16
> + addi r7,r4,16
> +
> + /* Align up to 16 bytes. */
> + lxvl 32+v0,r3,r9
> + lxvl 32+v2,r4,r9
> +
> + /* The sub. and vcmpneb. results are concatenated by the crnand in order to
> + do a single branch. It's doing a NOT(CR0.GT AND CR6.EQ) them loading to
s/them/then/
Also both lines above have more than 79 characters.
> + CR0.LT. That means r9 not bigger than 0 and v4 is not all equal
r9 *is* not bigger
> + to 0. */
> + sub. r9,r9,r8
> + vcmpneb. v4,v0,v2
> + crnand 4*cr0+3,4*cr0+1,4*cr6+2
> + bt 4*cr0+3,L(tail1)
> +
> + addi r3,r3,32
> + addi r4,r4,32
> +
> + lxvl 32+v1,r6,r9
> + lxvl 32+v3,r7,r9
> + sub. r9,r9,r8
> + vcmpneb. v5,v1,v3
> + crnand 4*cr0+3,4*cr0+1,4*cr6+2
> + bt 4*cr0+3,L(tail2)
> +
> + addi r6,r3,16
> + addi r7,r4,16
> +
> + lxvl 32+v6,r3,r9
> + lxvl 32+v8,r4,r9
> + sub. r9,r9,r8
> + vcmpneb. v4,v6,v8
> + crnand 4*cr0+3,4*cr0+1,4*cr6+2
> + bt 4*cr0+3,L(tail3)
> +
> + lxvl 32+v7,r6,r9
> + lxvl 32+v9,r7,r9
> + vcmpneb. v5,v7,v9
> + bne cr6,L(tail4)
> +
> +L(finish):
> + /* The contents are equal. */
> + li r3,0
> + blr
> +
> +L(loop_head):
> + /* Calculate how many loops to run. */
> + srdi. r8,r5,7
> + beq L(loop_tail)
> + mtctr r8
> +
> +/* Main loop. Compares 128 bytes each loop. */
> + .p2align 5
> +L(loop_128):
> + COMPARE_32(v0,v2,0,tail1,tail2)
> + COMPARE_32(v6,v8,32,tail3,tail4)
> + COMPARE_32(v10,v12,64,tail5,tail6)
> + COMPARE_32(v14,v16,96,tail7,tail8)
> +
> + addi r3,r3,128
> + addi r4,r4,128
> + bdnz L(loop_128)
> +
> + /* Account loop comparisons. */
> + clrldi. r5,r5,57
> + beq L(finish)
> +
> +/* Compares 64 bytes if length is still bigger than 64 bytes. */
> + .p2align 5
> +L(loop_tail):
> + cmpldi r5,64
> + ble L(last_compare)
> + COMPARE_32(v0,v2,0,tail1,tail2)
> + COMPARE_32(v6,v8,32,tail3,tail4)
> + addi r3,r3,64
> + addi r4,r4,64
> + subi r5,r5,64
> + b L(last_compare)
> +
> +L(tail1):
> + TAIL(v4,v0,v2)
> +
> +L(tail2):
> + TAIL(v5,v1,v3)
> +
> +L(tail3):
> + TAIL(v4,v6,v8)
> +
> +L(tail4):
> + TAIL(v5,v7,v9)
> +
> +L(tail5):
> + TAIL(v4,v10,v12)
> +
> +L(tail6):
> + TAIL(v5,v11,v13)
> +
> +L(tail7):
> + TAIL(v4,v14,v16)
> +
> +L(tail8):
> + TAIL(v5,v15,v17)
> +
> +END (MEMCMP)
> +libc_hidden_builtin_def (memcmp)
> +weak_alias (memcmp, bcmp)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index e6e013db17..626845a43c 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
> strncase-power8
>
> ifneq (,$(filter %le,$(config-machine)))
> -sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
> +sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
> rawmemchr-power9 rawmemchr-power10 \
> strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
> strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index a92b67448e..0acdf22ba3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -184,6 +184,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/powerpc/powerpc64/multiarch/memcmp.c. */
> IFUNC_IMPL (i, name, memcmp,
> +#ifdef __LITTLE_ENDIAN__
> + IFUNC_IMPL_ADD (array, i, memcmp,
> + hwcap2 & PPC_FEATURE2_ARCH_3_1
> + && hwcap & PPC_FEATURE_HAS_VSX,
> + __memcmp_power10)
> +#endif
> IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
> __memcmp_power8)
> IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_HAS_VSX,
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S
> new file mode 100644
> index 0000000000..73a0debd4a
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S
> @@ -0,0 +1,26 @@
> +/* Optimized memcmp implementation for POWER10.
> + Copyright (C) 2017-2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define MEMCMP __memcmp_power10
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +#undef weak_alias
> +#define weak_alias(name,alias)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power10/memcmp.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
> index 6fd53267b6..4fd089aba7 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
> @@ -27,11 +27,17 @@ extern __typeof (memcmp) __memcmp_ppc attribute_hidden;
> extern __typeof (memcmp) __memcmp_power4 attribute_hidden;
> extern __typeof (memcmp) __memcmp_power7 attribute_hidden;
> extern __typeof (memcmp) __memcmp_power8 attribute_hidden;
> +extern __typeof (memcmp) __memcmp_power10 attribute_hidden;
> # undef memcmp
>
> /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> ifunc symbol properly. */
> libc_ifunc_redirected (__redirect_memcmp, memcmp,
> +#ifdef __LITTLE_ENDIAN__
> + (hwcap2 & PPC_FEATURE2_ARCH_3_1
> + && hwcap & PPC_FEATURE_HAS_VSX)
> + ? __memcmp_power10 :
> +#endif
> (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> ? __memcmp_power8 :
> (hwcap & PPC_FEATURE_HAS_VSX)
>
--
Raphael Moreira Zinsly
next prev parent reply other threads:[~2021-05-28 18:37 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-05-24 17:07 Lucas A. M. Magalhaes
2021-05-28 18:37 ` Raphael M Zinsly [this message]
2021-05-31 21:05 ` Lucas A. M. Magalhaes
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=0598966a-28f1-4e68-0158-5576bfd82805@linux.ibm.com \
--to=rzinsly@linux.ibm.com \
--cc=lamm@linux.ibm.com \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).