public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Raphael M Zinsly <rzinsly@linux.ibm.com>
To: "Lucas A. M. Magalhaes" <lamm@linux.ibm.com>, libc-alpha@sourceware.org
Subject: Re: [PATCH] powerpc: Optimized memcmp for power10
Date: Fri, 28 May 2021 15:37:49 -0300	[thread overview]
Message-ID: <0598966a-28f1-4e68-0158-5576bfd82805@linux.ibm.com> (raw)
In-Reply-To: <20210524170732.1739829-1-lamm@linux.ibm.com>

Hi Lucas,

The code LGTM with some formatting fixes.

Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>

On 24/05/2021 14:07, Lucas A. M. Magalhaes via Libc-alpha wrote:
> This patch was based on the __memcmp_power8 and the recent
> __strlen_power10.
> 
> Improvements from __memcmp_power8:
> 
> 1. Don't need alignment code.
> 
>     On POWER10 lxvp and lxvl do not generate alignment interrupts, so
> they are safe for use on caching-inhibited memory.  Notice that the
> comparison on the main loop will wait for both VSR to be ready.
> Therefore aligning one of the input address does not improve
> performance.  In order to align both registers a vperm is necessary
> which add too much overhead.
> 
> 2. Uses new POWER10 instructions
> 
>     This code uses lxvp to decrease contention on load by loading 32 bytes
> per instruction.
>     The vextractbm is used to have a smaller tail code for calculating the
> return value.
> 
> 3. Performance improvement
> 
>     This version has around 35% better performance on average. I saw no
> performance regressions for any length or alignment.
> 
> Thanks Matheus for helping me out with some details.
> 
> Co-authored-by: Matheus Castanho <msc@linux.ibm.com>
> ---
>   sysdeps/powerpc/powerpc64/le/power10/memcmp.S | 181 ++++++++++++++++++
>   sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
>   .../powerpc64/multiarch/ifunc-impl-list.c     |   6 +
>   .../powerpc64/multiarch/memcmp-power10.S      |  26 +++
>   sysdeps/powerpc/powerpc64/multiarch/memcmp.c  |   6 +
>   5 files changed, 220 insertions(+), 1 deletion(-)
>   create mode 100644 sysdeps/powerpc/powerpc64/le/power10/memcmp.S
>   create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power10/memcmp.S b/sysdeps/powerpc/powerpc64/le/power10/memcmp.S
> new file mode 100644
> index 0000000000..0285ba32bc
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power10/memcmp.S
> @@ -0,0 +1,181 @@
> +/* Optimized memcmp implementation for POWER10.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* TODO: Replace macros by the actual instructions when minimum binutils becomes
> +   >= 2.35.  This is used to keep compatibility with older versions.  */
> +#define VEXTRACTBM(rt,vrb)	 \
> +	.long(((4)<<(32-6))	 \
> +	      | ((rt)<<(32-11))	 \
> +	      | ((8)<<(32-16))	 \
> +	      | ((vrb)<<(32-21)) \
> +	      | 1602)
> +
> +#define LXVP(xtp,dq,ra)		   \
> +	.long(((6)<<(32-6))		   \
> +	      | ((((xtp)-32)>>1)<<(32-10)) \
> +	      | ((1)<<(32-11))		   \
> +	      | ((ra)<<(32-16))		   \
> +	      | dq)
> +
> +/* Compare 32 bytes.  */
> +#define COMPARE_32(vr1,vr2,offset,tail_1,tail_2)\
> +	LXVP(32+vr1,offset,r3);	\
> +	LXVP(32+vr2,offset,r4);	\
> +	vcmpneb.	v5,vr1+1,vr2+1; \
> +	bne	cr6,L(tail_2);		\

The cr6 is unaligned with the previous line, also the backslashes are 
unaligned in the first lines.
There are simmilar issues on many other parts of the code.

> +	vcmpneb.	v4,vr1,vr2;	\
> +	bne	cr6,L(tail_1);		\
> +
> +/* Get the first different bytes of each vector register. Then subtract them and
> +   return the result on r3.  */
> +#define TAIL(v_res,s1,s2)	\
> +	vctzlsbb	r7,v_res;	\
> +	vextubrx	r8,r7,s1;	\
> +	vextubrx	r9,r7,s2;	\
> +	subf	r3,r9,r8;		\
> +	blr;					\
> +
> +/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4],
> +					size_t size [r5])  */
> +
> +#ifndef MEMCMP
> +# define MEMCMP memcmp
> +#endif
> +	.machine  power9
> +ENTRY_TOCLESS (MEMCMP, 4)
> +	CALL_MCOUNT 3
> +
> +	cmpldi	cr6,r5,64
> +	bgt	cr6,L(loop_head)
> +
> +/* Compare 64 bytes. This section is used for lengths <= 64 and for the last
> +   bytes for larger lengths.  */
> +L(last_compare):
> +	li	r8,16
> +
> +	sldi	r9,r5,56
> +	sldi	r8,r8,56
> +	addi	r6,r3,16
> +	addi	r7,r4,16
> +
> +	/* Align up to 16 bytes.  */
> +	lxvl	32+v0,r3,r9
> +	lxvl	32+v2,r4,r9
> +
> +	/* The sub. and vcmpneb. results are concatenated by the crnand in order to
> +	   do a single branch. It's doing a NOT(CR0.GT AND CR6.EQ) them loading to

s/them/then/
Also both lines above have more than 79 characters.

> +	   CR0.LT.  That means r9 not bigger than 0 and v4 is not all equal

r9 *is* not bigger

> +	   to 0.  */
> +	sub.	r9,r9,r8
> +	vcmpneb.	v4,v0,v2
> +	crnand	4*cr0+3,4*cr0+1,4*cr6+2
> +	bt	4*cr0+3,L(tail1)
> +
> +	addi	r3,r3,32
> +	addi	r4,r4,32
> +
> +	lxvl	32+v1,r6,r9
> +	lxvl	32+v3,r7,r9
> +	sub.	r9,r9,r8
> +	vcmpneb.	v5,v1,v3
> +	crnand  4*cr0+3,4*cr0+1,4*cr6+2
> +	bt	4*cr0+3,L(tail2)
> +
> +	addi	r6,r3,16
> +	addi	r7,r4,16
> +
> +	lxvl	32+v6,r3,r9
> +	lxvl	32+v8,r4,r9
> +	sub.	r9,r9,r8
> +	vcmpneb.	v4,v6,v8
> +	crnand  4*cr0+3,4*cr0+1,4*cr6+2
> +	bt	4*cr0+3,L(tail3)
> +
> +	lxvl	32+v7,r6,r9
> +	lxvl	32+v9,r7,r9
> +	vcmpneb.	v5,v7,v9
> +	bne	cr6,L(tail4)
> +
> +L(finish):
> +	/* The contents are equal.  */
> +	li	r3,0
> +	blr
> +
> +L(loop_head):
> +	/* Calculate how many loops to run.  */
> +	srdi.	r8,r5,7
> +	beq	L(loop_tail)
> +	mtctr	r8
> +
> +/* Main loop.  Compares 128 bytes each loop.  */
> +	.p2align 5
> +L(loop_128):
> +	COMPARE_32(v0,v2,0,tail1,tail2)
> +	COMPARE_32(v6,v8,32,tail3,tail4)
> +	COMPARE_32(v10,v12,64,tail5,tail6)
> +	COMPARE_32(v14,v16,96,tail7,tail8)
> +
> +	addi	r3,r3,128
> +	addi	r4,r4,128
> +	bdnz	L(loop_128)
> +
> +	/* Account loop comparisons.  */
> +	clrldi.	r5,r5,57
> +	beq	L(finish)
> +
> +/* Compares 64 bytes if length is still bigger than 64 bytes.  */
> +	.p2align 5
> +L(loop_tail):
> +	cmpldi	r5,64
> +	ble	L(last_compare)
> +	COMPARE_32(v0,v2,0,tail1,tail2)
> +	COMPARE_32(v6,v8,32,tail3,tail4)
> +	addi	r3,r3,64
> +	addi	r4,r4,64
> +	subi	r5,r5,64
> +	b	L(last_compare)
> +
> +L(tail1):
> +	TAIL(v4,v0,v2)
> +
> +L(tail2):
> +	TAIL(v5,v1,v3)
> +
> +L(tail3):
> +	TAIL(v4,v6,v8)
> +
> +L(tail4):
> +	TAIL(v5,v7,v9)
> +
> +L(tail5):
> +	TAIL(v4,v10,v12)
> +
> +L(tail6):
> +	TAIL(v5,v11,v13)
> +
> +L(tail7):
> +	TAIL(v4,v14,v16)
> +
> +L(tail8):
> +	TAIL(v5,v15,v17)
> +
> +END (MEMCMP)
> +libc_hidden_builtin_def (memcmp)
> +weak_alias (memcmp, bcmp)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index e6e013db17..626845a43c 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
>   		   strncase-power8
> 
>   ifneq (,$(filter %le,$(config-machine)))
> -sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
> +sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
>   		   rawmemchr-power9 rawmemchr-power10 \
>   		   strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
>   		   strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index a92b67448e..0acdf22ba3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -184,6 +184,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> 
>     /* Support sysdeps/powerpc/powerpc64/multiarch/memcmp.c.  */
>     IFUNC_IMPL (i, name, memcmp,
> +#ifdef __LITTLE_ENDIAN__
> +	      IFUNC_IMPL_ADD (array, i, memcmp,
> +            hwcap2 & PPC_FEATURE2_ARCH_3_1
> +            && hwcap & PPC_FEATURE_HAS_VSX,
> +			      __memcmp_power10)
> +#endif
>   	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
>   			      __memcmp_power8)
>   	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_HAS_VSX,
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S
> new file mode 100644
> index 0000000000..73a0debd4a
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S
> @@ -0,0 +1,26 @@
> +/* Optimized memcmp implementation for POWER10.
> +   Copyright (C) 2017-2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define MEMCMP __memcmp_power10
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +#undef weak_alias
> +#define weak_alias(name,alias)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power10/memcmp.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
> index 6fd53267b6..4fd089aba7 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
> @@ -27,11 +27,17 @@ extern __typeof (memcmp) __memcmp_ppc attribute_hidden;
>   extern __typeof (memcmp) __memcmp_power4 attribute_hidden;
>   extern __typeof (memcmp) __memcmp_power7 attribute_hidden;
>   extern __typeof (memcmp) __memcmp_power8 attribute_hidden;
> +extern __typeof (memcmp) __memcmp_power10 attribute_hidden;
>   # undef memcmp
> 
>   /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>      ifunc symbol properly.  */
>   libc_ifunc_redirected (__redirect_memcmp, memcmp,
> +#ifdef __LITTLE_ENDIAN__
> +				(hwcap2 & PPC_FEATURE2_ARCH_3_1
> +				 && hwcap & PPC_FEATURE_HAS_VSX)
> +				 ? __memcmp_power10 :
> +#endif
>   		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
>   		       ? __memcmp_power8 :
>   		       (hwcap & PPC_FEATURE_HAS_VSX)
> 

-- 
Raphael Moreira Zinsly

  reply	other threads:[~2021-05-28 18:37 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-05-24 17:07 Lucas A. M. Magalhaes
2021-05-28 18:37 ` Raphael M Zinsly [this message]
2021-05-31 21:05   ` Lucas A. M. Magalhaes

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=0598966a-28f1-4e68-0158-5576bfd82805@linux.ibm.com \
    --to=rzinsly@linux.ibm.com \
    --cc=lamm@linux.ibm.com \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).