From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 89227 invoked by alias); 4 Apr 2016 20:46:58 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Received: (qmail 89216 invoked by uid 89); 4 Apr 2016 20:46:57 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.9 required=5.0 tests=BAYES_00,RCVD_IN_DNSWL_NONE,SPF_PASS autolearn=ham version=3.3.2 spammy=04042016 X-HELO: mail-yw0-f176.google.com X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20130820; h=x-gm-message-state:subject:to:references:from:message-id:date :user-agent:mime-version:in-reply-to:content-transfer-encoding; bh=ZjQh4DPcegMSPu9MdtQ1W8eDPYqY/1eB7HicEL3hv3w=; b=cgigdCuI6QcUt6b6LwJNEIMK8A1FJc+sMR1u3kfiHKol3HlR7TpbM/B0BGa1SAhiQi N1kJSKplsLGQqy0Snd9Qi37VM/5SrgV25qz3Xarud/qOm5uEQWa/LHjJhTpIqj9LwKo4 LH9u2uX5E+VwlU2tfAYM7s4BXcr9ucDDeW/FGTVzm+nk0NG7Rvj8coJUzZcdzJVI78Rv K7rFQEB3RcfMggVuJljdkfoFldRB4HGG71OlSb3HWjlweLE0EQwQzDegNwcj8JEXki5v U/fhJm+U1iWGiFsIdrRYP9J8CsQjr5EW+9bKI+6tXGSc2EoEF8l3SKdSMjH7S8/HiHPe nCDQ== X-Gm-Message-State: AD7BkJK7FrzyhtuFeKWGYVuNDf3w1OfKV8unxCw61Vuah+yE+ych4WTHplXYXz9Q4mmn7SdJ X-Received: by 10.129.70.212 with SMTP id t203mr11421869ywa.295.1459802805103; Mon, 04 Apr 2016 13:46:45 -0700 (PDT) Subject: Re: [PATCH v3] powerpc: Add optimized P8 strspn To: libc-alpha@sourceware.org References: <5702A88A.7060507@linux.vnet.ibm.com> From: Adhemerval Zanella Message-ID: <5702D2B0.9090209@linaro.org> Date: Mon, 04 Apr 2016 20:46:00 -0000 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.6.0 MIME-Version: 1.0 In-Reply-To: <5702A88A.7060507@linux.vnet.ibm.com> Content-Type: text/plain; charset=windows-1252 Content-Transfer-Encoding: 7bit X-SW-Source: 2016-04/txt/msg00061.txt.bz2 LGTM. On 04-04-2016 14:46, Paul E. 
Murphy wrote: > No big changes, but necessary to fix a silent merge conflict: > > * Rename r0 to 0 for lvx/lvsr usage as appropriate > * A couple of minor typos in comments > * Change strspn-ppc64.S to strspn-ppc64.c as recent changes > have retracted the PPC64 specific ASM > > retested on ppc64le > > ----8<---- > This utilizes vectors and bitmasks. For small needle, large > haystack, the performance improvement is up to 8x. For short > strings (0-4B), the cost of computing the bitmask dominates, > and is a tad slower. > > 2016-04-04 Paul E. Murphy > > * sysdeps/powerpc/powerpc64/multiarch/Makefile: > (sysdep_routines): Add new strspn targets. > * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c: > (__libc_ifunc_impl_list): Add strspn. > * sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S: > New file. > * sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c: > Likewise. > * sysdeps/powerpc/powerpc64/multiarch/strspn.c: > Likewise. > * sysdeps/powerpc/powerpc64/power8/strspn.S: > Likewise. 
> --- > sysdeps/powerpc/powerpc64/multiarch/Makefile | 3 +- > .../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 8 + > .../powerpc/powerpc64/multiarch/strspn-power8.S | 40 +++++ > sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c | 25 +++ > sysdeps/powerpc/powerpc64/multiarch/strspn.c | 35 ++++ > sysdeps/powerpc/powerpc64/power8/strspn.S | 179 +++++++++++++++++++++ > 6 files changed, 289 insertions(+), 1 deletion(-) > create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S > create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c > create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strspn.c > create mode 100644 sysdeps/powerpc/powerpc64/power8/strspn.S > > diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile > index 3b0e3a0..7ed56bf 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile > +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile > @@ -19,7 +19,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ > strcmp-power8 strcmp-power7 strcmp-ppc64 \ > strcat-power8 strcat-power7 strcat-ppc64 \ > memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \ > - strncpy-power8 strstr-power7 strstr-ppc64 > + strncpy-power8 strstr-power7 strstr-ppc64 \ > + strspn-power8 strspn-ppc64 > > CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops > CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops > diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > index 11a8215..3e1f099 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > @@ -322,6 +322,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strcat, 1, > __strcat_ppc)) > > + /* Support sysdeps/powerpc/powerpc64/multiarch/strspn.c. 
*/ > + IFUNC_IMPL (i, name, strspn, > + IFUNC_IMPL_ADD (array, i, strspn, > + hwcap2 & PPC_FEATURE2_ARCH_2_07, > + __strspn_power8) > + IFUNC_IMPL_ADD (array, i, strspn, 1, > + __strspn_ppc)) > + > /* Support sysdeps/powerpc/powerpc64/multiarch/strstr.c. */ > IFUNC_IMPL (i, name, strstr, > IFUNC_IMPL_ADD (array, i, strstr, > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S > new file mode 100644 > index 0000000..86a4e09 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S > @@ -0,0 +1,40 @@ > +/* Optimized strspn implementation for POWER8. > + Copyright (C) 2016 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + . 
*/ > + > +#include > + > +#undef EALIGN > +#define EALIGN(name, alignt, words) \ > + .section ".text"; \ > + ENTRY_2(__strspn_power8) \ > + .align ALIGNARG(alignt); \ > + EALIGN_W_##words; \ > + BODY_LABEL(__strspn_power8): \ > + cfi_startproc; \ > + LOCALENTRY(__strspn_power8) > + > +#undef END > +#define END(name) \ > + cfi_endproc; \ > + TRACEBACK(__strspn_power8) \ > + END_2(__strspn_power8) > + > +#undef libc_hidden_builtin_def > +#define libc_hidden_builtin_def(name) > + > +#include > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c > new file mode 100644 > index 0000000..4c63665 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c > @@ -0,0 +1,25 @@ > +/* Default strspn implementation for PowerPC64. > + Copyright (C) 2016 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + . */ > + > +#define STRSPN __strspn_ppc > +#ifdef SHARED > +#undef libc_hidden_def > +#define libc_hidden_def(name) > +#endif > + > +#include > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/powerpc/powerpc64/multiarch/strspn.c > new file mode 100644 > index 0000000..0e653f3 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn.c > @@ -0,0 +1,35 @@ > +/* Multiple versions of strspn. 
PowerPC64 version. > + Copyright (C) 2016 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + . */ > + > +# include > +# include > +# include "init-arch.h" > + > +#undef strspn > +extern __typeof (strspn) __libc_strspn; > + > +extern __typeof (strspn) __strspn_ppc attribute_hidden; > +extern __typeof (strspn) __strspn_power8 attribute_hidden; > + > +libc_ifunc (__libc_strspn, > + (hwcap2 & PPC_FEATURE2_ARCH_2_07) > + ? __strspn_power8 > + : __strspn_ppc); > + > +weak_alias (__libc_strspn, strspn) > +libc_hidden_builtin_def (strspn) > diff --git a/sysdeps/powerpc/powerpc64/power8/strspn.S b/sysdeps/powerpc/powerpc64/power8/strspn.S > new file mode 100644 > index 0000000..35d868e > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/power8/strspn.S > @@ -0,0 +1,179 @@ > +/* Optimized strspn implementation for Power8. > + > + Copyright (C) 2016 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + . */ > + > +/* size_t [r3] strspn (const char *string [r3], > + const char *needleAccept [r4]) */ > + > +/* This takes a novel approach by computing a 256 bit mask whereby > + each set bit implies the byte is "accepted". P8 vector hardware > + has extremely efficient hardware for selecting bits from a mask. > + > + One might ask "why not use bpermd for short strings"? It is > + so slow that its performance about matches the generic PPC64 > + variant without any fancy masking, with the added expense of > + making the mask. That was the first variant of this. */ > + > + > + > +#include "sysdep.h" > + > +/* Simple macro to use VSX instructions in overlapping VR's. */ > +#define XXVR(insn, vrt, vra, vrb) \ > + insn 32+vrt, 32+vra, 32+vrb > + > +/* ISA 2.07B instructions are not all defined for older binutils. > + Macros are defined below for these newer instructions in order > + to maintain compatibility. */ > + > +/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs. */ > +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) > +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) > + > +#define VBPERMQ(t,a,b) .long (0x1000054c \ > + | ((t)<<(32-11)) \ > + | ((a)<<(32-16)) \ > + | ((b)<<(32-21)) ) > + > + /* This can be updated to power8 once the minimum version of > + binutils supports power8 and the above instructions. */ > + .machine power7 > +EALIGN(strspn, 4, 0) > + CALL_MCOUNT 2 > + > + /* Generate useful constants for later on. 
*/ > + vspltisb v1, 7 > + vspltisb v2, -1 > + vslb v1, v1, v1 /* 0x80 to swap high bit for vbpermq. */ > + vspltisb v10, 0 > + vsldoi v4, v10, v2, 2 /* 0xFFFF into vr4. */ > + XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches. */ > + > + /* Prepare to compute 256b mask. */ > + addi r4, r4, -1 > + li r5, 0 > + li r6, 0 > + li r7, 0 > + li r8, 0 > + li r11, 1 > + sldi r11, r11, 63 > + > + /* Start interleaved Mask computation. > + This will eventually OR 1's into ignored bits from vbpermq. */ > + lvsr v11, 0, r3 > + vspltb v11, v11, 0 /* Splat shift constant. */ > + > + /* Build a 256b mask in r5-r8. */ > + .align 4 > +L(next_needle): > + lbzu r9, 1(r4) > + > + cmpldi cr0, r9, 0 > + cmpldi cr1, r9, 128 > + > + /* This is a little tricky. srd only uses the first 7 bits, > + and if bit 7 is set, value is always 0. So, we can > + effectively shift 128b in this case. */ > + xori r12, r9, 0x40 /* Invert bit 6. */ > + srd r10, r11, r9 /* Mask for bits 0-63. */ > + srd r12, r11, r12 /* Mask for bits 64-127. */ > + > + beq cr0, L(start_cmp) > + > + /* Now, OR the value into the correct GPR. */ > + bge cr1,L(needle_gt128) > + or r5, r5, r10 /* 0 - 63. */ > + or r6, r6, r12 /* 64 - 127. */ > + b L(next_needle) > + > + .align 4 > +L(needle_gt128): > + or r7, r7, r10 /* 128 - 191. */ > + or r8, r8, r12 /* 192 - 255. */ > + b L(next_needle) > + > + > + .align 4 > +L(start_cmp): > + /* Move and merge bitmap into 2 VRs. bpermd is slower on P8. */ > + mr r0, r3 /* Save r3 for final length computation. */ > + MTVRD (v5, r5) > + MTVRD (v6, r6) > + MTVRD (v7, r7) > + MTVRD (v8, r8) > + > + /* Continue interleaved mask generation. */ > +#ifdef __LITTLE_ENDIAN__ > + vsrw v11, v2, v11 /* Note, shift ignores higher order bits. */ > + vsplth v11, v11, 0 /* Only care about the high 16 bits of v11. */ > +#else > + vslw v11, v2, v11 /* Note, shift ignores higher order bits. */ > + vsplth v11, v11, 1 /* Only care about the low 16 bits of v11. 
*/ > +#endif > + lvx v0, 0, r3 /* Note, unaligned load ignores lower bits. */ > + > + /* Do the merging of the bitmask. */ > + XXVR(xxmrghd, v5, v5, v6) > + XXVR(xxmrghd, v6, v7, v8) > + > + /* Finish mask generation. */ > + vand v11, v11, v4 /* Throwaway bits not in the mask. */ > + > + /* Compare the first 1-16B, while masking unwanted bytes. */ > + clrrdi r3, r3, 4 /* Note, counts from qw boundaries. */ > + vxor v9, v0, v1 /* Swap high bit. */ > + VBPERMQ (v8, v5, v0) > + VBPERMQ (v7, v6, v9) > + vor v7, v7, v8 > + vor v7, v7, v11 /* Ignore non-participating bytes. */ > + vcmpequh. v8, v7, v4 > + bnl cr6, L(done) > + > + addi r3, r3, 16 > + > + .align 4 > +L(vec): > + lvx v0, 0, r3 > + addi r3, r3, 16 > + vxor v9, v0, v1 /* Swap high bit. */ > + VBPERMQ (v8, v5, v0) > + VBPERMQ (v7, v6, v9) > + vor v7, v7, v8 > + vcmpequh. v8, v7, v4 > + blt cr6, L(vec) > + > + addi r3, r3, -16 > +L(done): > + subf r3, r0, r3 > + MFVRD (r10, v7) > + > +#ifdef __LITTLE_ENDIAN__ > + addi r0, r10, 1 /* Count the trailing 1's. */ > + andc r10, r10, r0 > + popcntd r10, r10 > +#else > + xori r10, r10, 0xffff /* Count leading 1's by inverting. */ > + addi r3, r3, -48 /* Account for the extra leading zeros. */ > + cntlzd r10, r10 > +#endif > + > + add r3, r3, r10 > + blr > + > +END(strspn) > +libc_hidden_builtin_def (strspn) >