From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1969) id BFD493858438; Thu, 7 Dec 2023 17:11:20 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org BFD493858438 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1701969080; bh=wEokAI1BvTXL9C5rBLR7dEeEgqq6OAgM6gpkkik/QvI=; h=From:To:Subject:Date:From; b=CsU/9sSey/r+WY9Phtl9++XJ2mk5tHDpsH+FCzFARvay+72KFlVFoTmM3vXSX2ROg sRwKvfjoe3Jy2HNOlM0J9FfdCdjDBBwdadqN2duDOwfzqGYWFOUw8IYjmRk7yidjNQ 56bxdT6VokCofPkGs8B1Su2sDxgvRuPQWJmufpq0= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Rajalakshmi Srinivasaraghavan To: glibc-cvs@sourceware.org Subject: [glibc] powerpc: Optimized strcmp for power10 X-Act-Checkin: glibc X-Git-Author: Amrita H S X-Git-Refname: refs/heads/master X-Git-Oldrev: 546a1ba664626603660b595662249d524e429013 X-Git-Newrev: 3367d8e180848030d1646f088759f02b8dfe0d6f Message-Id: <20231207171120.BFD493858438@sourceware.org> Date: Thu, 7 Dec 2023 17:11:20 +0000 (GMT) List-Id: https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=3367d8e180848030d1646f088759f02b8dfe0d6f commit 3367d8e180848030d1646f088759f02b8dfe0d6f Author: Amrita H S Date: Wed Dec 6 11:43:11 2023 -0500 powerpc: Optimized strcmp for power10 This patch is based on __strcmp_power9 and __strlen_power10. Improvements from __strcmp_power9: 1. Uses new POWER10 instructions - This code uses lxvp to decrease contention on load by loading 32 bytes per instruction. 2. Performance implication - This version has around 30% better performance on average. - Performance regression is seen for a specific combination of sizes and alignments. Some of them is observed without changes also, while rest may be induced by the patch. Signed-off-by: Amrita H S Reviewed-by: Paul E. Murphy Diff: --- sysdeps/powerpc/powerpc64/le/power10/strcmp.S | 204 +++++++++++++++++++++ sysdeps/powerpc/powerpc64/multiarch/Makefile | 3 +- .../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 4 + .../powerpc/powerpc64/multiarch/strcmp-power10.S | 26 +++ sysdeps/powerpc/powerpc64/multiarch/strcmp.c | 4 + 5 files changed, 240 insertions(+), 1 deletion(-) diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S new file mode 100644 index 0000000000..a3c1adad53 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S @@ -0,0 +1,204 @@ +/* Optimized strcmp implementation for PowerPC64/POWER10. + Copyright (C) 2021-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ +#include + +#ifndef STRCMP +# define STRCMP strcmp +#endif + +/* Implements the function + int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]). */ + +/* TODO: Change this to actual instructions when minimum binutils is upgraded + to 2.27. Macros are defined below for these newer instructions in order + to maintain compatibility. */ + +#define LXVP(xtp,dq,ra) \ + .long(((6)<<(32-6)) \ + | ((((xtp)-32)>>1)<<(32-10)) \ + | ((1)<<(32-11)) \ + | ((ra)<<(32-16)) \ + | dq) + +#define COMPARE_16(vreg1,vreg2,offset) \ + lxv vreg1+32,offset(r3); \ + lxv vreg2+32,offset(r4); \ + vcmpnezb. v7,vreg1,vreg2; \ + bne cr6,L(different); \ + +#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \ + LXVP(vreg1+32,offset,r3); \ + LXVP(vreg2+32,offset,r4); \ + vcmpnezb. v7,vreg1+1,vreg2+1; \ + bne cr6,L(label1); \ + vcmpnezb. v7,vreg1,vreg2; \ + bne cr6,L(label2); \ + +#define TAIL(vreg1,vreg2) \ + vctzlsbb r6,v7; \ + vextubrx r5,r6,vreg1; \ + vextubrx r4,r6,vreg2; \ + subf r3,r4,r5; \ + blr; \ + +#define CHECK_N_BYTES(reg1,reg2,len_reg) \ + sldi r0,len_reg,56; \ + lxvl 32+v4,reg1,r0; \ + lxvl 32+v5,reg2,r0; \ + add reg1,reg1,len_reg; \ + add reg2,reg2,len_reg; \ + vcmpnezb. v7,v4,v5; \ + vctzlsbb r6,v7; \ + cmpld cr7,r6,len_reg; \ + blt cr7,L(different); \ + + /* TODO: change this to .machine power10 when the minimum required + binutils allows it. */ + + .machine power9 +ENTRY_TOCLESS (STRCMP, 4) + li r11,16 + /* eq bit of cr1 used as swap status flag to indicate if + source pointers were swapped. */ + crclr 4*cr1+eq + vspltisb v19,-1 + andi. r7,r3,15 + sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */ + andi. r9,r4,15 + sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */ + cmpld cr7,r7,r5 + beq cr7,L(same_aligned) + blt cr7,L(nalign1_min) + /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the + pointer which is closer to the next 16B boundary so that only + one CHECK_N_BYTES is needed before entering the loop below. */ + mr r8,r4 + mr r4,r3 + mr r3,r8 + mr r12,r7 + mr r7,r5 + mr r5,r12 + crset 4*cr1+eq /* Set bit on swapping source pointers. */ + + .p2align 5 +L(nalign1_min): + CHECK_N_BYTES(r3,r4,r7) + + .p2align 5 +L(s1_aligned): + /* r9 and r5 is number of bytes to be read after and before + page boundary correspondingly. */ + sub r5,r5,r7 + subfic r9,r5,16 + /* Now let r7 hold the count of quadwords which can be + checked without crossing a page boundary. quadword offset is + (str2>>4)&0xFF. */ + rlwinm r7,r4,28,0xFF + /* Below check is required only for first iteration. For second + iteration and beyond, the new loop counter is always 255. */ + cmpldi r7,255 + beq L(L3) + /* Get the initial loop count by 255-((str2>>4)&0xFF). */ + subfic r11,r7,255 + + .p2align 5 +L(L1): + mtctr r11 + + .p2align 5 +L(L2): + COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */ + addi r3,r3,16 + addi r4,r4,16 + bdnz L(L2) + /* Cross the page boundary of s2, carefully. */ + + .p2align 5 +L(L3): + CHECK_N_BYTES(r3,r4,r5) + CHECK_N_BYTES(r3,r4,r9) + li r11,255 /* Load the new loop counter. */ + b L(L1) + + .p2align 5 +L(same_aligned): + CHECK_N_BYTES(r3,r4,r7) + /* Align s1 to 32B and adjust s2 address. + Use lxvp only if both s1 and s2 are 32B aligned. */ + COMPARE_16(v4,v5,0) + COMPARE_16(v4,v5,16) + COMPARE_16(v4,v5,32) + COMPARE_16(v4,v5,48) + addi r3,r3,64 + addi r4,r4,64 + COMPARE_16(v4,v5,0) + COMPARE_16(v4,v5,16) + + clrldi r6,r3,59 + subfic r5,r6,32 + add r3,r3,r5 + add r4,r4,r5 + andi. r5,r4,0x1F + beq cr0,L(32B_aligned_loop) + + .p2align 5 +L(16B_aligned_loop): + COMPARE_16(v4,v5,0) + COMPARE_16(v4,v5,16) + COMPARE_16(v4,v5,32) + COMPARE_16(v4,v5,48) + addi r3,r3,64 + addi r4,r4,64 + b L(16B_aligned_loop) + + /* Calculate and return the difference. */ +L(different): + vctzlsbb r6,v7 + vextubrx r5,r6,v4 + vextubrx r4,r6,v5 + bt 4*cr1+eq,L(swapped) + subf r3,r4,r5 + blr + + /* If src pointers were swapped, then swap the + indices and calculate the return value. */ +L(swapped): + subf r3,r5,r4 + blr + + .p2align 5 +L(32B_aligned_loop): + COMPARE_32(v14,v16,0,tail1,tail2) + COMPARE_32(v18,v20,32,tail3,tail4) + COMPARE_32(v22,v24,64,tail5,tail6) + COMPARE_32(v26,v28,96,tail7,tail8) + addi r3,r3,128 + addi r4,r4,128 + b L(32B_aligned_loop) + +L(tail1): TAIL(v15,v17) +L(tail2): TAIL(v14,v16) +L(tail3): TAIL(v19,v21) +L(tail4): TAIL(v18,v20) +L(tail5): TAIL(v23,v25) +L(tail6): TAIL(v22,v24) +L(tail7): TAIL(v27,v29) +L(tail8): TAIL(v26,v28) + +END (STRCMP) +libc_hidden_builtin_def (strcmp) diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 27d8495503..d7824a922b 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -33,7 +33,8 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ ifneq (,$(filter %le,$(config-machine))) sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \ rawmemchr-power9 rawmemchr-power10 \ - strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ + strcmp-power9 strcmp-power10 strncmp-power9 \ + strcpy-power9 stpcpy-power9 \ strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index fc26dd0e17..965dd17786 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -377,6 +377,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */ IFUNC_IMPL (i, name, strcmp, #ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, strcmp, + (hwcap2 & PPC_FEATURE2_ARCH_3_1) + && (hwcap & PPC_FEATURE_HAS_VSX), + __strcmp_power10) IFUNC_IMPL_ADD (array, i, strcmp, hwcap2 & PPC_FEATURE2_ARCH_3_00 && hwcap & PPC_FEATURE_HAS_ALTIVEC, diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S new file mode 100644 index 0000000000..c80067ce33 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S @@ -0,0 +1,26 @@ +/* Optimized strcmp implementation for POWER10/PPC64. + Copyright (C) 2021-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if defined __LITTLE_ENDIAN__ && IS_IN (libc) +#define STRCMP __strcmp_power10 + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include +#endif /* __LITTLE_ENDIAN__ && IS_IN (libc) */ diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c index 31fcdee916..f1dac99b66 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c @@ -29,12 +29,16 @@ extern __typeof (strcmp) __strcmp_power7 attribute_hidden; extern __typeof (strcmp) __strcmp_power8 attribute_hidden; # ifdef __LITTLE_ENDIAN__ extern __typeof (strcmp) __strcmp_power9 attribute_hidden; +extern __typeof (strcmp) __strcmp_power10 attribute_hidden; # endif # undef strcmp libc_ifunc_redirected (__redirect_strcmp, strcmp, # ifdef __LITTLE_ENDIAN__ + (hwcap2 & PPC_FEATURE2_ARCH_3_1 + && hwcap & PPC_FEATURE_HAS_VSX) + ? __strcmp_power10 : (hwcap2 & PPC_FEATURE2_ARCH_3_00 && hwcap & PPC_FEATURE_HAS_ALTIVEC) ? __strcmp_power9 :