* [PATCH] powerpc64: strrchr optimization for power8
@ 2017-02-14 11:06 Rajalakshmi Srinivasaraghavan
2017-02-20 13:10 ` Gabriel F. T. Gomes
` (2 more replies)
0 siblings, 3 replies; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-02-14 11:06 UTC (permalink / raw)
To: libc-alpha; +Cc: Rajalakshmi Srinivasaraghavan
Changes from previous version [1]
- Comments correction and alignment changes.
--
The POWER7 code is used for strings of up to 32 bytes; for longer strings,
vectorized loops are used. This shows an average 25% improvement, depending
on the position of the search character. Performance is the same for shorter strings.
Tested on ppc64 and ppc64le.
2017-02-14 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/multiarch/Makefile
(sysdep_routines): Add strrchr-power8.
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
(strrchr): Add __strrchr_power8 to list of strrchr functions.
* sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S: New file.
* sysdeps/powerpc/powerpc64/multiarch/strrchr.c
(strrchr): Add __strrchr_power8 to ifunc list.
* sysdeps/powerpc/powerpc64/power8/strrchr.S: New file.
---
sysdeps/powerpc/powerpc64/multiarch/Makefile | 3 +-
.../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 3 +
.../powerpc/powerpc64/multiarch/strrchr-power8.S | 39 ++
sysdeps/powerpc/powerpc64/multiarch/strrchr.c | 3 +
sysdeps/powerpc/powerpc64/power8/strrchr.S | 464 +++++++++++++++++++++
5 files changed, 511 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
create mode 100644 sysdeps/powerpc/powerpc64/power8/strrchr.S
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index f5889a3..0fc0ebc 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -14,7 +14,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
strchrnul-power8 strchrnul-power7 strchrnul-ppc64 \
strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \
stpcpy-power7 stpcpy-ppc64 \
- strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
+ strrchr-power8 strrchr-power7 strrchr-ppc64 \
+ strncat-power7 strncat-ppc64 \
strncpy-power7 strncpy-ppc64 \
stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
strcmp-power9 strcmp-power8 strcmp-power7 strcmp-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 209aec5..d77c47f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -281,6 +281,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strrchr.c. */
IFUNC_IMPL (i, name, strrchr,
IFUNC_IMPL_ADD (array, i, strrchr,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strrchr_power8)
+ IFUNC_IMPL_ADD (array, i, strrchr,
hwcap & PPC_FEATURE_HAS_VSX,
__strrchr_power7)
IFUNC_IMPL_ADD (array, i, strrchr, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
new file mode 100644
index 0000000..23365a1
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
@@ -0,0 +1,39 @@
+/* Optimized strrchr implementation for POWER8.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#undef ENTRY
+#define ENTRY(name) \
+ .section ".text"; \
+ ENTRY_2(__strrchr_power8) \
+ .align ALIGNARG(2); \
+ BODY_LABEL(__strrchr_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__strrchr_power8)
+
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__strrchr_power8) \
+ END_2(__strrchr_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strrchr.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
index dc1d3d0..0f94c9d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
@@ -25,11 +25,14 @@
extern __typeof (strrchr) __strrchr_ppc attribute_hidden;
extern __typeof (strrchr) __strrchr_power7 attribute_hidden;
+extern __typeof (strrchr) __strrchr_power8 attribute_hidden;
#undef strrchr
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc_redirected (__redirect_strrchr, strrchr,
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strrchr_power8 :
(hwcap & PPC_FEATURE_HAS_VSX)
? __strrchr_power7
: __strrchr_ppc);
diff --git a/sysdeps/powerpc/powerpc64/power8/strrchr.S b/sysdeps/powerpc/powerpc64/power8/strrchr.S
new file mode 100644
index 0000000..8eb7485
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strrchr.S
@@ -0,0 +1,464 @@
+/* Optimized strrchr implementation for PowerPC64/POWER8.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* char *[r3] strrchr (char *s [r3], int c [r4]) */
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VADDUQM(t,a,b) .long (0x10000100 \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+#ifdef __LITTLE_ENDIAN__
+/* Find the match position from v6 and place result in r6. */
+# define CALCULATE_MATCH() \
+ VBPERMQ(v6, v6, v10); \
+ vsldoi v6, v6, v6, 6; \
+ MFVRD(r7, v6); \
+ cntlzd r6, r7; \
+ subfic r6, r6, 15;
+/*
+ * Find the first null position to mask bytes after null.
+ * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
+ * Result placed at v2.
+ */
+# define FIND_NULL_POS(reg) \
+ vspltisb v11, -1; \
+ VADDUQM(v11, reg, v11); \
+ vandc v11, v11, reg; \
+ VPOPCNTD(v2, v11); \
+ vspltb v11, v2, 15; \
+ vcmpequb. v11, v11, v9; \
+ blt cr6, 1f; \
+ vsldoi v9, v0, v9, 1; \
+ vslo v2, v2, v9; \
+1: \
+ vsumsws v2, v2, v0;
+#else
+# define CALCULATE_MATCH() \
+ VBPERMQ(v6, v6, v10); \
+ MFVRD(r7, v6); \
+ addi r6, r7, -1; \
+ andc r6, r6, r7; \
+ popcntd r6, r6; \
+ subfic r6, r6, 15;
+# define FIND_NULL_POS(reg) \
+ VCLZD(v2, reg); \
+ vspltb v11, v2, 7; \
+ vcmpequb. v11, v11, v9; \
+ blt cr6, 1f; \
+ vsldoi v9, v0, v9, 1; \
+ vsro v2, v2, v9; \
+1: \
+ vsumsws v2, v2, v0;
+#endif /* !__LITTLE_ENDIAN__ */
+ .machine power7
+ENTRY (strrchr)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+ cmpdi cr7,r4,0
+ ld r12,0(r8) /* Load doubleword from memory. */
+ li r9,0 /* Used to store last occurrence. */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+
+ beq cr7,L(null_match)
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+ /* r4 now holds only the replicated low byte of c. If that byte is
+ zero although c itself was non-zero, search for null instead. */
+ cmpdi cr7,r4,0
+ beq cr7,L(null_match)
+ /* Now r4 has a doubleword of c bytes and r0 has
+ a doubleword of null bytes. */
+
+ cmpb r10,r12,r4 /* Compare each byte against c byte. */
+ cmpb r11,r12,r0 /* Compare each byte against null byte. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ srd r11,r11,r6
+ sld r10,r10,r6
+ sld r11,r11,r6
+#else
+ sld r10,r10,r6
+ sld r11,r11,r6
+ srd r10,r10,r6
+ srd r11,r11,r6
+#endif
+ or r5,r10,r11 /* OR the results to speed things up. */
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done)
+
+L(align):
+ andi. r12, r8, 15
+
+ /* If r8 is not quadword aligned, the next doubleword is: skip to the
+ main loop. Otherwise, check the second doubleword of the pair first. */
+
+ bne cr0, L(loop)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r7,16(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ cmpb r6,r7,r4
+ cmpb r7,r7,r0
+ or r12,r10,r11
+ or r5,r6,r7
+ or r5,r12,r5
+ cmpdi cr7,r5,0
+ beq cr7,L(vector)
+
+ /* OK, one (or both) of the doublewords contains a c/null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c/null byte. */
+ cmpdi cr6,r12,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The c/null byte must be in the second doubleword. Adjust the
+ address again and move the result of cmpb to r10 so we can calculate
+ the pointer. */
+
+ mr r10,r6
+ mr r11,r7
+ addi r8,r8,8
+
+ /* r10/r11 have the output of the cmpb instructions, that is,
+ 0xff in the same position as the c/null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+
+L(done):
+ /* If r11 is non-zero a null was found: locate its first 0xff byte and
+ clear r10 from that position on, discarding matches past the null. */
+ cmpdi cr7,r11,0
+ beq cr7,L(no_null)
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r11,-1
+ andc r3,r3,r11
+ popcntd r0,r3
+#else
+ cntlzd r0,r11
+#endif
+ subfic r0,r0,63
+ li r6,-1
+#ifdef __LITTLE_ENDIAN__
+ srd r0,r6,r0
+#else
+ sld r0,r6,r0
+#endif
+ and r10,r0,r10
+L(no_null):
+#ifdef __LITTLE_ENDIAN__
+ cntlzd r0,r10 /* Count leading zeros before c matches. */
+ addi r3,r10,-1
+ andc r3,r3,r10
+ addi r10,r11,-1
+ andc r10,r10,r11
+ cmpld cr7,r3,r10
+ bgt cr7,L(no_match)
+#else
+ addi r3,r10,-1 /* Count trailing zeros before c matches. */
+ andc r3,r3,r10
+ popcntd r0,r3
+ cmpld cr7,r11,r10
+ bgt cr7,L(no_match)
+#endif
+ srdi r0,r0,3 /* Convert the zero-bit count to bytes. */
+ subfic r0,r0,7
+ add r9,r8,r0 /* Return address of the matching c byte
+ or null in case c was not found. */
+ li r0,0
+ cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */
+ beq cr7,L(align)
+
+ .align 4
+L(no_match):
+ mr r3,r9
+ blr
+
+/* Check the first 32B in GPR's and move to vectorized loop. */
+ .p2align 5
+L(vector):
+ addi r3, r8, 8
+ /* Make sure 32B aligned. */
+ andi. r10, r3, 31
+ bne cr0, L(loop)
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ MTVRD(v1, r4)
+ li r5, 16
+ vspltb v1, v1, 7
+ /* Compare 32 bytes in each loop. */
+L(continue):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vcmpequb v6, v1, v4
+ vcmpequb v7, v1, v5
+ vor v8, v2, v3
+ vor v9, v6, v7
+ vor v11, v8, v9
+ vcmpequb. v11, v0, v11
+ addi r3, r3, 32
+ blt cr6, L(continue)
+ vcmpequb. v8, v0, v8
+ blt cr6, L(match)
+
+ /* One (or both) of the quadwords contains c/null. */
+ vspltisb v8, 2
+ vspltisb v9, 5
+ /* Precompute values used for comparison. */
+ vsl v9, v8, v9 /* v9 = 0x4040404040404040. */
+ vaddubm v8, v9, v9
+ vsldoi v8, v0, v8, 1 /* v8 = 0x80. */
+
+ /* Check if null is in second qw. */
+ vcmpequb. v11, v0, v2
+ blt cr6, L(secondqw)
+
+ /* Null found in first qw. */
+ addi r8, r3, -32
+ /* Calculate the null position. */
+ FIND_NULL_POS(v2)
+ /* Check if null is in the first byte. */
+ vcmpequb. v11, v0, v2
+ blt cr6, L(no_match)
+ vsububm v2, v8, v2
+ /* Mask unwanted bytes after null. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v6, v6, v2
+ vsro v6, v6, v2
+#else
+ vsro v6, v6, v2
+ vslo v6, v6, v2
+#endif
+ vcmpequb. v11, v0, v6
+ blt cr6, L(no_match)
+ /* Found a match before null. */
+ CALCULATE_MATCH()
+ add r3, r8, r6
+ blr
+
+L(secondqw):
+ addi r8, r3, -16
+ FIND_NULL_POS(v3)
+ vcmpequb. v11, v0, v2
+ blt cr6, L(no_match1)
+ vsububm v2, v8, v2
+ /* Mask unwanted bytes after null. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v7, v7, v2
+ vsro v7, v7, v2
+#else
+ vsro v7, v7, v2
+ vslo v7, v7, v2
+#endif
+ vcmpequb. v11, v0, v7
+ blt cr6, L(no_match1)
+ addi r8, r8, 16
+ vor v6, v0, v7
+L(no_match1):
+ addi r8, r8, -16
+ vcmpequb. v11, v0, v6
+ blt cr6, L(no_match)
+ /* Found a match before null. */
+ CALCULATE_MATCH()
+ add r3, r8, r6
+ blr
+
+L(match):
+ /* One (or both) of the quadwords contains a match. */
+ mr r8, r3
+ vcmpequb. v8, v0, v7
+ blt cr6, L(firstqw)
+ /* Match found in second qw. */
+ addi r8, r8, 16
+ vor v6, v0, v7
+L(firstqw):
+ addi r8, r8, -32
+ CALCULATE_MATCH()
+ add r9, r8, r6 /* Save address of the last match found. */
+ b L(continue)
+/* We are here because strrchr was called with a null byte. */
+ .align 4
+L(null_match):
+ /* r0 has a doubleword of null bytes. */
+
+ cmpb r5,r12,r0 /* Compare each byte against null bytes. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r5,r5,r6
+ sld r5,r5,r6
+#else
+ sld r5,r5,r6
+ srd r5,r5,r6
+#endif
+ cmpdi cr7,r5,0 /* If r5 == 0, no null bytes
+ have been found. */
+ bne cr7,L(done_null)
+
+ andi. r12, r8, 15
+
+ /* If r8 is not quadword aligned, the next doubleword is: skip to the
+ main loop. Otherwise, check the second doubleword of the pair first. */
+
+ bne cr0, L(loop_null)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r5,r12,r0
+ cmpdi cr7,r5,0
+ bne cr7,L(done_null)
+ b L(loop_null) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+ .p2align 5
+L(loop_null):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r5,r12,r0
+ cmpb r10,r11,r0
+ or r6,r5,r10
+ cmpdi cr7,r6,0
+ beq cr7,L(vector1)
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done_null)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ pointer. */
+
+ mr r5,r10
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntd r0,r0
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+ srdi r0,r0,3 /* Convert the zero-bit count to a byte offset. */
+ add r3,r8,r0 /* Return address of the matching null byte. */
+ blr
+/* Check the first 32B in GPR's and move to vectorized loop. */
+ .p2align 5
+L(vector1):
+ addi r3, r8, 8
+ /* Make sure 32B aligned. */
+ andi. r10, r3, 31
+ bne cr0, L(loop_null)
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ li r5, 16
+ /* Compare 32 bytes in each loop. */
+L(continue1):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vor v8, v2, v3
+ vcmpequb. v11, v0, v8
+ addi r3, r3, 32
+ blt cr6, L(continue1)
+ addi r3, r3, -32
+ VBPERMQ(v2, v2, v10)
+ VBPERMQ(v3, v3, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v3, v3, v3, 2
+#else
+ vsldoi v2, v2, v2, 6
+ vsldoi v3, v3, v3, 4
+#endif
+ /* Merge the results and move to a GPR. */
+ vor v4, v3, v2
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r6, r5, -1
+ andc r6, r6, r5
+ popcntd r6, r6
+#else
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#endif
+ add r3, r3, r6 /* Return address of the null byte. */
+ blr
+END (strrchr)
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
--
2.7.4
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-02-14 11:06 [PATCH] powerpc64: strrchr optimization for power8 Rajalakshmi Srinivasaraghavan
@ 2017-02-20 13:10 ` Gabriel F. T. Gomes
2017-02-20 13:43 ` Carlos O'Donell
2017-02-21 17:02 ` Carlos Eduardo Seo
2 siblings, 0 replies; 18+ messages in thread
From: Gabriel F. T. Gomes @ 2017-02-20 13:10 UTC (permalink / raw)
To: Rajalakshmi Srinivasaraghavan; +Cc: libc-alpha
LGTM.
On Tue, 14 Feb 2017 16:35:21 +0530
Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> wrote:
> Changes from previous version [1]
>
> - Comments correction and alignment changes.
>
> --
> P7 code is used for <=32B strings and for > 32B vectorized loops are used.
> This shows as an average 25% improvement depending on the position of search
> character. The performance is same for shorter strings.
> Tested on ppc64 and ppc64le.
>
> 2017-02-14 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
>
> * sysdeps/powerpc/powerpc64/multiarch/Makefile
> (sysdep_routines): Add strrchr-power8.
> * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> (strrchr): Add __strrchr_power8 to list of strrchr functions.
> * sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S: New file.
> * sysdeps/powerpc/powerpc64/multiarch/strrchr.c
> (strrchr): Add __strrchr_power8 to ifunc list.
> * sysdeps/powerpc/powerpc64/power8/strrchr.S: New file.
> ---
> sysdeps/powerpc/powerpc64/multiarch/Makefile | 3 +-
> .../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 3 +
> .../powerpc/powerpc64/multiarch/strrchr-power8.S | 39 ++
> sysdeps/powerpc/powerpc64/multiarch/strrchr.c | 3 +
> sysdeps/powerpc/powerpc64/power8/strrchr.S | 464 +++++++++++++++++++++
> 5 files changed, 511 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
> create mode 100644 sysdeps/powerpc/powerpc64/power8/strrchr.S
>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index f5889a3..0fc0ebc 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -14,7 +14,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
> strchrnul-power8 strchrnul-power7 strchrnul-ppc64 \
> strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \
> stpcpy-power7 stpcpy-ppc64 \
> - strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
> + strrchr-power8 strrchr-power7 strrchr-ppc64 \
> + strncat-power7 strncat-ppc64 \
> strncpy-power7 strncpy-ppc64 \
> stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
> strcmp-power9 strcmp-power8 strcmp-power7 strcmp-ppc64 \
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index 209aec5..d77c47f 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -281,6 +281,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> /* Support sysdeps/powerpc/powerpc64/multiarch/strrchr.c. */
> IFUNC_IMPL (i, name, strrchr,
> IFUNC_IMPL_ADD (array, i, strrchr,
> + hwcap2 & PPC_FEATURE2_ARCH_2_07,
> + __strrchr_power8)
> + IFUNC_IMPL_ADD (array, i, strrchr,
> hwcap & PPC_FEATURE_HAS_VSX,
> __strrchr_power7)
> IFUNC_IMPL_ADD (array, i, strrchr, 1,
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
> new file mode 100644
> index 0000000..23365a1
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
> @@ -0,0 +1,39 @@
> +/* Optimized strrchr implementation for POWER8.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +#undef ENTRY
> +#define ENTRY(name) \
> + .section ".text"; \
> + ENTRY_2(__strrchr_power8) \
> + .align ALIGNARG(2); \
> + BODY_LABEL(__strrchr_power8): \
> + cfi_startproc; \
> + LOCALENTRY(__strrchr_power8)
> +
> +#undef END
> +#define END(name) \
> + cfi_endproc; \
> + TRACEBACK(__strrchr_power8) \
> + END_2(__strrchr_power8)
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/power8/strrchr.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
> index dc1d3d0..0f94c9d 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
> @@ -25,11 +25,14 @@
>
> extern __typeof (strrchr) __strrchr_ppc attribute_hidden;
> extern __typeof (strrchr) __strrchr_power7 attribute_hidden;
> +extern __typeof (strrchr) __strrchr_power8 attribute_hidden;
> #undef strrchr
>
> /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> ifunc symbol properly. */
> libc_ifunc_redirected (__redirect_strrchr, strrchr,
> + (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> + ? __strrchr_power8 :
> (hwcap & PPC_FEATURE_HAS_VSX)
> ? __strrchr_power7
> : __strrchr_ppc);
> diff --git a/sysdeps/powerpc/powerpc64/power8/strrchr.S b/sysdeps/powerpc/powerpc64/power8/strrchr.S
> new file mode 100644
> index 0000000..8eb7485
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power8/strrchr.S
> @@ -0,0 +1,464 @@
> +/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +/* char *[r3] strrchr (char *s [r3], int c [r4]) */
> +/* TODO: change these to the actual instructions when the minimum required
> + binutils allows it. */
> +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
> +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
> +#define VBPERMQ(t,a,b) .long (0x1000054c \
> + | ((t)<<(32-11)) \
> + | ((a)<<(32-16)) \
> + | ((b)<<(32-21)) )
> +#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
> +#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
> +#define VADDUQM(t,a,b) .long (0x10000100 \
> + | ((t)<<(32-11)) \
> + | ((a)<<(32-16)) \
> + | ((b)<<(32-21)) )
> +#ifdef __LITTLE_ENDIAN__
> +/* Find the match position from v6 and place result in r6. */
> +# define CALCULATE_MATCH() \
> + VBPERMQ(v6, v6, v10); \
> + vsldoi v6, v6, v6, 6; \
> + MFVRD(r7, v6); \
> + cntlzd r6, r7; \
> + subfic r6, r6, 15;
> +/*
> + * Find the first null position to mask bytes after null.
> + * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
> + * Result placed at v2.
> + */
> +# define FIND_NULL_POS(reg) \
> + vspltisb v11, -1; \
> + VADDUQM(v11, reg, v11); \
> + vandc v11, v11, reg; \
> + VPOPCNTD(v2, v11); \
> + vspltb v11, v2, 15; \
> + vcmpequb. v11, v11, v9; \
> + blt cr6, 1f; \
> + vsldoi v9, v0, v9, 1; \
> + vslo v2, v2, v9; \
> +1: \
> + vsumsws v2, v2, v0;
> +#else
> +# define CALCULATE_MATCH() \
> + VBPERMQ(v6, v6, v10); \
> + MFVRD(r7, v6); \
> + addi r6, r7, -1; \
> + andc r6, r6, r7; \
> + popcntd r6, r6; \
> + subfic r6, r6, 15;
> +# define FIND_NULL_POS(reg) \
> + VCLZD(v2, reg); \
> + vspltb v11, v2, 7; \
> + vcmpequb. v11, v11, v9; \
> + blt cr6, 1f; \
> + vsldoi v9, v0, v9, 1; \
> + vsro v2, v2, v9; \
> +1: \
> + vsumsws v2, v2, v0;
> +#endif /* !__LITTLE_ENDIAN__ */
> + .machine power7
> +ENTRY (strrchr)
> + CALL_MCOUNT 2
> + dcbt 0,r3
> + clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
> + cmpdi cr7,r4,0
> + ld r12,0(r8) /* Load doubleword from memory. */
> + li r9,0 /* Used to store last occurence. */
> + li r0,0 /* Doubleword with null chars to use
> + with cmpb. */
> +
> + rlwinm r6,r3,3,26,28 /* Calculate padding. */
> +
> + beq cr7,L(null_match)
> +
> + /* Replicate byte to doubleword. */
> + insrdi r4,r4,8,48
> + insrdi r4,r4,16,32
> + insrdi r4,r4,32,0
> +
> + /* r4 is changed now. If it's passed more chars, then
> + check for null again. */
> + cmpdi cr7,r4,0
> + beq cr7,L(null_match)
> + /* Now r4 has a doubleword of c bytes and r0 has
> + a doubleword of null bytes. */
> +
> + cmpb r10,r12,r4 /* Compare each byte against c byte. */
> + cmpb r11,r12,r0 /* Compare each byte against null byte. */
> +
> + /* Move the doublewords left and right to discard the bits that are
> + not part of the string and bring them back as zeros. */
> +#ifdef __LITTLE_ENDIAN__
> + srd r10,r10,r6
> + srd r11,r11,r6
> + sld r10,r10,r6
> + sld r11,r11,r6
> +#else
> + sld r10,r10,r6
> + sld r11,r11,r6
> + srd r10,r10,r6
> + srd r11,r11,r6
> +#endif
> + or r5,r10,r11 /* OR the results to speed things up. */
> + cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
> + have been found. */
> + bne cr7,L(done)
> +
> +L(align):
> + andi. r12, r8, 15
> +
> + /* Are we now aligned to a doubleword boundary? If so, skip to
> + the main loop. Otherwise, go through the alignment code. */
> +
> + bne cr0, L(loop)
> +
> + /* Handle WORD2 of pair. */
> + ldu r12,8(r8)
> + cmpb r10,r12,r4
> + cmpb r11,r12,r0
> + or r5,r10,r11
> + cmpdi cr7,r5,0
> + bne cr7,L(done)
> + b L(loop) /* We branch here (rather than falling through)
> + to skip the nops due to heavy alignment
> + of the loop below. */
> + .p2align 5
> +L(loop):
> + /* Load two doublewords, compare and merge in a
> + single register for speed. This is an attempt
> + to speed up the null-checking process for bigger strings. */
> + ld r12,8(r8)
> + ldu r7,16(r8)
> + cmpb r10,r12,r4
> + cmpb r11,r12,r0
> + cmpb r6,r7,r4
> + cmpb r7,r7,r0
> + or r12,r10,r11
> + or r5,r6,r7
> + or r5,r12,r5
> + cmpdi cr7,r5,0
> + beq cr7,L(vector)
> +
> + /* OK, one (or both) of the doublewords contains a c/null byte. Check
> + the first doubleword and decrement the address in case the first
> + doubleword really contains a c/null byte. */
> + cmpdi cr6,r12,0
> + addi r8,r8,-8
> + bne cr6,L(done)
> +
> + /* The c/null byte must be in the second doubleword. Adjust the
> + address again and move the result of cmpb to r10 so we can calculate
> + the pointer. */
> +
> + mr r10,r6
> + mr r11,r7
> + addi r8,r8,8
> +
> + /* r10/r11 have the output of the cmpb instructions, that is,
> + 0xff in the same position as the c/null byte in the original
> + doubleword from the string. Use that to calculate the pointer. */
> +
> +L(done):
> + /* If there are more than one 0xff in r11, find the first position of
> + 0xff in r11 and fill r10 with 0 from that position. */
> + cmpdi cr7,r11,0
> + beq cr7,L(no_null)
> +#ifdef __LITTLE_ENDIAN__
> + addi r3,r11,-1
> + andc r3,r3,r11
> + popcntd r0,r3
> +#else
> + cntlzd r0,r11
> +#endif
> + subfic r0,r0,63
> + li r6,-1
> +#ifdef __LITTLE_ENDIAN__
> + srd r0,r6,r0
> +#else
> + sld r0,r6,r0
> +#endif
> + and r10,r0,r10
> +L(no_null):
> +#ifdef __LITTLE_ENDIAN__
> + cntlzd r0,r10 /* Count leading zeros before c matches. */
> + addi r3,r10,-1
> + andc r3,r3,r10
> + addi r10,r11,-1
> + andc r10,r10,r11
> + cmpld cr7,r3,r10
> + bgt cr7,L(no_match)
> +#else
> + addi r3,r10,-1 /* Count trailing zeros before c matches. */
> + andc r3,r3,r10
> + popcntd r0,r3
> + cmpld cr7,r11,r10
> + bgt cr7,L(no_match)
> +#endif
> + srdi r0,r0,3 /* Convert trailing zeros to bytes. */
> + subfic r0,r0,7
> + add r9,r8,r0 /* Return address of the matching c byte
> + or null in case c was not found. */
> + li r0,0
> + cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */
> + beq cr7,L(align)
> +
> + .align 4
> +L(no_match):
> + mr r3,r9
> + blr
> +
> +/* Check the first 32B in GPR's and move to vectorized loop. */
> + .p2align 5
> +L(vector):
> + addi r3, r8, 8
> + /* Make sure 32B aligned. */
> + andi. r10, r3, 31
> + bne cr0, L(loop)
> + vspltisb v0, 0
> + /* Precompute vbpermq constant. */
> + vspltisb v10, 3
> + lvsl v11, r0, r0
> + vslb v10, v11, v10
> + MTVRD(v1, r4)
> + li r5, 16
> + vspltb v1, v1, 7
> + /* Compare 32 bytes in each loop. */
> +L(continue):
> + lvx v4, 0, r3
> + lvx v5, r3, r5
> + vcmpequb v2, v0, v4
> + vcmpequb v3, v0, v5
> + vcmpequb v6, v1, v4
> + vcmpequb v7, v1, v5
> + vor v8, v2, v3
> + vor v9, v6, v7
> + vor v11, v8, v9
> + vcmpequb. v11, v0, v11
> + addi r3, r3, 32
> + blt cr6, L(continue)
> + vcmpequb. v8, v0, v8
> + blt cr6, L(match)
> +
> + /* One (or both) of the quadwords contains c/null. */
> + vspltisb v8, 2
> + vspltisb v9, 5
> + /* Precompute values used for comparison. */
> + vsl v9, v8, v9 /* v9 = 0x4040404040404040. */
> + vaddubm v8, v9, v9
> + vsldoi v8, v0, v8, 1 /* v8 = 0x80. */
> +
> + /* Check if null is in second qw. */
> + vcmpequb. v11, v0, v2
> + blt cr6, L(secondqw)
> +
> + /* Null found in first qw. */
> + addi r8, r3, -32
> + /* Calculate the null position. */
> + FIND_NULL_POS(v2)
> + /* Check if null is in the first byte. */
> + vcmpequb. v11, v0, v2
> + blt cr6, L(no_match)
> + vsububm v2, v8, v2
> + /* Mask unwanted bytes after null. */
> +#ifdef __LITTLE_ENDIAN__
> + vslo v6, v6, v2
> + vsro v6, v6, v2
> +#else
> + vsro v6, v6, v2
> + vslo v6, v6, v2
> +#endif
> + vcmpequb. v11, v0, v6
> + blt cr6, L(no_match)
> + /* Found a match before null. */
> + CALCULATE_MATCH()
> + add r3, r8, r6
> + blr
> +
> +L(secondqw):
> + addi r8, r3, -16
> + FIND_NULL_POS(v3)
> + vcmpequb. v11, v0, v2
> + blt cr6, L(no_match1)
> + vsububm v2, v8, v2
> + /* Mask unwanted bytes after null. */
> +#ifdef __LITTLE_ENDIAN__
> + vslo v7, v7, v2
> + vsro v7, v7, v2
> +#else
> + vsro v7, v7, v2
> + vslo v7, v7, v2
> +#endif
> + vcmpequb. v11, v0, v7
> + blt cr6, L(no_match1)
> + addi r8, r8, 16
> + vor v6, v0, v7
> +L(no_match1):
> + addi r8, r8, -16
> + vcmpequb. v11, v0, v6
> + blt cr6, L(no_match)
> + /* Found a match before null. */
> + CALCULATE_MATCH()
> + add r3, r8, r6
> + blr
> +
> +L(match):
> + /* One (or both) of the quadwords contains a match. */
> + mr r8, r3
> + vcmpequb. v8, v0, v7
> + blt cr6, L(firstqw)
> + /* Match found in second qw. */
> + addi r8, r8, 16
> + vor v6, v0, v7
> +L(firstqw):
> + addi r8, r8, -32
> + CALCULATE_MATCH()
> + add r9, r8, r6 /* Compute final length. */
> + b L(continue)
> +/* We are here because strrchr was called with a null byte. */
> + .align 4
> +L(null_match):
> + /* r0 has a doubleword of null bytes. */
> +
> + cmpb r5,r12,r0 /* Compare each byte against null bytes. */
> +
> + /* Move the doublewords left and right to discard the bits that are
> + not part of the string and bring them back as zeros. */
> +#ifdef __LITTLE_ENDIAN__
> + srd r5,r5,r6
> + sld r5,r5,r6
> +#else
> + sld r5,r5,r6
> + srd r5,r5,r6
> +#endif
> + cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
> + have been found. */
> + bne cr7,L(done_null)
> +
> + andi. r12, r8, 15
> +
> + /* Are we now aligned to a quadword boundary? If so, skip to
> + the main loop. Otherwise, go through the alignment code. */
> +
> + bne cr0, L(loop_null)
> +
> + /* Handle WORD2 of pair. */
> + ldu r12,8(r8)
> + cmpb r5,r12,r0
> + cmpdi cr7,r5,0
> + bne cr7,L(done_null)
> + b L(loop_null) /* We branch here (rather than falling through)
> + to skip the nops due to heavy alignment
> + of the loop below. */
> +
> + /* Main loop to look for the end of the string. Since it's a
> + small loop (< 8 instructions), align it to 32-bytes. */
> + .p2align 5
> +L(loop_null):
> + /* Load two doublewords, compare and merge in a
> + single register for speed. This is an attempt
> + to speed up the null-checking process for bigger strings. */
> + ld r12,8(r8)
> + ldu r11,16(r8)
> + cmpb r5,r12,r0
> + cmpb r10,r11,r0
> + or r6,r5,r10
> + cmpdi cr7,r6,0
> + beq cr7,L(vector1)
> +
> + /* OK, one (or both) of the doublewords contains a null byte. Check
> + the first doubleword and decrement the address in case the first
> + doubleword really contains a null byte. */
> +
> + cmpdi cr6,r5,0
> + addi r8,r8,-8
> + bne cr6,L(done_null)
> +
> + /* The null byte must be in the second doubleword. Adjust the address
> + again and move the result of cmpb to r10 so we can calculate the
> + pointer. */
> +
> + mr r5,r10
> + addi r8,r8,8
> +
> + /* r5 has the output of the cmpb instruction, that is, it contains
> + 0xff in the same position as the null byte in the original
> + doubleword from the string. Use that to calculate the pointer. */
> +L(done_null):
> +#ifdef __LITTLE_ENDIAN__
> + addi r0,r5,-1
> + andc r0,r0,r5
> + popcntd r0,r0
> +#else
> + cntlzd r0,r5 /* Count leading zeros before the match. */
> +#endif
> + srdi r0,r0,3 /* Convert trailing zeros to bytes. */
> + add r3,r8,r0 /* Return address of the matching null byte. */
> + blr
> +/* Check the first 32B in GPR's and move to vectorized loop. */
> + .p2align 5
> +L(vector1):
> + addi r3, r8, 8
> + /* Make sure 32B aligned. */
> + andi. r10, r3, 31
> + bne cr0, L(loop_null)
> + vspltisb v0, 0
> + /* Precompute vbpermq constant. */
> + vspltisb v10, 3
> + lvsl v11, r0, r0
> + vslb v10, v11, v10
> + li r5, 16
> + /* Compare 32 bytes in each loop. */
> +L(continue1):
> + lvx v4, 0, r3
> + lvx v5, r3, r5
> + vcmpequb v2, v0, v4
> + vcmpequb v3, v0, v5
> + vor v8, v2, v3
> + vcmpequb. v11, v0, v8
> + addi r3, r3, 32
> + blt cr6, L(continue1)
> + addi r3, r3, -32
> + VBPERMQ(v2, v2, v10)
> + VBPERMQ(v3, v3, v10)
> + /* Shift each component into its correct position for merging. */
> +#ifdef __LITTLE_ENDIAN__
> + vsldoi v3, v3, v3, 2
> +#else
> + vsldoi v2, v2, v2, 6
> + vsldoi v3, v3, v3, 4
> +#endif
> + /* Merge the results and move to a GPR. */
> + vor v4, v3, v2
> + MFVRD(r5, v4)
> +#ifdef __LITTLE_ENDIAN__
> + addi r6, r5, -1
> + andc r6, r6, r5
> + popcntd r6, r6
> +#else
> + cntlzd r6, r5 /* Count leading zeros before the match. */
> +#endif
> + add r3, r3, r6 /* Compute final length. */
> + blr
> +END (strrchr)
> +weak_alias (strrchr, rindex)
> +libc_hidden_builtin_def (strrchr)
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-02-14 11:06 [PATCH] powerpc64: strrchr optimization for power8 Rajalakshmi Srinivasaraghavan
2017-02-20 13:10 ` Gabriel F. T. Gomes
@ 2017-02-20 13:43 ` Carlos O'Donell
2017-02-20 16:01 ` Rajalakshmi Srinivasaraghavan
2017-02-21 17:02 ` Carlos Eduardo Seo
2 siblings, 1 reply; 18+ messages in thread
From: Carlos O'Donell @ 2017-02-20 13:43 UTC (permalink / raw)
To: Rajalakshmi Srinivasaraghavan, libc-alpha
On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
> P7 code is used for <=32B strings and for > 32B vectorized loops are used.
> This shows as an average 25% improvement depending on the position of search
> character. The performance is same for shorter strings.
> Tested on ppc64 and ppc64le.
What did you use to test the 25% improvement?
--
Cheers,
Carlos.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-02-20 13:43 ` Carlos O'Donell
@ 2017-02-20 16:01 ` Rajalakshmi Srinivasaraghavan
2017-02-20 16:06 ` Carlos O'Donell
0 siblings, 1 reply; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-02-20 16:01 UTC (permalink / raw)
To: libc-alpha
On 02/20/2017 07:12 PM, Carlos O'Donell wrote:
> On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
>> P7 code is used for <=32B strings and for > 32B vectorized loops are used.
>> This shows as an average 25% improvement depending on the position of search
>> character. The performance is same for shorter strings.
>> Tested on ppc64 and ppc64le.
> What did you use to test the 25% improvement?
This improvement is seen when compared to power7. Benchtest is
modified to use length from 0 to 400 to find the average for
different lengths.
--
Thanks
Rajalakshmi S
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-02-20 16:01 ` Rajalakshmi Srinivasaraghavan
@ 2017-02-20 16:06 ` Carlos O'Donell
2017-02-20 16:50 ` Rajalakshmi Srinivasaraghavan
2017-02-28 7:32 ` Rajalakshmi Srinivasaraghavan
0 siblings, 2 replies; 18+ messages in thread
From: Carlos O'Donell @ 2017-02-20 16:06 UTC (permalink / raw)
To: Rajalakshmi Srinivasaraghavan, libc-alpha
On 02/20/2017 11:01 AM, Rajalakshmi Srinivasaraghavan wrote:
>
>
> On 02/20/2017 07:12 PM, Carlos O'Donell wrote:
>> On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
>>> P7 code is used for <=32B strings and for > 32B vectorized loops are used.
>>> This shows as an average 25% improvement depending on the position of search
>>> character. The performance is same for shorter strings.
>>> Tested on ppc64 and ppc64le.
>> What did you use to test the 25% improvement?
>
> This improvement is seen when compared to power7. Benchtest is
> modified to use length from 0 to 400 to find the average for
> different lengths.
Could you post your modifications for review and explain your
process in a little more detail? I'm curious about the changes
you made.
--
Cheers,
Carlos.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-02-20 16:06 ` Carlos O'Donell
@ 2017-02-20 16:50 ` Rajalakshmi Srinivasaraghavan
2017-02-28 7:32 ` Rajalakshmi Srinivasaraghavan
1 sibling, 0 replies; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-02-20 16:50 UTC (permalink / raw)
To: Carlos O'Donell, libc-alpha
[-- Attachment #1: Type: text/plain, Size: 1289 bytes --]
On 02/20/2017 09:36 PM, Carlos O'Donell wrote:
> On 02/20/2017 11:01 AM, Rajalakshmi Srinivasaraghavan wrote:
>>
>> On 02/20/2017 07:12 PM, Carlos O'Donell wrote:
>>> On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
>>>> P7 code is used for <=32B strings and for > 32B vectorized loops are used.
>>>> This shows as an average 25% improvement depending on the position of search
>>>> character. The performance is same for shorter strings.
>>>> Tested on ppc64 and ppc64le.
>>> What did you use to test the 25% improvement?
>> This improvement is seen when compared to power7. Benchtest is
>> modified to use length from 0 to 400 to find the average for
>> different lengths.
> Could you post your modifications for review and explain your
> process in a little more detail? I'm curious about the changes
> you made.
I modified benchtests/bench-strrchr.c to measure only the following loop
and commented out the existing 'for' loops.
for (i = 0; i < 400; ++i)
{
do_test (0, i, i + 1, 0, SMALL_CHAR);
do_test (i, i, i + 1, 0, BIG_CHAR);
}
Then the generated benchtests output was copied to a spreadsheet to
calculate the improvement.
Attached is the benchtests result for ppc64le.
Note: The numbers vary slightly from run to run.
--
Thanks
Rajalakshmi S
[-- Attachment #2: bench-result --]
[-- Type: text/plain, Size: 53538 bytes --]
simple_strrchr __strrchr_power8 __strrchr_power7 __strrchr_ppc
Length 1, alignment in bytes 0: 2.34375 3.04688 2.9375 4.73438
Length 1, alignment in bytes 0: 2.39062 2.9375 2.875 4.45312
Length 2, alignment in bytes 0: 3.10938 2.8125 2.84375 4.67188
Length 2, alignment in bytes 1: 2.96875 2.89062 2.89062 4.5625
Length 3, alignment in bytes 0: 4.57812 2.85938 2.85938 4.57812
Length 3, alignment in bytes 2: 4.20312 2.9375 2.82812 4.625
Length 4, alignment in bytes 0: 4.46875 2.92188 2.85938 4.65625
Length 4, alignment in bytes 3: 4.15625 2.98438 2.84375 4.57812
Length 5, alignment in bytes 0: 4.98438 2.85938 2.85938 4.57812
Length 5, alignment in bytes 4: 4.71875 2.875 2.98438 5.23438
Length 6, alignment in bytes 0: 5.53125 2.98438 2.79688 4.625
Length 6, alignment in bytes 5: 5.34375 2.8125 2.82812 4.98438
Length 7, alignment in bytes 0: 6.28125 2.92188 2.875 4.625
Length 7, alignment in bytes 6: 6.03125 2.79688 2.78125 4.95312
Length 8, alignment in bytes 0: 7.09375 2.89062 2.8125 4.57812
Length 8, alignment in bytes 7: 6.89062 2.79688 2.78125 4.78125
Length 9, alignment in bytes 0: 7.5625 2.84375 2.78125 4.875
Length 9, alignment in bytes 0: 7.45312 2.73438 2.71875 4.89062
Length 10, alignment in bytes 0: 8.60938 2.78125 2.71875 4.82812
Length 10, alignment in bytes 1: 8.40625 2.71875 2.76562 4.9375
Length 11, alignment in bytes 0: 10.2812 2.71875 2.71875 4.9375
Length 11, alignment in bytes 2: 10.2969 2.70312 2.70312 4.82812
Length 12, alignment in bytes 0: 10.875 2.76562 2.70312 4.85938
Length 12, alignment in bytes 3: 10.8281 2.73438 2.71875 4.89062
Length 13, alignment in bytes 0: 11.4375 2.73438 2.73438 4.76562
Length 13, alignment in bytes 4: 11.4219 3.25 3.3125 5.57812
Length 14, alignment in bytes 0: 11.9375 2.73438 2.70312 4.76562
Length 14, alignment in bytes 5: 12 3.23438 3.1875 5.45312
Length 15, alignment in bytes 0: 12.4844 2.79688 2.6875 4.875
Length 15, alignment in bytes 6: 12.5312 3.26562 3.15625 5.39062
Length 16, alignment in bytes 0: 13.0938 2.75 2.73438 4.90625
Length 16, alignment in bytes 7: 13.0469 3.1875 3.15625 5.35938
Length 17, alignment in bytes 0: 13.9219 3.25 3.1875 5.42188
Length 17, alignment in bytes 0: 13.9219 3.20312 3.14062 5.4375
Length 18, alignment in bytes 0: 14.3125 3.21875 3.23438 5.45312
Length 18, alignment in bytes 1: 14.3594 3.34375 3.25 5.4375
Length 19, alignment in bytes 0: 14.8594 3.15625 3.10938 5.53125
Length 19, alignment in bytes 2: 14.8594 3.1875 3.14062 5.42188
Length 20, alignment in bytes 0: 15.4844 3.23438 3.15625 5.46875
Length 20, alignment in bytes 3: 15.4062 3.1875 3.14062 5.48438
Length 21, alignment in bytes 0: 16.1406 3.20312 3.15625 5.45312
Length 21, alignment in bytes 4: 16.1562 3.34375 3.4375 6.5
Length 22, alignment in bytes 0: 17.0781 3.23438 3.28125 5.65625
Length 22, alignment in bytes 5: 17.0938 3.14062 3.20312 6.20312
Length 23, alignment in bytes 0: 17.8281 3.20312 3.20312 5.57812
Length 23, alignment in bytes 6: 17.7812 3.07812 3.15625 6.28125
Length 24, alignment in bytes 0: 18.3281 3.21875 3.15625 5.51562
Length 24, alignment in bytes 7: 18.3906 3.10938 3.03125 6.14062
Length 25, alignment in bytes 0: 18.9062 3.09375 3.0625 6.14062
Length 25, alignment in bytes 0: 18.875 3.14062 3 5.98438
Length 26, alignment in bytes 0: 19.5 3.14062 3.07812 6.20312
Length 26, alignment in bytes 1: 19.4688 3.07812 3.03125 6.07812
Length 27, alignment in bytes 0: 19.9844 3.0625 3.04688 6.10938
Length 27, alignment in bytes 2: 20.0469 3.10938 3.09375 5.95312
Length 28, alignment in bytes 0: 20.5625 3.0625 3.01562 6.01562
Length 28, alignment in bytes 3: 20.5156 3.03125 3.04688 6.0625
Length 29, alignment in bytes 0: 21.1094 3.04688 3.03125 6
Length 29, alignment in bytes 4: 21.1562 4.07812 4.40625 7.04688
Length 30, alignment in bytes 0: 21.6562 3.15625 3.07812 6.09375
Length 30, alignment in bytes 5: 21.6719 3.76562 3.89062 6.73438
Length 31, alignment in bytes 0: 22.25 3.09375 3.0625 6.20312
Length 31, alignment in bytes 6: 22.25 3.73438 3.85938 6.79688
Length 32, alignment in bytes 0: 22.7344 3.04688 3.10938 6.23438
Length 32, alignment in bytes 7: 22.7812 3.625 3.75 6.64062
Length 33, alignment in bytes 0: 23.4062 3.73438 3.85938 6.70312
Length 33, alignment in bytes 0: 23.3438 3.64062 3.85938 6.60938
Length 34, alignment in bytes 0: 23.8438 3.64062 3.70312 6.82812
Length 34, alignment in bytes 1: 23.9219 3.67188 3.71875 6.75
Length 35, alignment in bytes 0: 24.5 3.60938 3.67188 6.54688
Length 35, alignment in bytes 2: 24.4375 3.73438 3.73438 6.67188
Length 36, alignment in bytes 0: 24.9688 3.57812 3.73438 6.625
Length 36, alignment in bytes 3: 25.0312 3.60938 3.73438 6.70312
Length 37, alignment in bytes 0: 25.6094 3.67188 3.73438 6.59375
Length 37, alignment in bytes 4: 25.5312 3.65625 4.15625 7.60938
Length 38, alignment in bytes 0: 26.0625 3.78125 3.75 6.89062
Length 38, alignment in bytes 5: 26.1562 3.59375 3.875 7.35938
Length 39, alignment in bytes 0: 26.7188 3.70312 3.71875 6.95312
Length 39, alignment in bytes 6: 26.6562 3.8125 3.625 7.5625
Length 40, alignment in bytes 0: 27.1562 3.70312 3.70312 6.78125
Length 40, alignment in bytes 7: 27.2188 3.57812 3.625 7.29688
Length 41, alignment in bytes 0: 27.8594 3.65625 3.60938 7.35938
Length 41, alignment in bytes 0: 27.7812 3.625 3.42188 7.3125
Length 42, alignment in bytes 0: 28.2812 3.75 3.375 7.23438
Length 42, alignment in bytes 1: 28.3125 3.6875 3.46875 7.1875
Length 43, alignment in bytes 0: 28.9375 3.67188 3.45312 7.23438
Length 43, alignment in bytes 2: 28.9219 3.73438 3.54688 7.26562
Length 44, alignment in bytes 0: 29.4219 3.65625 3.48438 7.29688
Length 44, alignment in bytes 3: 29.375 3.64062 3.46875 7.3125
Length 45, alignment in bytes 0: 30.0312 3.73438 3.48438 7.28125
Length 45, alignment in bytes 4: 30.0781 3.64062 4.39062 8.26562
Length 46, alignment in bytes 0: 30.5625 3.64062 3.6875 7.54688
Length 46, alignment in bytes 5: 30.5 3.70312 4.3125 8.125
Length 47, alignment in bytes 0: 31.0625 3.73438 3.51562 7.39062
Length 47, alignment in bytes 6: 31.1562 3.625 4.17188 8.09375
Length 48, alignment in bytes 0: 31.7344 3.73438 3.5 7.39062
Length 48, alignment in bytes 7: 31.6875 3.78125 4.17188 8.23438
Length 49, alignment in bytes 0: 32.1719 3.60938 4.17188 7.95312
Length 49, alignment in bytes 0: 32.1562 3.70312 4.17188 8.03125
Length 50, alignment in bytes 0: 32.75 3.67188 4.0625 7.82812
Length 50, alignment in bytes 1: 32.8438 3.6875 4.0625 8.03125
Length 51, alignment in bytes 0: 33.3906 3.625 4.14062 7.95312
Length 51, alignment in bytes 2: 33.3125 3.65625 4.21875 7.9375
Length 52, alignment in bytes 0: 33.8281 3.64062 4.25 7.92188
Length 52, alignment in bytes 3: 33.8281 3.70312 4.10938 7.89062
Length 53, alignment in bytes 0: 34.4375 3.73438 4.28125 7.89062
Length 53, alignment in bytes 4: 34.5 3.67188 4.28125 8.6875
Length 54, alignment in bytes 0: 35.0312 3.67188 4.10938 8.03125
Length 54, alignment in bytes 5: 35 3.65625 4.09375 8.54688
Length 55, alignment in bytes 0: 35.5938 3.65625 4.15625 8.15625
Length 55, alignment in bytes 6: 35.625 3.78125 3.9375 8.54688
Length 56, alignment in bytes 0: 36.125 3.65625 4.3125 8.10938
Length 56, alignment in bytes 7: 36.0781 3.625 3.90625 8.5625
Length 57, alignment in bytes 0: 36.5781 3.57812 4 8.46875
Length 57, alignment in bytes 0: 36.5938 3.65625 3.89062 8.39062
Length 58, alignment in bytes 0: 37.2188 3.76562 3.96875 8.4375
Length 58, alignment in bytes 1: 37.2812 3.625 3.90625 8.53125
Length 59, alignment in bytes 0: 37.8594 3.73438 3.82812 8.53125
Length 59, alignment in bytes 2: 37.8125 3.67188 3.96875 8.40625
Length 60, alignment in bytes 0: 38.2969 3.6875 3.89062 8.42188
Length 60, alignment in bytes 3: 38.2656 3.70312 3.90625 8.5
Length 61, alignment in bytes 0: 38.8125 3.73438 3.9375 8.60938
Length 61, alignment in bytes 4: 38.8594 4.6875 5.14062 9.28125
Length 62, alignment in bytes 0: 39.4531 3.78125 3.89062 8.64062
Length 62, alignment in bytes 5: 39.4844 4.78125 4.71875 9.09375
Length 63, alignment in bytes 0: 40.0938 3.67188 3.875 8.53125
Length 63, alignment in bytes 6: 40.0469 4.29688 4.64062 9.17188
Length 64, alignment in bytes 0: 40.5625 3.84375 3.875 8.59375
Length 64, alignment in bytes 7: 40.5156 4.26562 4.57812 9.29688
Length 65, alignment in bytes 0: 41.0156 4.1875 4.57812 9.01562
Length 65, alignment in bytes 0: 41 4.23438 4.54688 9.125
Length 66, alignment in bytes 0: 41.6094 4.29688 4.79688 9.14062
Length 66, alignment in bytes 1: 41.625 4.32812 4.60938 8.96875
Length 67, alignment in bytes 0: 42.2344 4.42188 4.79688 9
Length 67, alignment in bytes 2: 42.2656 4.54688 4.65625 8.98438
Length 68, alignment in bytes 0: 42.8438 4.57812 4.5625 9.03125
Length 68, alignment in bytes 3: 42.875 4.4375 4.5625 8.98438
Length 69, alignment in bytes 0: 43.375 4.46875 4.75 9.09375
Length 69, alignment in bytes 4: 43.3281 4.375 4.51562 10.0312
Length 70, alignment in bytes 0: 43.8594 4.42188 4.82812 9.125
Length 70, alignment in bytes 5: 43.8125 4.5 4.48438 9.85938
Length 71, alignment in bytes 0: 44.3438 4.46875 4.625 9.17188
Length 71, alignment in bytes 6: 44.3438 4.25 4.35938 9.8125
Length 72, alignment in bytes 0: 44.9062 4.23438 4.67188 9.14062
Length 72, alignment in bytes 7: 44.9375 4.1875 4.39062 9.57812
Length 73, alignment in bytes 0: 45.5 4.42188 4.3125 9.78125
Length 73, alignment in bytes 0: 45.5312 4.29688 4.3125 9.75
Length 74, alignment in bytes 0: 46.1094 4.40625 4.34375 9.70312
Length 74, alignment in bytes 1: 46.1562 4.39062 4.375 9.70312
Length 75, alignment in bytes 0: 46.7188 4.21875 4.40625 9.6875
Length 75, alignment in bytes 2: 46.75 4.25 4.35938 9.70312
Length 76, alignment in bytes 0: 47.3125 4.17188 4.28125 9.70312
Length 76, alignment in bytes 3: 47.2969 4.4375 4.35938 9.90625
Length 77, alignment in bytes 0: 47.8281 4.51562 4.34375 9.65625
Length 77, alignment in bytes 4: 47.7969 4.54688 5.625 10.3281
Length 78, alignment in bytes 0: 48.3594 4.39062 4.32812 10.0156
Length 78, alignment in bytes 5: 48.3438 4.35938 5.23438 10.4062
Length 79, alignment in bytes 0: 48.8906 4.4375 4.32812 9.79688
Length 79, alignment in bytes 6: 48.875 4.51562 5.20312 10.4531
Length 80, alignment in bytes 0: 49.4219 4.32812 4.34375 9.84375
Length 80, alignment in bytes 7: 49.4219 4.29688 5.20312 10.4531
Length 81, alignment in bytes 0: 49.9688 4.21875 5.14062 10.2656
Length 81, alignment in bytes 0: 49.9375 4.21875 5.15625 10.3438
Length 82, alignment in bytes 0: 50.4844 4.28125 5.0625 10.3594
Length 82, alignment in bytes 1: 50.4844 4.25 5.04688 10.3438
Length 83, alignment in bytes 0: 51.0625 4.26562 5.10938 10.3594
Length 83, alignment in bytes 2: 51.0312 4.20312 4.98438 10.3594
Length 84, alignment in bytes 0: 51.5781 4.25 5.0625 10.4844
Length 84, alignment in bytes 3: 51.5938 4.26562 5.125 10.3438
Length 85, alignment in bytes 0: 52.1562 4.23438 4.96875 10.3281
Length 85, alignment in bytes 4: 52.1562 4.5 5.07812 11.0781
Length 86, alignment in bytes 0: 52.7344 4.48438 5.07812 10.4375
Length 86, alignment in bytes 5: 52.75 4.35938 4.89062 11.0312
Length 87, alignment in bytes 0: 53.2969 4.40625 5.10938 10.6094
Length 87, alignment in bytes 6: 53.3281 4.4375 4.84375 11.1094
Length 88, alignment in bytes 0: 53.875 4.45312 5.0625 10.5469
Length 88, alignment in bytes 7: 53.8906 4.35938 4.71875 11.0781
Length 89, alignment in bytes 0: 54.4688 4.21875 4.76562 11
Length 89, alignment in bytes 0: 54.4844 4.23438 4.85938 11
Length 90, alignment in bytes 0: 55.0781 4.375 4.82812 10.9375
Length 90, alignment in bytes 1: 55.0938 4.375 4.73438 11
Length 91, alignment in bytes 0: 55.6406 4.45312 4.70312 11.0312
Length 91, alignment in bytes 2: 55.625 4.35938 4.73438 10.9375
Length 92, alignment in bytes 0: 56.2031 4.3125 4.82812 11
Length 92, alignment in bytes 3: 56.1719 4.34375 4.73438 11
Length 93, alignment in bytes 0: 56.6875 4.20312 4.85938 10.9688
Length 93, alignment in bytes 4: 56.625 5.32812 6.25 14.2812
Length 94, alignment in bytes 0: 57.1562 4.46875 4.70312 10.9688
Length 94, alignment in bytes 5: 57.1406 4.9375 5.67188 14.3281
Length 95, alignment in bytes 0: 57.6406 4.42188 4.95312 10.9844
Length 95, alignment in bytes 6: 57.6562 4.70312 5.59375 14.3594
Length 96, alignment in bytes 0: 58.2031 4.1875 4.875 11.1094
Length 96, alignment in bytes 7: 58.25 4.78125 5.57812 14.4219
Length 97, alignment in bytes 0: 58.8438 4.64062 5.5625 14.3125
Length 97, alignment in bytes 0: 58.875 4.67188 5.4375 14.375
Length 98, alignment in bytes 0: 59.5 4.625 5.53125 14.3438
Length 98, alignment in bytes 1: 59.5156 4.71875 5.42188 14.2656
Length 99, alignment in bytes 0: 60.0781 4.75 5.35938 14.3125
Length 99, alignment in bytes 2: 60.0156 4.6875 5.375 14.2656
Length 100, alignment in bytes 0: 60.5156 4.70312 5.4375 14.375
Length 100, alignment in bytes 3: 60.4531 4.71875 5.40625 14.4844
Length 101, alignment in bytes 0: 60.9688 4.67188 5.39062 14.25
Length 101, alignment in bytes 4: 60.9375 4.6875 5.5625 15.3281
Length 102, alignment in bytes 0: 61.5938 4.67188 5.46875 14.7031
Length 102, alignment in bytes 5: 61.6406 4.6875 5.34375 15.25
Length 103, alignment in bytes 0: 62.2812 4.67188 5.42188 14.5781
Length 103, alignment in bytes 6: 62.3125 4.625 5.25 15.3281
Length 104, alignment in bytes 0: 62.8281 4.625 5.42188 14.5625
Length 104, alignment in bytes 7: 62.7656 4.625 5.1875 15.3281
Length 105, alignment in bytes 0: 63.2188 4.67188 5.21875 15.2969
Length 105, alignment in bytes 0: 63.1719 4.67188 5.21875 15.2656
Length 106, alignment in bytes 0: 63.7969 4.67188 5.21875 15.2656
Length 106, alignment in bytes 1: 63.8438 4.65625 5.20312 15.2344
Length 107, alignment in bytes 0: 111.891 4.78125 5.35938 15.6094
Length 107, alignment in bytes 2: 66.8281 4.85938 5.40625 15.9531
Length 108, alignment in bytes 0: 68.875 4.98438 5.53125 16.3594
Length 108, alignment in bytes 3: 70.4375 5.01562 5.70312 16.8125
Length 109, alignment in bytes 0: 73.0781 5.23438 5.82812 17.2031
Length 109, alignment in bytes 4: 75.1875 5.35938 7.875 18.4219
Length 110, alignment in bytes 0: 77.875 5.625 6.42188 18.5781
Length 110, alignment in bytes 5: 80.5156 5.76562 7.48438 19.5781
Length 111, alignment in bytes 0: 83.8438 5.92188 6.70312 19.5938
Length 111, alignment in bytes 6: 86.8438 6.09375 8.14062 21.1406
Length 112, alignment in bytes 0: 90.9688 6.3125 7.21875 21.1562
Length 112, alignment in bytes 7: 94.9531 6.78125 8.60938 22.8438
Length 113, alignment in bytes 0: 100.016 7 8.96875 24
Length 113, alignment in bytes 0: 105.156 7.375 9.28125 60.8594
Length 114, alignment in bytes 0: 112.859 7.90625 9.8125 26.9531
Length 114, alignment in bytes 1: 120.344 8.45312 10.4844 28.5156
Length 115, alignment in bytes 0: 123.125 8.3125 10.5 28.2031
Length 115, alignment in bytes 2: 123.234 8.375 10.5625 28.2812
Length 116, alignment in bytes 0: 124.391 8.375 10.5 28.4531
Length 116, alignment in bytes 3: 124.422 8.48438 10.5625 28.4531
Length 117, alignment in bytes 0: 125.344 8.45312 10.5938 28.4844
Length 117, alignment in bytes 4: 125.188 8.20312 10.9531 29.5781
Length 118, alignment in bytes 0: 126.078 8.45312 10.9531 28.4844
Length 118, alignment in bytes 5: 126.172 8.48438 10.4531 29.4531
Length 119, alignment in bytes 0: 127.328 8.375 10.7812 28.4375
Length 119, alignment in bytes 6: 127.406 8.3125 10.3906 29.5469
Length 120, alignment in bytes 0: 128.281 8.3125 10.5 28.4375
Length 120, alignment in bytes 7: 128.094 8.46875 10.3438 29.4219
Length 121, alignment in bytes 0: 129.078 8.28125 10.3594 29.3438
Length 121, alignment in bytes 0: 129.219 8.40625 10.1094 29.4844
Length 122, alignment in bytes 0: 130.359 8.28125 10.2344 29.4219
Length 122, alignment in bytes 1: 130.312 8.375 10.25 29.4375
Length 123, alignment in bytes 0: 131.125 8.39062 10.0781 29.5
Length 123, alignment in bytes 2: 131.062 8.34375 10.1562 29.4062
Length 124, alignment in bytes 0: 132.234 8.28125 10.375 29.4531
Length 124, alignment in bytes 3: 132.359 8.34375 10.125 29.5625
Length 125, alignment in bytes 0: 133.234 8.375 10.2344 29.7812
Length 125, alignment in bytes 4: 133.016 9.375 12.4531 30.4844
Length 126, alignment in bytes 0: 134.141 8.5 10.0625 29.6562
Length 126, alignment in bytes 5: 134.328 9.17188 11.6406 30.7969
Length 127, alignment in bytes 0: 135.25 8.46875 10.0938 29.5
Length 127, alignment in bytes 6: 135.016 9.29688 11.2812 30.6719
Length 128, alignment in bytes 0: 136.078 8.5625 10.0312 29.4531
Length 128, alignment in bytes 7: 136.312 9.35938 11.7656 30.5156
Length 129, alignment in bytes 0: 137.203 9.32812 11.375 30.6562
Length 129, alignment in bytes 0: 137 9.29688 11.5938 30.5781
Length 130, alignment in bytes 0: 138.219 9.07812 11.5781 30.7188
Length 130, alignment in bytes 1: 138.312 9.17188 11.2812 30.6406
Length 131, alignment in bytes 0: 139.062 9.48438 11.375 30.5469
Length 131, alignment in bytes 2: 139.062 9.3125 11.5156 30.6406
Length 132, alignment in bytes 0: 140.266 9.07812 11.5781 30.6406
Length 132, alignment in bytes 3: 140.156 9.14062 11.6094 30.7188
Length 133, alignment in bytes 0: 140.969 9.10938 11.6562 30.6875
Length 133, alignment in bytes 4: 141.25 8.95312 11.875 31.5469
Length 134, alignment in bytes 0: 142.25 9.04688 11.6562 30.7812
Length 134, alignment in bytes 5: 141.984 9.21875 11.0938 31.5
Length 135, alignment in bytes 0: 143.141 9.03125 11.625 30.7344
Length 135, alignment in bytes 6: 143.297 9.0625 11.1875 31.8125
Length 136, alignment in bytes 0: 144 9.07812 11.1719 30.4844
Length 136, alignment in bytes 7: 144.109 9.125 10.9375 31.7031
Length 137, alignment in bytes 0: 145.219 9.09375 11.0625 31.75
Length 137, alignment in bytes 0: 144.953 9.32812 11.0469 31.4688
Length 138, alignment in bytes 0: 146.188 9.10938 10.6875 31.7344
Length 138, alignment in bytes 1: 146.188 9.04688 10.7188 31.8438
Length 139, alignment in bytes 0: 146.922 9.09375 10.75 31.5469
Length 139, alignment in bytes 2: 147.234 8.98438 10.8594 31.6094
Length 140, alignment in bytes 0: 148.125 9.10938 10.8906 31.7031
Length 140, alignment in bytes 3: 147.953 9.32812 10.7344 31.5625
Length 141, alignment in bytes 0: 149.219 9.26562 10.875 31.7969
Length 141, alignment in bytes 4: 148.984 9.29688 12.9688 32.625
Length 142, alignment in bytes 0: 150.234 9.34375 10.9062 31.8594
Length 142, alignment in bytes 5: 150.016 9.28125 12.375 32.6094
Length 143, alignment in bytes 0: 151.062 9.34375 11.0156 31.5625
Length 143, alignment in bytes 6: 151.172 9.34375 12.0469 32.6719
Length 144, alignment in bytes 0: 151.906 9.20312 10.9375 31.6719
Length 144, alignment in bytes 7: 152.203 9.14062 12.0625 32.7656
Length 145, alignment in bytes 0: 152.984 9.26562 12.0938 32.7188
Length 145, alignment in bytes 0: 153.062 8.98438 12.0469 32.5781
Length 146, alignment in bytes 0: 154.125 9.14062 12.0781 32.5781
Length 146, alignment in bytes 1: 153.906 9.125 12.3906 32.5781
Length 147, alignment in bytes 0: 155.188 9.15625 12.3438 32.625
Length 147, alignment in bytes 2: 154.906 9.10938 12.1562 32.5781
Length 148, alignment in bytes 0: 156.188 9.3125 12.0781 32.6562
Length 148, alignment in bytes 3: 155.891 9.46875 12.3906 32.625
Length 149, alignment in bytes 0: 157.172 9.17188 12.3438 32.6562
Length 149, alignment in bytes 4: 156.938 9.17188 12.4844 33.6094
Length 150, alignment in bytes 0: 158.094 9.40625 12.4688 32.9062
Length 150, alignment in bytes 5: 158.016 9.32812 12.1875 33.625
Length 151, alignment in bytes 0: 159.016 9.07812 12.125 32.6562
Length 151, alignment in bytes 6: 159.062 9.125 11.7969 33.5
Length 152, alignment in bytes 0: 159.984 9.07812 12.0781 32.6406
Length 152, alignment in bytes 7: 160.062 9.35938 12 33.6406
Length 153, alignment in bytes 0: 160.938 9.34375 11.7969 33.4844
Length 153, alignment in bytes 0: 161.078 9.28125 11.7969 33.5938
Length 154, alignment in bytes 0: 161.969 9.07812 11.8281 33.4531
Length 154, alignment in bytes 1: 162.016 9.3125 11.5312 33.5312
Length 155, alignment in bytes 0: 162.953 8.95312 11.6875 33.3906
Length 155, alignment in bytes 2: 163 9.32812 11.7656 33.5938
Length 156, alignment in bytes 0: 163.938 9.09375 11.6875 33.5469
Length 156, alignment in bytes 3: 164 9.29688 12.0156 33.5312
Length 157, alignment in bytes 0: 165.016 9.04688 11.9375 33.7656
Length 157, alignment in bytes 4: 164.953 10.1562 13.5625 34.4219
Length 158, alignment in bytes 0: 166.031 9.375 11.9062 33.5938
Length 158, alignment in bytes 5: 165.844 9.85938 12.9844 34.75
Length 159, alignment in bytes 0: 167.062 8.9375 11.6719 33.9844
Length 159, alignment in bytes 6: 166.812 10.0156 12.8594 34.4531
Length 160, alignment in bytes 0: 168.094 9.32812 11.625 33.5938
Length 160, alignment in bytes 7: 167.859 9.78125 12.7188 34.5
Length 161, alignment in bytes 0: 169.016 9.875 12.8594 34.5781
Length 161, alignment in bytes 0: 168.938 10.1094 12.5312 34.5625
Length 162, alignment in bytes 0: 169.859 9.875 12.7656 34.2969
Length 162, alignment in bytes 1: 170.094 9.89062 13.0469 34.7969
Length 163, alignment in bytes 0: 170.812 9.90625 13.0781 34.5
Length 163, alignment in bytes 2: 171.047 9.84375 12.7031 34.5625
Length 164, alignment in bytes 0: 171.906 9.92188 12.5938 34.6719
Length 164, alignment in bytes 3: 171.875 9.625 13.1094 34.4844
Length 165, alignment in bytes 0: 173.047 9.89062 12.9844 34.4062
Length 165, alignment in bytes 4: 172.781 9.78125 13.5 35.5312
Length 166, alignment in bytes 0: 174.031 9.8125 12.6562 34.5781
Length 166, alignment in bytes 5: 173.891 10.1406 12.8906 35.6406
Length 167, alignment in bytes 0: 174.812 10.0312 13.125 34.5625
Length 167, alignment in bytes 6: 174.891 9.78125 12.9062 35.625
Length 168, alignment in bytes 0: 176.062 10.1562 12.6875 34.9531
Length 168, alignment in bytes 7: 175.781 9.84375 12.9531 35.5156
Length 169, alignment in bytes 0: 176.938 9.95312 12.9062 35.6406
Length 169, alignment in bytes 0: 176.953 9.84375 12.9531 35.5469
Length 170, alignment in bytes 0: 177.766 9.70312 12.9531 35.4531
Length 170, alignment in bytes 1: 177.969 10.125 13.0312 35.5312
Length 171, alignment in bytes 0: 178.938 10 13.0312 35.5156
Length 171, alignment in bytes 2: 178.781 9.82812 13.0312 35.625
Length 172, alignment in bytes 0: 179.984 9.875 12.9531 35.5312
Length 172, alignment in bytes 3: 179.922 10.1562 13.1719 35.5156
Length 173, alignment in bytes 0: 180.734 9.92188 13.125 35.3281
Length 173, alignment in bytes 4: 180.953 9.82812 15.0156 36.75
Length 174, alignment in bytes 0: 181.953 9.89062 13.0156 35.7344
Length 174, alignment in bytes 5: 181.984 9.8125 14.0312 36.6719
Length 175, alignment in bytes 0: 182.891 9.875 12.8594 35.6562
Length 175, alignment in bytes 6: 182.75 9.92188 13.7031 36.5
Length 176, alignment in bytes 0: 183.906 9.8125 13.1719 35.7031
Length 176, alignment in bytes 7: 183.969 9.70312 13.6562 36.6719
Length 177, alignment in bytes 0: 184.75 9.84375 13.4375 36.5469
Length 177, alignment in bytes 0: 184.766 10.1094 13.6094 36.6875
Length 178, alignment in bytes 0: 185.953 9.92188 13.5625 36.7031
Length 178, alignment in bytes 1: 185.891 9.8125 13.5 36.8281
Length 179, alignment in bytes 0: 186.734 9.82812 13.6094 36.4375
Length 179, alignment in bytes 2: 186.797 9.85938 13.5625 36.5
Length 180, alignment in bytes 0: 187.953 9.75 13.7031 36.8125
Length 180, alignment in bytes 3: 187.859 9.82812 13.5156 36.4844
Length 181, alignment in bytes 0: 188.703 9.82812 13.4688 36.6719
Length 181, alignment in bytes 4: 188.797 9.8125 14.1562 37.8281
Length 182, alignment in bytes 0: 189.922 9.82812 13.7812 36.8594
Length 182, alignment in bytes 5: 189.844 9.6875 13.375 37.6094
Length 183, alignment in bytes 0: 190.719 9.79688 13.5781 36.5938
Length 183, alignment in bytes 6: 190.781 9.82812 13.2969 37.8281
Length 184, alignment in bytes 0: 191.875 9.8125 13.5781 36.5156
Length 184, alignment in bytes 7: 191.891 9.71875 13.3906 37.7969
Length 185, alignment in bytes 0: 192.766 10.0938 13.1094 37.9531
Length 185, alignment in bytes 0: 192.703 10.1094 13.125 37.6406
Length 186, alignment in bytes 0: 193.734 10.25 13.4062 37.625
Length 186, alignment in bytes 1: 193.875 9.78125 13.125 37.875
Length 187, alignment in bytes 0: 194.875 9.82812 13.2656 37.8438
Length 187, alignment in bytes 2: 194.734 10.0938 13.125 37.6719
Length 188, alignment in bytes 0: 195.672 9.89062 13.3125 37.7656
Length 188, alignment in bytes 3: 195.703 9.89062 13.4219 37.7812
Length 189, alignment in bytes 0: 196.797 9.92188 13.2812 37.8438
Length 189, alignment in bytes 4: 196.703 11.8906 15.1406 38.8125
Length 190, alignment in bytes 0: 197.656 10.3438 13.3594 37.5312
Length 190, alignment in bytes 5: 197.688 11.2031 14.6406 38.875
Length 191, alignment in bytes 0: 198.734 10.2656 13.3281 37.7656
Length 191, alignment in bytes 6: 198.844 11.2344 14.375 38.875
Length 192, alignment in bytes 0: 199.875 10.5 13.2344 37.9531
Length 192, alignment in bytes 7: 199.812 11.1719 14.3594 38.7188
Length 193, alignment in bytes 0: 200.734 10.7188 14.375 38.8125
Length 193, alignment in bytes 0: 200.688 10.8281 14.1406 38.7344
Length 194, alignment in bytes 0: 201.641 10.7188 14.2031 38.8594
Length 194, alignment in bytes 1: 201.625 10.7969 14.375 38.8125
Length 195, alignment in bytes 0: 202.688 10.7031 14.1562 38.8125
Length 195, alignment in bytes 2: 202.734 10.7656 14.3125 38.7812
Length 196, alignment in bytes 0: 203.812 10.8125 14.3125 38.8594
Length 196, alignment in bytes 3: 203.812 10.7031 14.4531 38.7969
Length 197, alignment in bytes 0: 204.844 10.7812 14.375 38.9375
Length 197, alignment in bytes 4: 204.781 10.6875 14.7188 39.9375
Length 198, alignment in bytes 0: 205.781 10.7031 14.5625 38.7188
Length 198, alignment in bytes 5: 205.688 10.7031 14.1719 40
Length 199, alignment in bytes 0: 206.688 10.7031 14.25 38.9844
Length 199, alignment in bytes 6: 206.625 10.7969 14.1094 39.625
Length 200, alignment in bytes 0: 207.594 10.6875 14.5156 38.7969
Length 200, alignment in bytes 7: 207.609 10.7188 14.3594 39.7656
Length 201, alignment in bytes 0: 208.609 10.7969 14 39.7031
Length 201, alignment in bytes 0: 208.625 10.7656 13.9062 39.7188
Length 202, alignment in bytes 0: 209.625 10.75 13.9844 39.7344
Length 202, alignment in bytes 1: 209.641 10.75 14.1094 39.75
Length 203, alignment in bytes 0: 210.641 10.8125 14.0938 39.75
Length 203, alignment in bytes 2: 210.656 10.7812 13.9531 39.7031
Length 204, alignment in bytes 0: 211.672 10.8281 14.0625 39.8438
Length 204, alignment in bytes 3: 211.672 10.6719 13.9531 39.7188
Length 205, alignment in bytes 0: 212.625 11.0156 14.0156 39.6719
Length 205, alignment in bytes 4: 212.672 10.75 16.25 40.6875
Length 206, alignment in bytes 0: 213.672 10.8438 14.4844 39.7188
Length 206, alignment in bytes 5: 213.672 10.75 16.3906 40.8594
Length 207, alignment in bytes 0: 214.641 10.7031 14.4375 39.9375
Length 207, alignment in bytes 6: 214.672 10.7656 15.8438 40.7656
Length 208, alignment in bytes 0: 215.625 10.7969 14.5156 39.9688
Length 208, alignment in bytes 7: 215.625 10.7344 15.5938 40.7031
Length 209, alignment in bytes 0: 216.578 10.7969 15.5781 40.8594
Length 209, alignment in bytes 0: 216.578 10.8438 15.5312 40.6562
Length 210, alignment in bytes 0: 217.578 10.7969 15.2656 40.7031
Length 210, alignment in bytes 1: 217.578 10.7656 15.2344 40.7969
Length 211, alignment in bytes 0: 218.609 10.75 15.125 40.9375
Length 211, alignment in bytes 2: 218.609 10.75 15.2656 40.9844
Length 212, alignment in bytes 0: 219.641 10.7344 15.2188 40.9688
Length 212, alignment in bytes 3: 219.656 10.7344 15.3125 41.0781
Length 213, alignment in bytes 0: 220.688 10.7812 15.2344 40.7656
Length 213, alignment in bytes 4: 220.703 10.875 15.5156 41.9531
Length 214, alignment in bytes 0: 221.641 10.7812 15.4688 40.8906
Length 214, alignment in bytes 5: 221.656 10.8125 15.1719 41.9375
Length 215, alignment in bytes 0: 222.578 10.7031 15.2188 40.7969
Length 215, alignment in bytes 6: 222.531 10.7344 15.0938 41.7812
Length 216, alignment in bytes 0: 223.516 10.6562 15.2031 40.9844
Length 216, alignment in bytes 7: 223.531 10.7188 15.3281 41.8906
Length 217, alignment in bytes 0: 224.562 10.7344 15.0938 42.1719
Length 217, alignment in bytes 0: 224.641 10.7188 14.9531 42.1406
Length 218, alignment in bytes 0: 225.672 10.6875 15.0625 42.1094
Length 218, alignment in bytes 1: 225.609 10.7812 15.0781 42.1094
Length 219, alignment in bytes 0: 226.594 10.7344 14.9531 42
Length 219, alignment in bytes 2: 226.531 10.75 15.1875 41.9219
Length 220, alignment in bytes 0: 227.516 10.7188 15.0312 41.875
Length 220, alignment in bytes 3: 227.516 10.7969 14.8594 41.9688
Length 221, alignment in bytes 0: 228.547 10.6875 15 42.0312
Length 221, alignment in bytes 4: 228.594 12.25 16.7656 43.2656
Length 222, alignment in bytes 0: 229.625 11.0156 15.5 42.125
Length 222, alignment in bytes 5: 229.562 12.4375 16.5938 42.9844
Length 223, alignment in bytes 0: 230.547 11.0625 15.5469 41.875
Length 223, alignment in bytes 6: 230.484 12 16.8906 43.1875
Length 224, alignment in bytes 0: 231.516 11.1406 15.5625 41.8594
Length 224, alignment in bytes 7: 231.562 12.2656 16.3906 43.3125
Length 225, alignment in bytes 0: 232.594 11.5781 16.625 43.4062
Length 225, alignment in bytes 0: 232.531 11.7344 16.0625 43.0312
Length 226, alignment in bytes 0: 233.516 11.5781 16.0781 43.0469
Length 226, alignment in bytes 1: 233.5 11.5625 16 43.0938
Length 227, alignment in bytes 0: 234.531 11.8594 16.0625 43.125
Length 227, alignment in bytes 2: 234.516 11.4688 15.8125 42.4688
Length 228, alignment in bytes 0: 231.844 11.3125 15.9375 42.4219
Length 228, alignment in bytes 3: 230.062 11.4688 15.5938 41.7656
Length 229, alignment in bytes 0: 229.094 11.4062 15.6875 41.6719
Length 229, alignment in bytes 4: 225.719 11 15.4375 42.1719
Length 230, alignment in bytes 0: 226.484 11.0469 15.8125 41.2344
Length 230, alignment in bytes 5: 223.109 11.0469 15.125 41.5
Length 231, alignment in bytes 0: 224.031 10.9062 15.2031 40.7344
Length 231, alignment in bytes 6: 220.703 10.7969 14.6719 40.9219
Length 232, alignment in bytes 0: 221.641 10.7344 15.125 39.875
Length 232, alignment in bytes 7: 218.344 10.4531 14.4062 40.4062
Length 233, alignment in bytes 0: 219.25 10.5625 14.4688 39.8125
Length 233, alignment in bytes 0: 216.078 10.5469 14.1406 39.5625
Length 234, alignment in bytes 0: 216.922 10.4219 14.0469 39.1719
Length 234, alignment in bytes 1: 213.812 10.4688 13.9375 38.9531
Length 235, alignment in bytes 0: 214.703 10.2188 13.875 38.5312
Length 235, alignment in bytes 2: 211.734 10.1094 13.9219 38.4844
Length 236, alignment in bytes 0: 212.672 10.2969 13.7344 38.0938
Length 236, alignment in bytes 3: 209.75 9.9375 13.7031 38.125
Length 237, alignment in bytes 0: 210.516 9.9375 13.4375 37.6719
Length 237, alignment in bytes 4: 207.547 9.90625 15.2812 38.3281
Length 238, alignment in bytes 0: 208.406 9.75 13.6094 37.0156
Length 238, alignment in bytes 5: 205.688 9.70312 14.6562 38.0469
Length 239, alignment in bytes 0: 206.562 9.89062 13.6094 36.6875
Length 239, alignment in bytes 6: 203.672 9.76562 14.4219 37.2812
Length 240, alignment in bytes 0: 204.469 9.53125 13.3906 36.5
Length 240, alignment in bytes 7: 201.938 9.39062 14.1875 36.9062
Length 241, alignment in bytes 0: 202.688 9.39062 13.9062 36.9531
Length 241, alignment in bytes 0: 199.953 9.39062 13.7969 36.2656
Length 242, alignment in bytes 0: 200.859 9.53125 13.8438 36.5312
Length 242, alignment in bytes 1: 198.344 9.21875 13.5156 36.0156
Length 243, alignment in bytes 0: 198.938 9.14062 13.7188 35.8438
Length 243, alignment in bytes 2: 196.578 9 13.4531 35.5
Length 244, alignment in bytes 0: 197.359 9.10938 13.4531 35.5312
Length 244, alignment in bytes 3: 195.656 9.26562 13.3438 34.9844
Length 245, alignment in bytes 0: 195.719 8.90625 13.2344 35.0625
Length 245, alignment in bytes 4: 195.375 8.90625 13.25 35.25
Length 246, alignment in bytes 0: 193.969 8.90625 13.0781 34.5156
Length 246, alignment in bytes 5: 194 8.84375 13.0625 35.1562
Length 247, alignment in bytes 0: 192.266 8.95312 12.9375 34.2969
Length 247, alignment in bytes 6: 192.453 8.73438 12.7969 34.9219
Length 248, alignment in bytes 0: 190.625 8.65625 12.8438 33.7344
Length 248, alignment in bytes 7: 190.906 8.65625 12.6406 34.6406
Length 249, alignment in bytes 0: 189.172 8.5625 12.4375 34.1094
Length 249, alignment in bytes 0: 189.328 8.57812 12.4219 34.125
Length 250, alignment in bytes 0: 189.359 8.42188 12.375 33.7031
Length 250, alignment in bytes 1: 187.797 8.42188 12.3125 33.6875
Length 251, alignment in bytes 0: 188.344 8.54688 12.25 33.6094
Length 251, alignment in bytes 2: 186.375 8.5 12.1406 33.5938
Length 252, alignment in bytes 0: 186.859 8.40625 12.1406 33.3125
Length 252, alignment in bytes 3: 184.969 8.26562 11.9688 33.0469
Length 253, alignment in bytes 0: 185.406 8.29688 12.1094 32.875
Length 253, alignment in bytes 4: 184.344 9.09375 13.0156 33.3125
Length 254, alignment in bytes 0: 184 8.4375 12.2656 32.4375
Length 254, alignment in bytes 5: 184.234 8.9375 12.5781 33
Length 255, alignment in bytes 0: 182.703 8.3125 12.1406 32.2188
Length 255, alignment in bytes 6: 182.719 8.76562 12.5938 32.8438
Length 256, alignment in bytes 0: 181.469 8.32812 11.9219 31.9062
Length 256, alignment in bytes 7: 181.266 8.625 12.4375 32.5469
Length 257, alignment in bytes 0: 181.422 8.48438 12.0625 32.1719
Length 257, alignment in bytes 0: 180.062 8.54688 12.1094 32.2031
Length 258, alignment in bytes 0: 180.688 8.40625 12.0938 32.0938
Length 258, alignment in bytes 1: 178.938 8.32812 12 31.9219
Length 259, alignment in bytes 0: 179.406 8.32812 12 31.9219
Length 259, alignment in bytes 2: 177.766 8.23438 11.8594 31.4375
Length 260, alignment in bytes 0: 178.328 8.15625 11.8125 31.4375
Length 260, alignment in bytes 3: 178.125 8.07812 11.8438 31.25
Length 261, alignment in bytes 0: 176.938 8.17188 11.7344 31.1406
Length 261, alignment in bytes 4: 177.094 8 11.8125 31.875
Length 262, alignment in bytes 0: 175.656 8.15625 11.6719 30.75
Length 262, alignment in bytes 5: 175.672 8.07812 11.6562 31.4062
Length 263, alignment in bytes 0: 176.516 8.0625 11.6094 30.5938
Length 263, alignment in bytes 6: 174.562 8.07812 11.5625 31.2812
Length 264, alignment in bytes 0: 175.047 8.03125 11.5625 30.4062
Length 264, alignment in bytes 7: 173.625 7.84375 11.2656 30.875
Length 265, alignment in bytes 0: 174.156 7.96875 11.2969 30.9062
Length 265, alignment in bytes 0: 173.953 7.73438 11.2812 30.5312
Length 266, alignment in bytes 0: 172.766 7.76562 11.2031 30.5
Length 266, alignment in bytes 1: 172.969 7.89062 11.1875 30.5156
Length 267, alignment in bytes 0: 171.875 7.54688 11.0938 30.1875
Length 267, alignment in bytes 2: 171.688 7.59375 11.125 30.3438
Length 268, alignment in bytes 0: 172.234 7.85938 11.0938 30.0938
Length 268, alignment in bytes 3: 170.672 7.54688 10.9531 29.8594
Length 269, alignment in bytes 0: 171.406 7.5625 11.125 29.8906
Length 269, alignment in bytes 4: 169.906 7.5 11.8281 30.2812
Length 270, alignment in bytes 0: 170.047 7.60938 11.1094 29.6406
Length 270, alignment in bytes 5: 170.078 7.64062 11.75 30.2344
Length 271, alignment in bytes 0: 169.203 7.46875 10.875 29.3125
Length 271, alignment in bytes 6: 169.25 7.35938 11.3125 29.9844
Length 272, alignment in bytes 0: 169.266 7.5 10.9688 29.125
Length 272, alignment in bytes 7: 167.953 7.46875 11.25 29.75
Length 273, alignment in bytes 0: 168.547 7.34375 11.4531 29.5938
Length 273, alignment in bytes 0: 167.047 7.28125 11.3438 29.2969
Length 274, alignment in bytes 0: 167.75 7.40625 11.2969 29.3125
Length 274, alignment in bytes 1: 167.766 7.25 11.0312 29.1875
Length 275, alignment in bytes 0: 166.641 7.09375 11.2188 29.1562
Length 275, alignment in bytes 2: 166.547 7.375 11.2656 29.0781
Length 276, alignment in bytes 0: 166.172 7.29688 11.1719 28.8125
Length 276, alignment in bytes 3: 165.578 7.3125 11.125 28.7031
Length 277, alignment in bytes 0: 166.297 7.28125 11.0781 28.7656
Length 277, alignment in bytes 4: 164.797 7.25 10.6406 29.125
Length 278, alignment in bytes 0: 165.359 6.98438 11 28.6406
Length 278, alignment in bytes 5: 165.297 6.98438 10.5469 29.0312
Length 279, alignment in bytes 0: 164.203 7.15625 10.9844 28.3281
Length 279, alignment in bytes 6: 164.156 7 10.5156 28.8906
Length 280, alignment in bytes 0: 164.016 7.01562 10.875 28.0469
Length 280, alignment in bytes 7: 163.25 7.0625 10.375 28.6719
Length 281, alignment in bytes 0: 163.906 7.0625 10.3438 28.5312
Length 281, alignment in bytes 0: 162.547 6.875 10.1719 28.3281
Length 282, alignment in bytes 0: 163.125 6.82812 10.2188 28.3125
Length 282, alignment in bytes 1: 163.094 7.0625 10.25 28.3438
Length 283, alignment in bytes 0: 162.109 6.9375 10.125 28.1875
Length 283, alignment in bytes 2: 162.047 6.85938 10.1562 28.1875
Length 284, alignment in bytes 0: 162.516 6.78125 10.1719 28.0938
Length 284, alignment in bytes 3: 162.469 6.82812 10.0938 28.1406
Length 285, alignment in bytes 0: 163.031 7 10.1562 28.0469
Length 285, alignment in bytes 4: 163.062 7.78125 11.3594 28.6875
Length 286, alignment in bytes 0: 163.703 7.125 10.3438 28.2031
Length 286, alignment in bytes 5: 163.75 7.53125 11.1406 28.5469
Length 287, alignment in bytes 0: 164.406 7.28125 10.1875 28.125
Length 287, alignment in bytes 6: 164.422 7.39062 11.0938 28.5312
Length 288, alignment in bytes 0: 164.969 7.20312 10.2969 28.1094
Length 288, alignment in bytes 7: 164.875 7.46875 11.0781 28.5469
Length 289, alignment in bytes 0: 165.328 7.34375 10.9219 28.7031
Length 289, alignment in bytes 0: 165.281 7.26562 11.2656 28.6406
Length 290, alignment in bytes 0: 165.812 7.32812 11.4844 28.7812
Length 290, alignment in bytes 1: 165.844 7.29688 11.4062 28.5312
Length 291, alignment in bytes 0: 166.516 7.1875 11.0469 28.6406
Length 291, alignment in bytes 2: 166.531 7.26562 11.0625 28.6406
Length 292, alignment in bytes 0: 167.203 7.32812 11.0469 28.5156
Length 292, alignment in bytes 3: 167.203 7.26562 11.0781 28.6875
Length 293, alignment in bytes 0: 167.719 7.28125 11.0625 28.7031
Length 293, alignment in bytes 4: 167.594 7.23438 10.7969 29.2656
Length 294, alignment in bytes 0: 168.031 7.3125 11.5469 28.7969
Length 294, alignment in bytes 5: 168.047 7.26562 10.8906 29.2344
Length 295, alignment in bytes 0: 168.688 7.25 11.75 28.6875
Length 295, alignment in bytes 6: 168.781 7.32812 10.7656 29.0938
Length 296, alignment in bytes 0: 169.422 7.28125 11.125 28.6406
Length 296, alignment in bytes 7: 169.406 7.4375 10.8594 29.2656
Length 297, alignment in bytes 0: 169.859 7.21875 10.9375 29.3906
Length 297, alignment in bytes 0: 169.734 7.20312 11.2969 29.2344
Length 298, alignment in bytes 0: 170.25 7.29688 10.9062 29.2188
Length 298, alignment in bytes 1: 170.328 7.28125 11 29.1562
Length 299, alignment in bytes 0: 171 7.26562 10.7812 29.2344
Length 299, alignment in bytes 2: 171.109 7.23438 10.6875 29.1875
Length 300, alignment in bytes 0: 171.594 7.26562 10.7812 29.2344
Length 300, alignment in bytes 3: 171.469 7.25 10.8125 29.25
Length 301, alignment in bytes 0: 171.938 7.25 10.8906 29.1406
Length 301, alignment in bytes 4: 171.969 7.29688 11.875 29.7812
Length 302, alignment in bytes 0: 172.641 7.21875 11.25 29.1406
Length 302, alignment in bytes 5: 172.75 7.29688 11.5938 29.7812
Length 303, alignment in bytes 0: 173.25 7.23438 11.4375 29.2656
Length 303, alignment in bytes 6: 173.094 7.26562 11.7188 29.7969
Length 304, alignment in bytes 0: 173.578 7.28125 11.1875 29.2656
Length 304, alignment in bytes 7: 173.688 7.20312 11.7344 29.7188
Length 305, alignment in bytes 0: 174.375 7.21875 11.625 29.6406
Length 305, alignment in bytes 0: 174.406 7.1875 13.4062 29.7969
Length 306, alignment in bytes 0: 174.812 7.26562 11.6562 29.8906
Length 306, alignment in bytes 1: 174.703 7.26562 11.6719 29.6719
Length 307, alignment in bytes 0: 175.328 7.29688 11.6094 29.7031
Length 307, alignment in bytes 2: 175.484 7.4375 11.5938 29.6875
Length 308, alignment in bytes 0: 176.078 7.21875 11.6406 29.8594
Length 308, alignment in bytes 3: 175.891 7.34375 11.6562 29.7188
Length 309, alignment in bytes 0: 176.359 7.26562 11.6094 29.7344
Length 309, alignment in bytes 4: 176.453 7.26562 13.3125 30.1875
Length 310, alignment in bytes 0: 177.156 7.25 11.625 29.7188
Length 310, alignment in bytes 5: 177.094 7.23438 11.4219 30.3125
Length 311, alignment in bytes 0: 177.469 7.26562 11.7188 29.8438
Length 311, alignment in bytes 6: 177.531 7.28125 13.5 30.3125
Length 312, alignment in bytes 0: 178.266 7.23438 11.5938 29.7031
Length 312, alignment in bytes 7: 178.25 7.21875 13.5 30.2812
Length 313, alignment in bytes 0: 178.625 7.23438 11.25 30.3281
Length 313, alignment in bytes 0: 178.609 7.26562 11.4219 30.3438
Length 314, alignment in bytes 0: 179.312 7.20312 13.4531 30.2344
Length 314, alignment in bytes 1: 179.406 7.20312 11.4062 30.2812
Length 315, alignment in bytes 0: 179.797 7.28125 11.4375 30.3125
Length 315, alignment in bytes 2: 179.688 7.28125 11.3906 30.1875
Length 316, alignment in bytes 0: 180.391 7.23438 11.4844 30.2969
Length 316, alignment in bytes 3: 180.5 7.26562 11.3594 30.2969
Length 317, alignment in bytes 0: 180.922 7.20312 11.3906 30.375
Length 317, alignment in bytes 4: 180.797 8.20312 11.8281 30.8594
Length 318, alignment in bytes 0: 181.516 7.51562 11.5625 30.2812
Length 318, alignment in bytes 5: 181.625 7.98438 11.7344 30.9219
Length 319, alignment in bytes 0: 182.016 7.46875 11.6875 30.2969
Length 319, alignment in bytes 6: 181.922 7.95312 11.7188 30.8281
Length 320, alignment in bytes 0: 182.688 7.46875 11.625 30.1875
Length 320, alignment in bytes 7: 182.703 7.90625 11.7031 30.7656
Length 321, alignment in bytes 0: 183.062 7.89062 11.6875 30.9375
Length 321, alignment in bytes 0: 183.078 7.79688 11.6875 30.8125
Length 322, alignment in bytes 0: 183.828 7.78125 11.6719 30.8281
Length 322, alignment in bytes 1: 183.719 7.76562 11.6875 30.9531
Length 323, alignment in bytes 0: 184.156 7.79688 11.7031 30.8906
Length 323, alignment in bytes 2: 184.281 7.78125 11.6562 30.75
Length 324, alignment in bytes 0: 184.922 7.71875 11.6719 30.8906
Length 324, alignment in bytes 3: 184.75 7.79688 11.75 30.9531
Length 325, alignment in bytes 0: 185.344 7.84375 11.7188 30.8906
Length 325, alignment in bytes 4: 185.5 7.78125 11.8594 31.4062
Length 326, alignment in bytes 0: 185.906 7.79688 11.7656 31.1719
Length 326, alignment in bytes 5: 185.797 7.79688 11.8906 31.4219
Length 327, alignment in bytes 0: 186.562 7.73438 11.9375 30.875
Length 327, alignment in bytes 6: 186.531 7.82812 11.8438 31.4219
Length 328, alignment in bytes 0: 186.922 7.78125 11.8125 30.7812
Length 328, alignment in bytes 7: 187.047 7.6875 11.8438 31.375
Length 329, alignment in bytes 0: 187.688 7.85938 11.75 31.5
Length 329, alignment in bytes 0: 187.484 7.82812 11.8281 31.5156
Length 330, alignment in bytes 0: 188.172 7.78125 11.7344 31.5
Length 330, alignment in bytes 1: 188.25 7.67188 11.8125 31.5469
Length 331, alignment in bytes 0: 188.594 7.8125 11.7969 31.4375
Length 331, alignment in bytes 2: 188.703 7.78125 11.7656 31.4531
Length 332, alignment in bytes 0: 189.359 7.82812 11.7344 31.4375
Length 332, alignment in bytes 3: 189.188 7.79688 11.7344 31.4844
Length 333, alignment in bytes 0: 189.812 7.79688 11.7344 31.4688
Length 333, alignment in bytes 4: 189.906 7.70312 16.9375 32.1406
Length 334, alignment in bytes 0: 190.266 7.8125 11.7969 31.5156
Length 334, alignment in bytes 5: 190.391 7.76562 16.8281 31.9062
Length 335, alignment in bytes 0: 190.953 7.79688 11.8438 31.5
Length 335, alignment in bytes 6: 190.812 7.8125 17.0625 31.9688
Length 336, alignment in bytes 0: 191.594 7.78125 11.8438 31.4062
Length 336, alignment in bytes 7: 191.453 7.75 17.0781 32.0938
Length 337, alignment in bytes 0: 191.969 7.73438 16.3594 31.9531
Length 337, alignment in bytes 0: 192.141 7.82812 16.4844 32.0312
Length 338, alignment in bytes 0: 192.469 7.75 16.5312 31.9375
Length 338, alignment in bytes 1: 192.688 7.76562 16.4844 31.9844
Length 339, alignment in bytes 0: 193.094 7.82812 16.4219 32.1094
Length 339, alignment in bytes 2: 193.125 7.76562 16.4531 31.9688
Length 340, alignment in bytes 0: 549.391 7.73438 16.5312 32.0625
Length 340, alignment in bytes 3: 193.812 7.75 16.6562 31.9844
Length 341, alignment in bytes 0: 194.203 7.78125 16.6562 32
Length 341, alignment in bytes 4: 194.234 7.875 16.5625 32.4844
Length 342, alignment in bytes 0: 194.875 7.79688 16.75 32.0781
Length 342, alignment in bytes 5: 194.688 7.84375 16.4844 32.5469
Length 343, alignment in bytes 0: 195.484 7.84375 16.75 32
Length 343, alignment in bytes 6: 195.344 7.90625 16.5 32.5938
Length 344, alignment in bytes 0: 195.891 7.78125 16.8125 31.9844
Length 344, alignment in bytes 7: 196.016 7.75 16.5312 32.6094
Length 345, alignment in bytes 0: 196.344 7.78125 16.4688 32.5156
Length 345, alignment in bytes 0: 196.578 7.79688 16.3438 32.4688
Length 346, alignment in bytes 0: 196.953 7.76562 16.5938 32.5156
Length 346, alignment in bytes 1: 197.062 7.6875 16.4375 32.5625
Length 347, alignment in bytes 0: 197.562 7.75 16.5156 32.6094
Length 347, alignment in bytes 2: 197.547 7.8125 16.4844 32.5156
Length 348, alignment in bytes 0: 198.219 7.78125 16.5625 32.6719
Length 348, alignment in bytes 3: 198.047 7.70312 16.4844 32.4844
Length 349, alignment in bytes 0: 198.797 7.75 16.5312 32.5469
Length 349, alignment in bytes 4: 198.594 8.57812 17.375 33.0781
Length 350, alignment in bytes 0: 199.328 7.95312 16.75 32.5156
Length 350, alignment in bytes 5: 199.234 8.48438 17.1719 33.2188
Length 351, alignment in bytes 0: 199.797 7.98438 16.6562 32.5625
Length 351, alignment in bytes 6: 199.844 8.32812 17.1562 33.2344
Length 352, alignment in bytes 0: 200.312 8.0625 16.8281 32.4688
Length 352, alignment in bytes 7: 200.438 8.3125 17.2812 33.2031
Length 353, alignment in bytes 0: 200.828 8.20312 17.0781 33.0156
Length 353, alignment in bytes 0: 201.016 8.14062 17.2031 33.125
Length 354, alignment in bytes 0: 201.359 8.21875 17.1562 33.0938
Length 354, alignment in bytes 1: 201.562 8.17188 17.1406 33.1719
Length 355, alignment in bytes 0: 201.906 8.17188 17.2344 33.0625
Length 355, alignment in bytes 2: 202.109 8.17188 17.0938 33.1562
Length 356, alignment in bytes 0: 202.5 8.29688 17.2812 33.125
Length 356, alignment in bytes 3: 202.641 8.1875 17.2188 33.0156
Length 357, alignment in bytes 0: 203.094 8.14062 17.2969 33.125
Length 357, alignment in bytes 4: 203.141 8.20312 17.0781 33.6094
Length 358, alignment in bytes 0: 203.672 8.10938 17.2344 33.1562
Length 358, alignment in bytes 5: 203.672 8.25 17 33.5469
Length 359, alignment in bytes 0: 441.5 8.14062 17.3281 33.2969
Length 359, alignment in bytes 6: 204.188 8.15625 17.0156 33.5625
Length 360, alignment in bytes 0: 204.828 8.21875 17.25 33.2031
Length 360, alignment in bytes 7: 204.766 8.14062 17.1406 33.5156
Length 361, alignment in bytes 0: 205.391 8.23438 17.0781 33.7969
Length 361, alignment in bytes 0: 205.312 8.17188 17.0156 33.5469
Length 362, alignment in bytes 0: 205.922 8.125 17.0469 33.8125
Length 362, alignment in bytes 1: 205.906 8.17188 16.875 33.7031
Length 363, alignment in bytes 0: 226.562 8.17188 17.0469 33.7344
Length 363, alignment in bytes 2: 206.516 8.15625 17.0156 33.7031
Length 364, alignment in bytes 0: 206.953 8.25 17.0781 33.5625
Length 364, alignment in bytes 3: 207.078 8.14062 17.0625 33.6562
Length 365, alignment in bytes 0: 207.484 8.28125 16.9688 33.7031
Length 365, alignment in bytes 4: 207.656 8.15625 17.5781 34.1719
Length 366, alignment in bytes 0: 208.016 8.17188 17.4219 33.6562
Length 366, alignment in bytes 5: 208.188 8.20312 17.5625 34.1875
Length 367, alignment in bytes 0: 208.562 8.10938 17.1562 33.5781
Length 367, alignment in bytes 6: 208.75 8.1875 17.5469 34.2031
Length 368, alignment in bytes 0: 209.188 8.15625 17.4062 33.7656
Length 368, alignment in bytes 7: 209.266 8.20312 17.6562 34.2969
Length 369, alignment in bytes 0: 209.781 8.25 17.6719 34.1406
Length 369, alignment in bytes 0: 209.781 8.14062 17.7812 34.2969
Length 370, alignment in bytes 0: 210.391 8.1875 17.625 34.125
Length 370, alignment in bytes 1: 210.281 8.1875 17.625 34.2344
Length 371, alignment in bytes 0: 210.969 8.125 17.625 34.2188
Length 371, alignment in bytes 2: 210.812 8.21875 17.625 34.1719
Length 372, alignment in bytes 0: 211.531 8.14062 17.6719 34.2188
Length 372, alignment in bytes 3: 211.391 8.20312 17.6562 34.1094
Length 373, alignment in bytes 0: 212.047 8.14062 17.6094 34.2344
Length 373, alignment in bytes 4: 212.016 8.1875 17.5781 34.7656
Length 374, alignment in bytes 0: 212.531 8.17188 17.8438 34.3281
Length 374, alignment in bytes 5: 212.609 8.23438 17.6094 34.6875
Length 375, alignment in bytes 0: 213.031 8.1875 17.6719 34.0938
Length 375, alignment in bytes 6: 213.188 8.14062 17.5938 34.7969
Length 376, alignment in bytes 0: 213.625 8.32812 17.7031 34.1406
Length 376, alignment in bytes 7: 213.703 8.15625 17.5625 34.8438
Length 377, alignment in bytes 0: 214.25 8.15625 17.4375 34.8281
Length 377, alignment in bytes 0: 214.172 8.14062 17.5781 34.8125
Length 378, alignment in bytes 0: 214.875 8.17188 17.5625 34.7656
Length 378, alignment in bytes 1: 214.719 8.17188 17.5 34.7344
Length 379, alignment in bytes 0: 215.391 8.15625 17.5938 34.875
Length 379, alignment in bytes 2: 215.328 8.1875 17.4844 34.75
Length 380, alignment in bytes 0: 215.859 8.125 17.5312 34.875
Length 380, alignment in bytes 3: 215.953 8.14062 17.5156 34.7969
Length 381, alignment in bytes 0: 216.375 8.14062 17.4219 34.7344
Length 381, alignment in bytes 4: 216.516 9.15625 18.2344 35.3438
Length 382, alignment in bytes 0: 216.969 8.34375 17.5156 34.6562
Length 382, alignment in bytes 5: 216.969 9.09375 18.1562 35.2969
Length 383, alignment in bytes 0: 217.625 8.35938 17.5156 34.8125
Length 383, alignment in bytes 6: 217.484 8.89062 18.1875 35.2344
Length 384, alignment in bytes 0: 218.141 8.40625 17.6406 35.0781
Length 384, alignment in bytes 7: 218.141 8.875 17.9375 35.3594
Length 385, alignment in bytes 0: 218.594 8.79688 18.1875 35.2969
Length 385, alignment in bytes 0: 218.734 8.67188 18.125 35.3906
Length 386, alignment in bytes 0: 219.219 8.70312 17.9844 35.3125
Length 386, alignment in bytes 1: 219.188 8.76562 18.25 35.3125
Length 387, alignment in bytes 0: 219.859 8.6875 18.0938 35.3906
Length 387, alignment in bytes 2: 219.766 8.70312 18.0156 35.2812
Length 388, alignment in bytes 0: 220.297 8.85938 18.0938 35.2969
Length 388, alignment in bytes 3: 220.406 8.71875 18.0469 35.4219
Length 389, alignment in bytes 0: 220.875 8.71875 17.9688 35.2969
Length 389, alignment in bytes 4: 220.875 8.75 18.0156 35.8906
Length 390, alignment in bytes 0: 221.516 8.73438 18.125 35.6719
Length 390, alignment in bytes 5: 221.406 8.73438 17.9219 35.8906
Length 391, alignment in bytes 0: 221.969 8.70312 18.1875 35.4531
Length 391, alignment in bytes 6: 222.062 8.70312 17.9688 35.8906
Length 392, alignment in bytes 0: 222.5 8.78125 18.2031 35.5938
Length 392, alignment in bytes 7: 222.531 8.70312 18.0938 35.9375
Length 393, alignment in bytes 0: 223.172 8.67188 17.8906 36
Length 393, alignment in bytes 0: 223.062 8.73438 17.9844 35.7812
Length 394, alignment in bytes 0: 223.656 8.6875 17.9844 35.9688
Length 394, alignment in bytes 1: 223.734 8.65625 17.9844 35.8438
Length 395, alignment in bytes 0: 224.188 8.71875 18.0312 35.8438
Length 395, alignment in bytes 2: 224.219 8.70312 17.9531 35.9375
Length 396, alignment in bytes 0: 224.859 8.6875 18.0312 35.875
Length 396, alignment in bytes 3: 224.734 8.71875 18.0156 35.7812
Length 397, alignment in bytes 0: 225.312 8.70312 17.9219 35.875
Length 397, alignment in bytes 4: 225.391 8.6875 18.5312 36.4844
Length 398, alignment in bytes 0: 225.875 8.71875 17.8281 36.3594
Length 398, alignment in bytes 5: 225.828 8.75 18.5469 36.4219
Length 399, alignment in bytes 0: 226.453 8.71875 18.3281 36.1562
Length 399, alignment in bytes 6: 226.469 8.65625 18.5625 36.4062
Length 400, alignment in bytes 0: 226.969 8.73438 18 36.1562
Length 400, alignment in bytes 7: 226.969 8.71875 18.5312 36.5938
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-02-14 11:06 [PATCH] powerpc64: strrchr optimization for power8 Rajalakshmi Srinivasaraghavan
2017-02-20 13:10 ` Gabriel F. T. Gomes
2017-02-20 13:43 ` Carlos O'Donell
@ 2017-02-21 17:02 ` Carlos Eduardo Seo
2 siblings, 0 replies; 18+ messages in thread
From: Carlos Eduardo Seo @ 2017-02-21 17:02 UTC (permalink / raw)
To: Rajalakshmi Srinivasaraghavan, libc-alpha
On 2/14/17, 9:05 AM, "Rajalakshmi Srinivasaraghavan" <libc-alpha-owner@sourceware.org on behalf of raji@linux.vnet.ibm.com> wrote:
Changes from previous version [1]
- Comments correction and alignment changes.
--
P7 code is used for strings <= 32B, and vectorized loops are used for strings > 32B.
This shows an average 25% improvement, depending on the position of the search
character. The performance is the same for shorter strings.
Tested on ppc64 and ppc64le.
LGTM.
--
Carlos Eduardo Seo
Software Engineer - Linux on Power Toolchain
cseo@linux.vnet.ibm.com
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-02-20 16:06 ` Carlos O'Donell
2017-02-20 16:50 ` Rajalakshmi Srinivasaraghavan
@ 2017-02-28 7:32 ` Rajalakshmi Srinivasaraghavan
2017-03-09 6:14 ` Rajalakshmi Srinivasaraghavan
1 sibling, 1 reply; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-02-28 7:32 UTC (permalink / raw)
To: libc-alpha
On 02/20/2017 09:36 PM, Carlos O'Donell wrote:
> On 02/20/2017 11:01 AM, Rajalakshmi Srinivasaraghavan wrote:
>>
>>
>> On 02/20/2017 07:12 PM, Carlos O'Donell wrote:
>>> On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
>>>> P7 code is used for <=32B strings and for > 32B vectorized loops are used.
>>>> This shows as an average 25% improvement depending on the position of search
>>>> character. The performance is same for shorter strings.
>>>> Tested on ppc64 and ppc64le.
>>> What did you use to test the 25% improvement?
>>
>> This improvement is seen when compared to power7. Benchtest is
>> modified to use length from 0 to 400 to find the average for
>> different lengths.
>
> Could you post your modifications for review and explain your
> process in a little more detail. I'm curious about the changes
> you made.
Carlos,
Posted benchtest modification here:
https://sourceware.org/ml/libc-alpha/2017-02/msg00380.html
--
Thanks
Rajalakshmi S
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-02-28 7:32 ` Rajalakshmi Srinivasaraghavan
@ 2017-03-09 6:14 ` Rajalakshmi Srinivasaraghavan
2017-03-17 15:38 ` Carlos O'Donell
0 siblings, 1 reply; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-03-09 6:14 UTC (permalink / raw)
To: libc-alpha, Carlos O'Donell
On 02/28/2017 01:02 PM, Rajalakshmi Srinivasaraghavan wrote:
>
>
> On 02/20/2017 09:36 PM, Carlos O'Donell wrote:
>> On 02/20/2017 11:01 AM, Rajalakshmi Srinivasaraghavan wrote:
>>>
>>>
>>> On 02/20/2017 07:12 PM, Carlos O'Donell wrote:
>>>> On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
>>>>> P7 code is used for <=32B strings and for > 32B vectorized loops
>>>>> are used.
>>>>> This shows as an average 25% improvement depending on the position
>>>>> of search
>>>>> character. The performance is same for shorter strings.
>>>>> Tested on ppc64 and ppc64le.
>>>> What did you use to test the 25% improvement?
>>>
>>> This improvement is seen when compared to power7. Benchtest is
>>> modified to use length from 0 to 400 to find the average for
>>> different lengths.
>>
>> Could you post your modifications for review an explain your
>> process in a little more detail. I'm curious about the changes
>> you made.
>
> Carlos,
> Posted benchtest modification here:
> https://sourceware.org/ml/libc-alpha/2017-02/msg00380.html
Carlos,
Do you have further comments?
--
Thanks
Rajalakshmi S
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-03-09 6:14 ` Rajalakshmi Srinivasaraghavan
@ 2017-03-17 15:38 ` Carlos O'Donell
2017-03-20 8:39 ` Rajalakshmi Srinivasaraghavan
` (2 more replies)
0 siblings, 3 replies; 18+ messages in thread
From: Carlos O'Donell @ 2017-03-17 15:38 UTC (permalink / raw)
To: Rajalakshmi Srinivasaraghavan, libc-alpha
On 03/09/2017 01:14 AM, Rajalakshmi Srinivasaraghavan wrote:
>
>
> On 02/28/2017 01:02 PM, Rajalakshmi Srinivasaraghavan wrote:
>>
>>
>> On 02/20/2017 09:36 PM, Carlos O'Donell wrote:
>>> On 02/20/2017 11:01 AM, Rajalakshmi Srinivasaraghavan wrote:
>>>>
>>>>
>>>> On 02/20/2017 07:12 PM, Carlos O'Donell wrote:
>>>>> On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
>>>>>> P7 code is used for <=32B strings and for > 32B vectorized loops
>>>>>> are used.
>>>>>> This shows as an average 25% improvement depending on the position
>>>>>> of search
>>>>>> character. The performance is same for shorter strings.
>>>>>> Tested on ppc64 and ppc64le.
>>>>> What did you use to test the 25% improvement?
>>>>
>>>> This improvement is seen when compared to power7. Benchtest is
>>>> modified to use length from 0 to 400 to find the average for
>>>> different lengths.
>>>
>>> Could you post your modifications for review an explain your
>>> process in a little more detail. I'm curious about the changes
>>> you made.
>>
>> Carlos,
>> Posted benchtest modification here:
>> https://sourceware.org/ml/libc-alpha/2017-02/msg00380.html
>
> Carlos,
>
> Do you have further comments?
This is exactly what I was interested in seeing, and I see Siddhesh
has approved your commit to benchtests to increase the string lengths
used in the analysis.
When I review these changes I look at:
(a) What microbenchmark did you use?
- Can we include it in glibc?
* We did, your improvements should be going into master so others
can reproduce them.
(b) What assumptions did you make and were they valid?
Increasing the microbenchmarks to measure up to 512 bytes is probably
a good thing to give broad coverage over the performance from small
to large strings that are multiples of most cache lines (and places
where prefetching might start helping).
Does IBM internally have any good data about the low, median,
average, and high lengths of the strings that are being used with the
strrchr API? Such gathered statistical data would allow us to tune
the microbenchmark.
Knowing the mean value of string lengths would let us decide where
to place most of our optimization efforts. I don't know that we have any
good references to academic literature here.
Your lack of such references in your patch means you don't know either,
but given that you indicate low string size performance is no worse,
this patch looks fine.
In summary:
- You assume applications will be using strings > 32 bytes, and that's
not an entirely unreasonable assumption to make.
- You show performance with <= 32b remains the same and longer string
lengths improve.
- You contribute the microbenchmark changes that allowed you to measure
these numbers.
That's exactly what I want to see from a good contribution.
Now I plotted the power8 performance and there is a big bump in the middle,
any idea why?
https://docs.google.com/a/redhat.com/spreadsheets/d/16kW90bXH7nC8Ak6Xyoe4cxVIvFPwjVDcO-7qsZs0iVc/pubhtml
--
Cheers,
Carlos.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-03-17 15:38 ` Carlos O'Donell
@ 2017-03-20 8:39 ` Rajalakshmi Srinivasaraghavan
2017-03-20 16:34 ` Carlos O'Donell
2017-03-21 5:15 ` Rajalakshmi Srinivasaraghavan
2017-04-03 15:30 ` Tulio Magno Quites Machado Filho
2 siblings, 1 reply; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-03-20 8:39 UTC (permalink / raw)
To: libc-alpha, Carlos O'Donell
On 03/17/2017 09:08 PM, Carlos O'Donell wrote:
> Now I plotted the power8 performance and there is a big bump in the middle,
> any idea why?
>
> https://docs.google.com/a/redhat.com/spreadsheets/d/16kW90bXH7nC8Ak6Xyoe4cxVIvFPwjVDcO-7qsZs0iVc/pubhtml
I am not able to access this.
--
Thanks
Rajalakshmi S
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-03-20 8:39 ` Rajalakshmi Srinivasaraghavan
@ 2017-03-20 16:34 ` Carlos O'Donell
0 siblings, 0 replies; 18+ messages in thread
From: Carlos O'Donell @ 2017-03-20 16:34 UTC (permalink / raw)
To: Rajalakshmi Srinivasaraghavan, libc-alpha
On 03/20/2017 04:39 AM, Rajalakshmi Srinivasaraghavan wrote:
>
>
> On 03/17/2017 09:08 PM, Carlos O'Donell wrote:
>> Now I plotted the power8 performance and there is a big bump in the middle,
>> any idea why?
>>
>> https://docs.google.com/a/redhat.com/spreadsheets/d/16kW90bXH7nC8Ak6Xyoe4cxVIvFPwjVDcO-7qsZs0iVc/pubhtml
>
> I am not able to access this.
>
This should work.
https://docs.google.com/spreadsheets/d/1e2QHzIvMEp_71z0NqFbhRxY27BzscmV23UeE0NawQKE/edit?usp=sharing
--
Cheers,
Carlos.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-03-17 15:38 ` Carlos O'Donell
2017-03-20 8:39 ` Rajalakshmi Srinivasaraghavan
@ 2017-03-21 5:15 ` Rajalakshmi Srinivasaraghavan
2017-04-03 15:30 ` Tulio Magno Quites Machado Filho
2 siblings, 0 replies; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-03-21 5:15 UTC (permalink / raw)
To: Carlos O'Donell, libc-alpha
On 03/17/2017 09:08 PM, Carlos O'Donell wrote:
> Knowing the mean value of string lengths would let us decide where
> to place most of optimization efforts. I don't know that we have any
> good references to academic literature here.
>
No, I don't have details about strrchr call statistics. This
optimization is to make use of POWER8 capabilities in general.
> Now I plotted the power8 performance and there is a big bump in the middle,
> any idea why?
I could see some sudden increase for sizes 104-118 and around 222
in the shared sheet. However there is no special logic in the code
related to these sizes. I tried to check if this happens on my
test P8 ppc64le system and I could not recreate this.
--
Thanks
Rajalakshmi S
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-03-17 15:38 ` Carlos O'Donell
2017-03-20 8:39 ` Rajalakshmi Srinivasaraghavan
2017-03-21 5:15 ` Rajalakshmi Srinivasaraghavan
@ 2017-04-03 15:30 ` Tulio Magno Quites Machado Filho
2017-04-17 15:20 ` Carlos O'Donell
2 siblings, 1 reply; 18+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2017-04-03 15:30 UTC (permalink / raw)
To: Carlos O'Donell, Rajalakshmi Srinivasaraghavan, libc-alpha
Carlos O'Donell <carlos@redhat.com> writes:
> Now I plotted the power8 performance and there is a big bump in the middle,
> any idea why?
I remember that we discussed this on #glibc and you noticed the same behavior
on simple_strrchr and we agreed this could be raw hardware behavior.
Do you think this question is still blocking this patch?
--
Tulio Magno
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-04-03 15:30 ` Tulio Magno Quites Machado Filho
@ 2017-04-17 15:20 ` Carlos O'Donell
0 siblings, 0 replies; 18+ messages in thread
From: Carlos O'Donell @ 2017-04-17 15:20 UTC (permalink / raw)
To: Tulio Magno Quites Machado Filho, Rajalakshmi Srinivasaraghavan,
libc-alpha
On 04/03/2017 11:29 AM, Tulio Magno Quites Machado Filho wrote:
> Carlos O'Donell <carlos@redhat.com> writes:
>
>> Now I plotted the power8 performance and there is a big bump in the middle,
>> any idea why?
>
> I remember that we discussed this on #glibc and you noticed the same behavior
> on simple_strrchr and we agreed this could be raw hardware behavior.
>
> Do you think this question is still blocking this patch?
My question does not block the patch.
At this point the patch looks good to me.
You just have an unexplained issue with performance, but it appears to impact
_all_ the algorithms, simple, and the new POWER8 one, so it must be some semantic
of the hardware showing up in the algorithm. It's odd to see that consistent bump.
--
Cheers,
Carlos.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-02-09 5:01 Rajalakshmi Srinivasaraghavan
2017-02-09 15:26 ` Gabriel F. T. Gomes
@ 2017-02-13 16:18 ` Peter Bergner
1 sibling, 0 replies; 18+ messages in thread
From: Peter Bergner @ 2017-02-13 16:18 UTC (permalink / raw)
To: Rajalakshmi Srinivasaraghavan, libc-alpha
On 2/8/17 11:00 PM, Rajalakshmi Srinivasaraghavan wrote:
> + /* r4 is changed now ,if its passed as more chars
> + check for null again */
Not sure without looking at the code closer, but should this
read like the following???
/* r4 is changed now. If it's passed more chars, then check for
null again. */
> + /* if there are more than one 0xff in r11, find the first pos of ff
> + in r11 and fill r10 with 0 from that position */
First word should be capitalized and there is no final '.' and two
spaces at the end of the sentence. I would also write out position
rather than using "pos" like you did at the end of the sentence.
> + vsl v9, v8, v9 /* v9 = 0x4040404040404040. */
Two spaces after the '.'.
Peter
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] powerpc64: strrchr optimization for power8
2017-02-09 5:01 Rajalakshmi Srinivasaraghavan
@ 2017-02-09 15:26 ` Gabriel F. T. Gomes
2017-02-13 16:18 ` Peter Bergner
1 sibling, 0 replies; 18+ messages in thread
From: Gabriel F. T. Gomes @ 2017-02-09 15:26 UTC (permalink / raw)
To: Rajalakshmi Srinivasaraghavan; +Cc: libc-alpha
Hi,
I have a few cosmetic comments...
On Thu, 9 Feb 2017 10:30:54 +0530
Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> wrote:
> +/* int [r3] strrchr (char *s [r3], int c [r4]) */
~~~
Should it be char *, instead?
> +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
> +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
> +#define VBPERMQ(t,a,b) .long (0x1000054c \
> + | ((t)<<(32-11)) \
> + | ((a)<<(32-16)) \
> + | ((b)<<(32-21)) )
~~~~~~~~~~~~~~~~~~~~~~~~
Eight spaces should be replaced with tabs.
> +#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
> +#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
> +#define VADDUQM(t,a,b) .long (0x10000100 \
> + | ((t)<<(32-11)) \
> + | ((a)<<(32-16)) \
> + | ((b)<<(32-21)) )
~~~~~~~~~~~~~~~~~~~~~~~~
Likewise.
> + /* r4 is changed now ,if its passed as more chars
^
now, if
> + li r5, 16
> + vspltb v1, v1, 7
> + /* Compare 32 bytes in each loop. */
~~~~~~~~
Eight spaces should be replaced with tabs.
> + blt cr6, L(match)
> +
> + /* One (or both) of the quadwords contains c/null. */
~~~~~~~~
Likewise.
> +
> +L(match):
> + /* One (or both) of the quadwords contains a match. */
~~~~~~~~
Likewise.
> + vslb v10, v11, v10
> + li r5, 16
> + /* Compare 32 bytes in each loop. */
~~~~~~~~
Likewise.
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH] powerpc64: strrchr optimization for power8
@ 2017-02-09 5:01 Rajalakshmi Srinivasaraghavan
2017-02-09 15:26 ` Gabriel F. T. Gomes
2017-02-13 16:18 ` Peter Bergner
0 siblings, 2 replies; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-02-09 5:01 UTC (permalink / raw)
To: libc-alpha; +Cc: Rajalakshmi Srinivasaraghavan
The POWER7 code is used for strings of <= 32 bytes; vectorized loops are used
for strings > 32 bytes. This shows an average 25% improvement, depending on the
position of the search character. The performance is the same for shorter strings.
Tested on ppc64 and ppc64le.
2017-02-06 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/multiarch/Makefile
(sysdep_routines): Add strrchr-power8.
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
(strrchr): Add __strrchr_power8 to list of strrchr functions.
* sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S: New file.
* sysdeps/powerpc/powerpc64/multiarch/strrchr.c
(strrchr): Add __strrchr_power8 to ifunc list.
* sysdeps/powerpc/powerpc64/power8/strrchr.S: New file.
---
sysdeps/powerpc/powerpc64/multiarch/Makefile | 3 +-
.../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 3 +
.../powerpc/powerpc64/multiarch/strrchr-power8.S | 39 ++
sysdeps/powerpc/powerpc64/multiarch/strrchr.c | 3 +
sysdeps/powerpc/powerpc64/power8/strrchr.S | 464 +++++++++++++++++++++
5 files changed, 511 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
create mode 100644 sysdeps/powerpc/powerpc64/power8/strrchr.S
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index f5889a3..0fc0ebc 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -14,7 +14,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
strchrnul-power8 strchrnul-power7 strchrnul-ppc64 \
strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \
stpcpy-power7 stpcpy-ppc64 \
- strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
+ strrchr-power8 strrchr-power7 strrchr-ppc64 \
+ strncat-power7 strncat-ppc64 \
strncpy-power7 strncpy-ppc64 \
stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
strcmp-power9 strcmp-power8 strcmp-power7 strcmp-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 209aec5..d77c47f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -281,6 +281,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strrchr.c. */
IFUNC_IMPL (i, name, strrchr,
IFUNC_IMPL_ADD (array, i, strrchr,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strrchr_power8)
+ IFUNC_IMPL_ADD (array, i, strrchr,
hwcap & PPC_FEATURE_HAS_VSX,
__strrchr_power7)
IFUNC_IMPL_ADD (array, i, strrchr, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
new file mode 100644
index 0000000..23365a1
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
@@ -0,0 +1,39 @@
+/* Optimized strrchr implementation for POWER8.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#undef ENTRY
+#define ENTRY(name) \
+ .section ".text"; \
+ ENTRY_2(__strrchr_power8) \
+ .align ALIGNARG(2); \
+ BODY_LABEL(__strrchr_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__strrchr_power8)
+
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__strrchr_power8) \
+ END_2(__strrchr_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strrchr.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
index dc1d3d0..0f94c9d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
@@ -25,11 +25,14 @@
extern __typeof (strrchr) __strrchr_ppc attribute_hidden;
extern __typeof (strrchr) __strrchr_power7 attribute_hidden;
+extern __typeof (strrchr) __strrchr_power8 attribute_hidden;
#undef strrchr
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc_redirected (__redirect_strrchr, strrchr,
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strrchr_power8 :
(hwcap & PPC_FEATURE_HAS_VSX)
? __strrchr_power7
: __strrchr_ppc);
diff --git a/sysdeps/powerpc/powerpc64/power8/strrchr.S b/sysdeps/powerpc/powerpc64/power8/strrchr.S
new file mode 100644
index 0000000..f29fc4e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strrchr.S
@@ -0,0 +1,464 @@
+/* Optimized strrchr implementation for PowerPC64/POWER8 using cmpb insn.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* char * [r3] strrchr (char *s [r3], int c [r4]) */
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VADDUQM(t,a,b) .long (0x10000100 \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+#ifdef __LITTLE_ENDIAN__
+/* Find the match position from v6 and place result in r6. */
+# define CALCULATE_MATCH() \
+ VBPERMQ(v6, v6, v10); \
+ vsldoi v6, v6, v6, 6; \
+ MFVRD(r7, v6); \
+ cntlzd r6, r7; \
+ subfic r6, r6, 15;
+/*
+ * Find the first null position to mask bytes after null.
+ * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
+ * Result placed at v2.
+ */
+# define FIND_NULL_POS(reg) \
+ vspltisb v11, -1; \
+ VADDUQM(v11, reg, v11); \
+ vandc v11, v11, reg; \
+ VPOPCNTD(v2, v11); \
+ vspltb v11, v2, 15; \
+ vcmpequb. v11, v11, v9; \
+ blt cr6, 1f; \
+ vsldoi v9, v0, v9, 1; \
+ vslo v2, v2, v9; \
+1: \
+ vsumsws v2, v2, v0;
+#else
+# define CALCULATE_MATCH() \
+ VBPERMQ(v6, v6, v10); \
+ MFVRD(r7, v6); \
+ addi r6, r7, -1; \
+ andc r6, r6, r7; \
+ popcntd r6, r6; \
+ subfic r6, r6, 15;
+# define FIND_NULL_POS(reg) \
+ VCLZD(v2, reg); \
+ vspltb v11, v2, 7; \
+ vcmpequb. v11, v11, v9; \
+ blt cr6, 1f; \
+ vsldoi v9, v0, v9, 1; \
+ vsro v2, v2, v9; \
+1: \
+ vsumsws v2, v2, v0;
+#endif /* !__LITTLE_ENDIAN__ */
+ .machine power7
+ENTRY (strrchr)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+ cmpdi cr7,r4,0
+ ld r12,0(r8) /* Load doubleword from memory. */
+ li r9,0 /* used to store last occurence */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+
+ beq cr7,L(null_match)
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+ /* r4 is changed now. If c was passed with bits set above its low byte,
+ the replicated byte may still be null, so check for null again. */
+ cmpdi cr7,r4,0
+ beq cr7,L(null_match)
+ /* Now r4 has a doubleword of c bytes and r0 has
+ a doubleword of null bytes. */
+
+ cmpb r10,r12,r4 /* Compare each byte against c byte. */
+ cmpb r11,r12,r0 /* Compare each byte against null byte. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ srd r11,r11,r6
+ sld r10,r10,r6
+ sld r11,r11,r6
+#else
+ sld r10,r10,r6
+ sld r11,r11,r6
+ srd r10,r10,r6
+ srd r11,r11,r6
+#endif
+ or r5,r10,r11 /* OR the results to speed things up. */
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done)
+
+L(align):
+ andi. r12, r8, 15
+
+ /* Is the next doubleword (r8 + 8) aligned to a quadword boundary? If so,
+ skip to the main loop. Otherwise, go through the alignment code. */
+
+ bne cr0, L(loop)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r7,16(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ cmpb r6,r7,r4
+ cmpb r7,r7,r0
+ or r12,r10,r11
+ or r5,r6,r7
+ or r5,r12,r5
+ cmpdi cr7,r5,0
+ beq cr7,L(vector)
+
+ /* OK, one (or both) of the doublewords contains a c/null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c/null byte. */
+ cmpdi cr6,r12,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The c/null byte must be in the second doubleword. Adjust the
+ address again and move the result of cmpb to r10 so we can calculate
+ the pointer. */
+
+ mr r10,r6
+ mr r11,r7
+ addi r8,r8,8
+
+ /* r10/r11 have the output of the cmpb instructions, that is,
+ 0xff in the same position as the c/null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+
+L(done):
+ /* If r11 has any 0xff byte (a null was found), find the first position
+ of 0xff in r11 and fill r10 with 0 from that position. */
+ cmpdi cr7,r11,0
+ beq cr7,L(no_null)
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r11,-1
+ andc r3,r3,r11
+ popcntd r0,r3
+#else
+ cntlzd r0,r11
+#endif
+ subfic r0,r0,63
+ li r6,-1
+#ifdef __LITTLE_ENDIAN__
+ srd r0,r6,r0
+#else
+ sld r0,r6,r0
+#endif
+ and r10,r0,r10
+L(no_null):
+#ifdef __LITTLE_ENDIAN__
+ cntlzd r0,r10 /* Count leading zeros before c matches. */
+ addi r3,r10,-1
+ andc r3,r3,r10
+ addi r10,r11,-1
+ andc r10,r10,r11
+ cmpld cr7,r3,r10
+ bgt cr7,L(no_match)
+#else
+ addi r3,r10,-1 /* Count trailing zeros before c matches. */
+ andc r3,r3,r10
+ popcntd r0,r3
+ cmpld cr7,r11,r10
+ bgt cr7,L(no_match)
+#endif
+ srdi r0,r0,3 /* Convert trailing zeros to bytes. */
+ subfic r0,r0,7
+ add r9,r8,r0 /* Return address of the matching c byte
+ or null in case c was not found. */
+ li r0,0
+ cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */
+ beq cr7,L(align)
+
+ .align 4
+L(no_match):
+ mr r3,r9
+ blr
+
+/* Check the first 32B in GPR's and move to vectorized loop. */
+ .p2align 5
+L(vector):
+ addi r3, r8, 8
+ /* Make sure 32B aligned. */
+ andi. r10, r3, 31
+ bne cr0, L(loop)
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ MTVRD(v1, r4)
+ li r5, 16
+ vspltb v1, v1, 7
+ /* Compare 32 bytes in each loop. */
+L(continue):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vcmpequb v6, v1, v4
+ vcmpequb v7, v1, v5
+ vor v8, v2, v3
+ vor v9, v6, v7
+ vor v11, v8, v9
+ vcmpequb. v11, v0, v11
+ addi r3, r3, 32
+ blt cr6, L(continue)
+ vcmpequb. v8, v0, v8
+ blt cr6, L(match)
+
+ /* One (or both) of the quadwords contains c/null. */
+ vspltisb v8, 2
+ vspltisb v9, 5
+ /* Precompute values used for comparison. */
+ vsl v9, v8, v9 /* v9 = 0x4040404040404040. */
+ vaddubm v8, v9, v9
+ vsldoi v8, v0, v8, 1 /* v8 = 0x80. */
+
+ /* Check if null is in second qw. */
+ vcmpequb. v11, v0, v2
+ blt cr6, L(secondqw)
+
+ /* Null found in first qw. */
+ addi r8, r3, -32
+ /* Calculate the null position. */
+ FIND_NULL_POS(v2)
+ /* Check if null is in the first byte. */
+ vcmpequb. v11, v0, v2
+ blt cr6, L(no_match)
+ vsububm v2, v8, v2
+ /* Mask unwanted bytes after null. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v6, v6, v2
+ vsro v6, v6, v2
+#else
+ vsro v6, v6, v2
+ vslo v6, v6, v2
+#endif
+ vcmpequb. v11, v0, v6
+ blt cr6, L(no_match)
+ /* Found a match before null. */
+ CALCULATE_MATCH()
+ add r3, r8, r6
+ blr
+
+L(secondqw):
+ addi r8, r3, -16
+ FIND_NULL_POS(v3)
+ vcmpequb. v11, v0, v2
+ blt cr6, L(no_match1)
+ vsububm v2, v8, v2
+ /* Mask unwanted bytes after null. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v7, v7, v2
+ vsro v7, v7, v2
+#else
+ vsro v7, v7, v2
+ vslo v7, v7, v2
+#endif
+ vcmpequb. v11, v0, v7
+ blt cr6, L(no_match1)
+ addi r8, r8, 16
+ vor v6, v0, v7
+L(no_match1):
+ addi r8, r8, -16
+ vcmpequb. v11, v0, v6
+ blt cr6, L(no_match)
+ /* Found a match before null. */
+ CALCULATE_MATCH()
+ add r3, r8, r6
+ blr
+
+L(match):
+ /* One (or both) of the quadwords contains a match. */
+ mr r8, r3
+ vcmpequb. v8, v0, v7
+ blt cr6, L(firstqw)
+ /* Match found in second qw. */
+ addi r8, r8, 16
+ vor v6, v0, v7
+L(firstqw):
+ addi r8, r8, -32
+ CALCULATE_MATCH()
+ add r9, r8, r6 /* Compute final length. */
+ b L(continue)
+/* We are here because strrchr was called with a null byte. */
+ .align 4
+L(null_match):
+ /* r0 has a doubleword of null bytes. */
+
+ cmpb r5,r12,r0 /* Compare each byte against null bytes. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r5,r5,r6
+ sld r5,r5,r6
+#else
+ sld r5,r5,r6
+ srd r5,r5,r6
+#endif
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done_null)
+
+ andi. r12, r8, 15
+
+ /* Is the next doubleword (r8 + 8) aligned to a quadword boundary? If so,
+ skip to the main loop. Otherwise, go through the alignment code. */
+
+ bne cr0, L(loop_null)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r5,r12,r0
+ cmpdi cr7,r5,0
+ bne cr7,L(done_null)
+ b L(loop_null) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+ .p2align 5
+L(loop_null):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r5,r12,r0
+ cmpb r10,r11,r0
+ or r6,r5,r10
+ cmpdi cr7,r6,0
+ beq cr7,L(vector1)
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done_null)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r5 so we can calculate the
+ pointer. */
+
+ mr r5,r10
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntd r0,r0
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+ srdi r0,r0,3 /* Convert trailing zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching null byte. */
+ blr
+/* Check the first 32B in GPR's and move to vectorized loop. */
+ .p2align 5
+L(vector1):
+ addi r3, r8, 8
+ /* Make sure 32B aligned. */
+ andi. r10, r3, 31
+ bne cr0, L(loop_null)
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ li r5, 16
+ /* Compare 32 bytes in each loop. */
+L(continue1):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vor v8, v2, v3
+ vcmpequb. v11, v0, v8
+ addi r3, r3, 32
+ blt cr6, L(continue1)
+ addi r3, r3, -32
+ VBPERMQ(v2, v2, v10)
+ VBPERMQ(v3, v3, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v3, v3, v3, 2
+#else
+ vsldoi v2, v2, v2, 6
+ vsldoi v3, v3, v3, 4
+#endif
+ /* Merge the results and move to a GPR. */
+ vor v4, v3, v2
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r6, r5, -1
+ andc r6, r6, r5
+ popcntd r6, r6
+#else
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#endif
+ add r3, r3, r6 /* Compute final length. */
+ blr
+END (strrchr)
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
--
2.7.4
^ permalink raw reply [flat|nested] 18+ messages in thread
end of thread, other threads:[~2017-04-17 15:20 UTC | newest]
Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-02-14 11:06 [PATCH] powerpc64: strrchr optimization for power8 Rajalakshmi Srinivasaraghavan
2017-02-20 13:10 ` Gabriel F. T. Gomes
2017-02-20 13:43 ` Carlos O'Donell
2017-02-20 16:01 ` Rajalakshmi Srinivasaraghavan
2017-02-20 16:06 ` Carlos O'Donell
2017-02-20 16:50 ` Rajalakshmi Srinivasaraghavan
2017-02-28 7:32 ` Rajalakshmi Srinivasaraghavan
2017-03-09 6:14 ` Rajalakshmi Srinivasaraghavan
2017-03-17 15:38 ` Carlos O'Donell
2017-03-20 8:39 ` Rajalakshmi Srinivasaraghavan
2017-03-20 16:34 ` Carlos O'Donell
2017-03-21 5:15 ` Rajalakshmi Srinivasaraghavan
2017-04-03 15:30 ` Tulio Magno Quites Machado Filho
2017-04-17 15:20 ` Carlos O'Donell
2017-02-21 17:02 ` Carlos Eduardo Seo
-- strict thread matches above, loose matches on Subject: below --
2017-02-09 5:01 Rajalakshmi Srinivasaraghavan
2017-02-09 15:26 ` Gabriel F. T. Gomes
2017-02-13 16:18 ` Peter Bergner
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).