public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] powerpc64: strrchr optimization for power8
@ 2017-02-14 11:06 Rajalakshmi Srinivasaraghavan
  2017-02-20 13:10 ` Gabriel F. T. Gomes
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-02-14 11:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: Rajalakshmi Srinivasaraghavan

Changes from previous version [1]

  - Comments correction and alignment changes.

--
The POWER7 code is used for strings of <= 32B; for strings > 32B, vectorized
loops are used.  This shows an average 25% improvement, depending on the
position of the search character.  The performance is the same for shorter
strings.
Tested on ppc64 and ppc64le.

2017-02-14  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/multiarch/Makefile
	(sysdep_routines): Add strrchr-power8.
	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
	(strrchr): Add __strrchr_power8 to list of strrchr functions.
	* sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S: New file.
	* sysdeps/powerpc/powerpc64/multiarch/strrchr.c
	(strrchr): Add __strrchr_power8 to ifunc list.
	* sysdeps/powerpc/powerpc64/power8/strrchr.S: New file.
---
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   3 +-
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   3 +
 .../powerpc/powerpc64/multiarch/strrchr-power8.S   |  39 ++
 sysdeps/powerpc/powerpc64/multiarch/strrchr.c      |   3 +
 sysdeps/powerpc/powerpc64/power8/strrchr.S         | 464 +++++++++++++++++++++
 5 files changed, 511 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strrchr.S

diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index f5889a3..0fc0ebc 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -14,7 +14,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   strchrnul-power8 strchrnul-power7 strchrnul-ppc64 \
 		   strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \
 		   stpcpy-power7 stpcpy-ppc64 \
-		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
+		   strrchr-power8 strrchr-power7 strrchr-ppc64 \
+		   strncat-power7 strncat-ppc64 \
 		   strncpy-power7 strncpy-ppc64 \
 		   stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
 		   strcmp-power9 strcmp-power8 strcmp-power7 strcmp-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 209aec5..d77c47f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -281,6 +281,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strrchr.c.  */
   IFUNC_IMPL (i, name, strrchr,
 	      IFUNC_IMPL_ADD (array, i, strrchr,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strrchr_power8)
+	      IFUNC_IMPL_ADD (array, i, strrchr,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __strrchr_power7)
 	      IFUNC_IMPL_ADD (array, i, strrchr, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
new file mode 100644
index 0000000..23365a1
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
@@ -0,0 +1,39 @@
+/* Optimized strrchr implementation for POWER8.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef ENTRY
+#define ENTRY(name)						\
+  .section ".text";						\
+  ENTRY_2(__strrchr_power8)					\
+  .align ALIGNARG(2);						\
+  BODY_LABEL(__strrchr_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strrchr_power8)
+
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strrchr_power8)					\
+  END_2(__strrchr_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strrchr.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
index dc1d3d0..0f94c9d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
@@ -25,11 +25,14 @@
 
 extern __typeof (strrchr) __strrchr_ppc attribute_hidden;
 extern __typeof (strrchr) __strrchr_power7 attribute_hidden;
+extern __typeof (strrchr) __strrchr_power8 attribute_hidden;
 #undef strrchr
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc_redirected (__redirect_strrchr, strrchr,
+		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+		       ? __strrchr_power8 :
 		       (hwcap & PPC_FEATURE_HAS_VSX)
 		       ? __strrchr_power7
 		       : __strrchr_ppc);
diff --git a/sysdeps/powerpc/powerpc64/power8/strrchr.S b/sysdeps/powerpc/powerpc64/power8/strrchr.S
new file mode 100644
index 0000000..8eb7485
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strrchr.S
@@ -0,0 +1,464 @@
+/* Optimized strrchr implementation for PowerPC64/POWER8 using cmpb insn.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* char *[r3] strrchr (char *s [r3], int c [r4])  */
+/* TODO: change these to the actual instructions when the minimum required
+   binutils allows it.  */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b)  .long (0x1000054c \
+				| ((t)<<(32-11)) \
+				| ((a)<<(32-16)) \
+				| ((b)<<(32-21)) )
+#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VADDUQM(t,a,b)  .long (0x10000100 \
+				| ((t)<<(32-11)) \
+				| ((a)<<(32-16)) \
+				| ((b)<<(32-21)) )
+#ifdef __LITTLE_ENDIAN__
+/* Find the match position from v6 and place result in r6.  */
+# define CALCULATE_MATCH() \
+	VBPERMQ(v6, v6, v10); \
+	vsldoi	v6, v6, v6, 6; \
+	MFVRD(r7, v6); \
+	cntlzd	r6, r7; \
+	subfic	r6, r6, 15;
+/*
+ * Find the first null position to mask bytes after null.
+ * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
+ * Result placed at v2.
+ */
+# define FIND_NULL_POS(reg) \
+	vspltisb	v11, -1; \
+	VADDUQM(v11, reg, v11); \
+	vandc	v11, v11, reg; \
+	VPOPCNTD(v2, v11); \
+	vspltb	v11, v2, 15; \
+	vcmpequb.	v11, v11, v9; \
+	blt	cr6, 1f; \
+	vsldoi	v9, v0, v9, 1; \
+	vslo	v2, v2, v9; \
+1: \
+	vsumsws	v2, v2, v0;
+#else
+# define CALCULATE_MATCH() \
+	VBPERMQ(v6, v6, v10); \
+	MFVRD(r7, v6); \
+	addi	r6, r7, -1; \
+	andc	r6, r6, r7; \
+	popcntd	r6, r6; \
+	subfic	r6, r6, 15;
+# define FIND_NULL_POS(reg) \
+	VCLZD(v2, reg); \
+	vspltb	v11, v2, 7; \
+	vcmpequb.	v11, v11, v9; \
+	blt	cr6, 1f; \
+	vsldoi	v9, v0, v9, 1; \
+	vsro	v2, v2, v9; \
+1: \
+	vsumsws	v2, v2, v0;
+#endif	/* !__LITTLE_ENDIAN__  */
+	.machine  power7
+ENTRY (strrchr)
+	CALL_MCOUNT 2
+	dcbt	0,r3
+	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
+	cmpdi	cr7,r4,0
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
+	li	r9,0	      /* Used to store last occurrence.  */
+	li	r0,0	      /* Doubleword with null chars to use
+				 with cmpb.  */
+
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+
+	beq	cr7,L(null_match)
+
+	/* Replicate byte to doubleword.  */
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
+	insrdi	r4,r4,32,0
+
+	/* r4 now holds the low byte of c replicated.  If c had bits set
+	   above the low byte, that byte may be null, so check again.  */
+	cmpdi	cr7,r4,0
+	beq	cr7,L(null_match)
+	/* Now r4 has a doubleword of c bytes and r0 has
+	   a doubleword of null bytes.  */
+
+	cmpb	r10,r12,r4     /* Compare each byte against c byte.  */
+	cmpb	r11,r12,r0     /* Compare each byte against null byte.  */
+
+	/* Move the doublewords left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+#else
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+#endif
+	or	r5,r10,r11    /* OR the results to speed things up.  */
+	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
+				 have been found.  */
+	bne	cr7,L(done)
+
+L(align):
+	andi.	r12, r8, 15
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bne	cr0, L(loop)
+
+	/* Handle WORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb	r10,r12,r4
+	cmpb	r11,r12,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done)
+	b	L(loop)	      /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+	.p2align  5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu	r7,16(r8)
+	cmpb	r10,r12,r4
+	cmpb	r11,r12,r0
+	cmpb	r6,r7,r4
+	cmpb	r7,r7,r0
+	or	r12,r10,r11
+	or	r5,r6,r7
+	or	r5,r12,r5
+	cmpdi	cr7,r5,0
+	beq	cr7,L(vector)
+
+	/* OK, one (or both) of the doublewords contains a c/null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a c/null byte.  */
+	cmpdi	cr6,r12,0
+	addi	r8,r8,-8
+	bne	cr6,L(done)
+
+	/* The c/null byte must be in the second doubleword.  Adjust the
+	   address again and move the result of cmpb to r10 so we can calculate
+	   the pointer.  */
+
+	mr	r10,r6
+	mr	r11,r7
+	addi	r8,r8,8
+
+	/* r10/r11 have the output of the cmpb instructions, that is,
+	   0xff in the same position as the c/null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+
+L(done):
+	/* If there are more than one 0xff in r11, find the first position of
+	   0xff in r11 and fill r10 with 0 from that position.  */
+	cmpdi	cr7,r11,0
+	beq	cr7,L(no_null)
+#ifdef __LITTLE_ENDIAN__
+	addi	r3,r11,-1
+	andc	r3,r3,r11
+	popcntd r0,r3
+#else
+	cntlzd	r0,r11
+#endif
+	subfic	r0,r0,63
+	li	r6,-1
+#ifdef __LITTLE_ENDIAN__
+	srd	r0,r6,r0
+#else
+	sld	r0,r6,r0
+#endif
+	and	r10,r0,r10
+L(no_null):
+#ifdef __LITTLE_ENDIAN__
+	cntlzd	r0,r10		/* Count leading zeros before c matches.  */
+	addi	r3,r10,-1
+	andc	r3,r3,r10
+	addi	r10,r11,-1
+	andc	r10,r10,r11
+	cmpld	cr7,r3,r10
+	bgt	cr7,L(no_match)
+#else
+	addi	r3,r10,-1	/* Count trailing zeros before c matches.  */
+	andc	r3,r3,r10
+	popcntd	r0,r3
+	cmpld	cr7,r11,r10
+	bgt	cr7,L(no_match)
+#endif
+	srdi	r0,r0,3		/* Convert trailing zeros to bytes.  */
+	subfic	r0,r0,7
+	add	r9,r8,r0      /* Return address of the matching c byte
+				 or null in case c was not found.  */
+	li	r0,0
+	cmpdi	cr7,r11,0     /* If r11 == 0, no null's have been found.  */
+	beq	cr7,L(align)
+
+	.align	4
+L(no_match):
+	mr	r3,r9
+	blr
+
+/* Check the first 32B in GPR's and move to vectorized loop.  */
+	.p2align  5
+L(vector):
+	addi	r3, r8, 8
+	/* Make sure 32B aligned.  */
+	andi.	r10, r3, 31
+	bne	cr0, L(loop)
+	vspltisb	v0, 0
+	/* Precompute vbpermq constant.  */
+	vspltisb	v10, 3
+	lvsl	v11, r0, r0
+	vslb	v10, v11, v10
+	MTVRD(v1, r4)
+	li	r5, 16
+	vspltb	v1, v1, 7
+	/* Compare 32 bytes in each loop.  */
+L(continue):
+	lvx	v4, 0, r3
+	lvx	v5, r3, r5
+	vcmpequb	v2, v0, v4
+	vcmpequb	v3, v0, v5
+	vcmpequb	v6, v1, v4
+	vcmpequb	v7, v1, v5
+	vor	v8, v2, v3
+	vor	v9, v6, v7
+	vor	v11, v8, v9
+	vcmpequb.	v11, v0, v11
+	addi	r3, r3, 32
+	blt	cr6, L(continue)
+	vcmpequb.	v8, v0, v8
+	blt	cr6, L(match)
+
+	/* One (or both) of the quadwords contains c/null.  */
+	vspltisb	v8, 2
+	vspltisb	v9, 5
+	/* Precompute values used for comparison.  */
+	vsl	v9, v8, v9	/* v9 = 0x4040404040404040.  */
+	vaddubm	v8, v9, v9
+	vsldoi	v8, v0, v8, 1	/* v8 = 0x80.  */
+
+	/* Check if null is in second qw.  */
+	vcmpequb.	v11, v0, v2
+	blt	cr6, L(secondqw)
+
+	/* Null found in first qw.  */
+	addi	r8, r3, -32
+	/* Calculate the null position.  */
+	FIND_NULL_POS(v2)
+	/* Check if null is in the first byte.  */
+	vcmpequb.	v11, v0, v2
+	blt	cr6, L(no_match)
+	vsububm	v2, v8, v2
+	/* Mask unwanted bytes after null.  */
+#ifdef __LITTLE_ENDIAN__
+	vslo	v6, v6, v2
+	vsro	v6, v6, v2
+#else
+	vsro	v6, v6, v2
+	vslo	v6, v6, v2
+#endif
+	vcmpequb.	v11, v0, v6
+	blt	cr6, L(no_match)
+	/* Found a match before null.  */
+	CALCULATE_MATCH()
+	add	r3, r8, r6
+	blr
+
+L(secondqw):
+	addi	r8, r3, -16
+	FIND_NULL_POS(v3)
+	vcmpequb.	v11, v0, v2
+	blt	cr6, L(no_match1)
+	vsububm	v2, v8, v2
+	/* Mask unwanted bytes after null.  */
+#ifdef __LITTLE_ENDIAN__
+	vslo	v7, v7, v2
+	vsro	v7, v7, v2
+#else
+	vsro	v7, v7, v2
+	vslo	v7, v7, v2
+#endif
+	vcmpequb.	v11, v0, v7
+	blt	cr6, L(no_match1)
+	addi	r8, r8, 16
+	vor	v6, v0, v7
+L(no_match1):
+	addi	r8, r8, -16
+	vcmpequb.	v11, v0, v6
+	blt	cr6, L(no_match)
+	/* Found a match before null.  */
+	CALCULATE_MATCH()
+	add	r3, r8, r6
+	blr
+
+L(match):
+	/* One (or both) of the quadwords contains a match.  */
+	mr	r8, r3
+	vcmpequb.	v8, v0, v7
+	blt	cr6, L(firstqw)
+	/* Match found in second qw.  */
+	addi	r8, r8, 16
+	vor	v6, v0, v7
+L(firstqw):
+	addi	r8, r8, -32
+	CALCULATE_MATCH()
+	add	r9, r8, r6      /* Save address of the last match.  */
+	b	L(continue)
+/* We are here because strrchr was called with a null byte.  */
+	.align	4
+L(null_match):
+	/* r0 has a doubleword of null bytes.  */
+
+	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
+
+	/* Move the doublewords left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r5,r5,r6
+	sld	r5,r5,r6
+#else
+	sld	r5,r5,r6
+	srd	r5,r5,r6
+#endif
+	cmpdi	cr7,r5,0      /* If r5 == 0, no null bytes
+				 have been found.  */
+	bne	cr7,L(done_null)
+
+	andi.	r12, r8, 15
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bne	cr0, L(loop_null)
+
+	/* Handle WORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb	r5,r12,r0
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done_null)
+	b	L(loop_null)  /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	/* Main loop to look for the end of the string.  Since it's a
+	   small loop (< 8 instructions), align it to 32-bytes.  */
+	.p2align  5
+L(loop_null):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu	r11,16(r8)
+	cmpb	r5,r12,r0
+	cmpb	r10,r11,r0
+	or	r6,r5,r10
+	cmpdi	cr7,r6,0
+	beq	cr7,L(vector1)
+
+	/* OK, one (or both) of the doublewords contains a null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a null byte.  */
+
+	cmpdi	cr6,r5,0
+	addi	r8,r8,-8
+	bne	cr6,L(done_null)
+
+	/* The null byte must be in the second doubleword.  Adjust the address
+	   again and move the result of cmpb to r10 so we can calculate the
+	   pointer.  */
+
+	mr	r5,r10
+	addi	r8,r8,8
+
+	/* r5 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+	addi	r0,r5,-1
+	andc	r0,r0,r5
+	popcntd	r0,r0
+#else
+	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
+#endif
+	srdi	r0,r0,3	      /* Convert trailing zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of the matching null byte.  */
+	blr
+/* Check the first 32B in GPR's and move to vectorized loop.  */
+	.p2align  5
+L(vector1):
+	addi	r3, r8, 8
+	/* Make sure 32B aligned.  */
+	andi.	r10, r3, 31
+	bne	cr0, L(loop_null)
+	vspltisb	v0, 0
+	/* Precompute vbpermq constant.  */
+	vspltisb	v10, 3
+	lvsl	v11, r0, r0
+	vslb	v10, v11, v10
+	li	r5, 16
+	/* Compare 32 bytes in each loop.  */
+L(continue1):
+	lvx	v4, 0, r3
+	lvx	v5, r3, r5
+	vcmpequb	v2, v0, v4
+	vcmpequb	v3, v0, v5
+	vor	v8, v2, v3
+	vcmpequb.	v11, v0, v8
+	addi	r3, r3, 32
+	blt	cr6, L(continue1)
+	addi	r3, r3, -32
+	VBPERMQ(v2, v2, v10)
+	VBPERMQ(v3, v3, v10)
+	/* Shift each component into its correct position for merging.  */
+#ifdef __LITTLE_ENDIAN__
+	vsldoi	v3, v3, v3, 2
+#else
+	vsldoi	v2, v2, v2, 6
+	vsldoi	v3, v3, v3, 4
+#endif
+	/* Merge the results and move to a GPR.  */
+	vor	v4, v3, v2
+	MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+	addi	r6, r5, -1
+	andc	r6, r6, r5
+	popcntd	r6, r6
+#else
+	cntlzd	r6, r5  /* Count leading zeros before the match.  */
+#endif
+	add	r3, r3, r6      /* Return address of the null byte.  */
+	blr
+END (strrchr)
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
-- 
2.7.4

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-02-14 11:06 [PATCH] powerpc64: strrchr optimization for power8 Rajalakshmi Srinivasaraghavan
@ 2017-02-20 13:10 ` Gabriel F. T. Gomes
  2017-02-20 13:43 ` Carlos O'Donell
  2017-02-21 17:02 ` Carlos Eduardo Seo
  2 siblings, 0 replies; 18+ messages in thread
From: Gabriel F. T. Gomes @ 2017-02-20 13:10 UTC (permalink / raw)
  To: Rajalakshmi Srinivasaraghavan; +Cc: libc-alpha

LGTM.

On Tue, 14 Feb 2017 16:35:21 +0530
Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> wrote:

> Changes from previous version [1]
> 
>   - Comments correction and alignment changes.
> 
> --
> P7 code is used for <=32B strings and for > 32B vectorized loops are used.
> This shows as an average 25% improvement depending on the position of search
> character.  The performance is same for shorter strings.
> Tested on ppc64 and ppc64le.
> 
> 2017-02-14  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
> 
> 	* sysdeps/powerpc/powerpc64/multiarch/Makefile
> 	(sysdep_routines): Add strrchr-power8.
> 	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> 	(strrchr): Add __strrchr_power8 to list of strrchr functions.
> 	* sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S: New file.
> 	* sysdeps/powerpc/powerpc64/multiarch/strrchr.c
> 	(strrchr): Add __strrchr_power8 to ifunc list.
> 	* sysdeps/powerpc/powerpc64/power8/strrchr.S: New file.
> ---
>  sysdeps/powerpc/powerpc64/multiarch/Makefile       |   3 +-
>  .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   3 +
>  .../powerpc/powerpc64/multiarch/strrchr-power8.S   |  39 ++
>  sysdeps/powerpc/powerpc64/multiarch/strrchr.c      |   3 +
>  sysdeps/powerpc/powerpc64/power8/strrchr.S         | 464 +++++++++++++++++++++
>  5 files changed, 511 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
>  create mode 100644 sysdeps/powerpc/powerpc64/power8/strrchr.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index f5889a3..0fc0ebc 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -14,7 +14,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
>  		   strchrnul-power8 strchrnul-power7 strchrnul-ppc64 \
>  		   strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \
>  		   stpcpy-power7 stpcpy-ppc64 \
> -		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
> +		   strrchr-power8 strrchr-power7 strrchr-ppc64 \
> +		   strncat-power7 strncat-ppc64 \
>  		   strncpy-power7 strncpy-ppc64 \
>  		   stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
>  		   strcmp-power9 strcmp-power8 strcmp-power7 strcmp-ppc64 \
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index 209aec5..d77c47f 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -281,6 +281,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/powerpc/powerpc64/multiarch/strrchr.c.  */
>    IFUNC_IMPL (i, name, strrchr,
>  	      IFUNC_IMPL_ADD (array, i, strrchr,
> +			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
> +			      __strrchr_power8)
> +	      IFUNC_IMPL_ADD (array, i, strrchr,
>  			      hwcap & PPC_FEATURE_HAS_VSX,
>  			      __strrchr_power7)
>  	      IFUNC_IMPL_ADD (array, i, strrchr, 1,
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
> new file mode 100644
> index 0000000..23365a1
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
> @@ -0,0 +1,39 @@
> +/* Optimized strrchr implementation for POWER8.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#undef ENTRY
> +#define ENTRY(name)						\
> +  .section ".text";						\
> +  ENTRY_2(__strrchr_power8)					\
> +  .align ALIGNARG(2);						\
> +  BODY_LABEL(__strrchr_power8):					\
> +  cfi_startproc;						\
> +  LOCALENTRY(__strrchr_power8)
> +
> +#undef END
> +#define END(name)						\
> +  cfi_endproc;							\
> +  TRACEBACK(__strrchr_power8)					\
> +  END_2(__strrchr_power8)
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/power8/strrchr.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
> index dc1d3d0..0f94c9d 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
> @@ -25,11 +25,14 @@
> 
>  extern __typeof (strrchr) __strrchr_ppc attribute_hidden;
>  extern __typeof (strrchr) __strrchr_power7 attribute_hidden;
> +extern __typeof (strrchr) __strrchr_power8 attribute_hidden;
>  #undef strrchr
> 
>  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>     ifunc symbol properly.  */
>  libc_ifunc_redirected (__redirect_strrchr, strrchr,
> +		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> +		       ? __strrchr_power8 :
>  		       (hwcap & PPC_FEATURE_HAS_VSX)
>  		       ? __strrchr_power7
>  		       : __strrchr_ppc);
> diff --git a/sysdeps/powerpc/powerpc64/power8/strrchr.S b/sysdeps/powerpc/powerpc64/power8/strrchr.S
> new file mode 100644
> index 0000000..8eb7485
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power8/strrchr.S
> @@ -0,0 +1,464 @@
> +/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* char *[r3] strrchr (char *s [r3], int c [r4])  */
> +/* TODO: change these to the actual instructions when the minimum required
> +   binutils allows it.  */
> +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
> +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
> +#define VBPERMQ(t,a,b)  .long (0x1000054c \
> +				| ((t)<<(32-11)) \
> +				| ((a)<<(32-16)) \
> +				| ((b)<<(32-21)) )
> +#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
> +#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
> +#define VADDUQM(t,a,b)  .long (0x10000100 \
> +				| ((t)<<(32-11)) \
> +				| ((a)<<(32-16)) \
> +				| ((b)<<(32-21)) )
> +#ifdef __LITTLE_ENDIAN__
> +/* Find the match position from v6 and place result in r6.  */
> +# define CALCULATE_MATCH() \
> +	VBPERMQ(v6, v6, v10); \
> +	vsldoi	v6, v6, v6, 6; \
> +	MFVRD(r7, v6); \
> +	cntlzd	r6, r7; \
> +	subfic	r6, r6, 15;
> +/*
> + * Find the first null position to mask bytes after null.
> + * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
> + * Result placed at v2.
> + */
> +# define FIND_NULL_POS(reg) \
> +	vspltisb	v11, -1; \
> +	VADDUQM(v11, reg, v11); \
> +	vandc	v11, v11, reg; \
> +	VPOPCNTD(v2, v11); \
> +	vspltb	v11, v2, 15; \
> +	vcmpequb.	v11, v11, v9; \
> +	blt	cr6, 1f; \
> +	vsldoi	v9, v0, v9, 1; \
> +	vslo	v2, v2, v9; \
> +1: \
> +	vsumsws	v2, v2, v0;
> +#else
> +# define CALCULATE_MATCH() \
> +	VBPERMQ(v6, v6, v10); \
> +	MFVRD(r7, v6); \
> +	addi	r6, r7, -1; \
> +	andc	r6, r6, r7; \
> +	popcntd	r6, r6; \
> +	subfic	r6, r6, 15;
> +# define FIND_NULL_POS(reg) \
> +	VCLZD(v2, reg); \
> +	vspltb	v11, v2, 7; \
> +	vcmpequb.	v11, v11, v9; \
> +	blt	cr6, 1f; \
> +	vsldoi	v9, v0, v9, 1; \
> +	vsro	v2, v2, v9; \
> +1: \
> +	vsumsws	v2, v2, v0;
> +#endif	/* !__LITTLE_ENDIAN__  */
> +	.machine  power7
> +ENTRY (strrchr)
> +	CALL_MCOUNT 2
> +	dcbt	0,r3
> +	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
> +	cmpdi	cr7,r4,0
> +	ld	r12,0(r8)     /* Load doubleword from memory.  */
> +	li	r9,0	      /* Used to store last occurence.  */
> +	li	r0,0	      /* Doubleword with null chars to use
> +				 with cmpb.  */
> +
> +	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
> +
> +	beq	cr7,L(null_match)
> +
> +	/* Replicate byte to doubleword.  */
> +	insrdi	r4,r4,8,48
> +	insrdi	r4,r4,16,32
> +	insrdi	r4,r4,32,0
> +
> +	/* r4 is changed now.  If it's passed more chars, then
> +	   check for null again.  */
> +	cmpdi	cr7,r4,0
> +	beq	cr7,L(null_match)
> +	/* Now r4 has a doubleword of c bytes and r0 has
> +	   a doubleword of null bytes.  */
> +
> +	cmpb	r10,r12,r4     /* Compare each byte against c byte.  */
> +	cmpb	r11,r12,r0     /* Compare each byte against null byte.  */
> +
> +	/* Move the doublewords left and right to discard the bits that are
> +	   not part of the string and bring them back as zeros.  */
> +#ifdef __LITTLE_ENDIAN__
> +	srd	r10,r10,r6
> +	srd	r11,r11,r6
> +	sld	r10,r10,r6
> +	sld	r11,r11,r6
> +#else
> +	sld	r10,r10,r6
> +	sld	r11,r11,r6
> +	srd	r10,r10,r6
> +	srd	r11,r11,r6
> +#endif
> +	or	r5,r10,r11    /* OR the results to speed things up.  */
> +	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
> +				 have been found.  */
> +	bne	cr7,L(done)
> +
> +L(align):
> +	andi.	r12, r8, 15
> +
> +	/* Are we now aligned to a doubleword boundary?  If so, skip to
> +	   the main loop.  Otherwise, go through the alignment code.  */
> +
> +	bne	cr0, L(loop)
> +
> +	/* Handle WORD2 of pair.  */
> +	ldu	r12,8(r8)
> +	cmpb	r10,r12,r4
> +	cmpb	r11,r12,r0
> +	or	r5,r10,r11
> +	cmpdi	cr7,r5,0
> +	bne	cr7,L(done)
> +	b	L(loop)	      /* We branch here (rather than falling through)
> +				 to skip the nops due to heavy alignment
> +				 of the loop below.  */
> +	.p2align  5
> +L(loop):
> +	/* Load two doublewords, compare and merge in a
> +	   single register for speed.  This is an attempt
> +	   to speed up the null-checking process for bigger strings.  */
> +	ld	r12,8(r8)
> +	ldu	r7,16(r8)
> +	cmpb	r10,r12,r4
> +	cmpb	r11,r12,r0
> +	cmpb	r6,r7,r4
> +	cmpb	r7,r7,r0
> +	or	r12,r10,r11
> +	or	r5,r6,r7
> +	or	r5,r12,r5
> +	cmpdi	cr7,r5,0
> +	beq	cr7,L(vector)
> +
> +	/* OK, one (or both) of the doublewords contains a c/null byte.  Check
> +	   the first doubleword and decrement the address in case the first
> +	   doubleword really contains a c/null byte.  */
> +	cmpdi	cr6,r12,0
> +	addi	r8,r8,-8
> +	bne	cr6,L(done)
> +
> +	/* The c/null byte must be in the second doubleword.  Adjust the
> +	   address again and move the result of cmpb to r10 so we can calculate
> +	   the pointer.  */
> +
> +	mr	r10,r6
> +	mr	r11,r7
> +	addi	r8,r8,8
> +
> +	/* r10/r11 have the output of the cmpb instructions, that is,
> +	   0xff in the same position as the c/null byte in the original
> +	   doubleword from the string.  Use that to calculate the pointer.  */
> +
> +L(done):
> +	/* If there are more than one 0xff in r11, find the first position of
> +	   0xff in r11 and fill r10 with 0 from that position.  */
> +	cmpdi	cr7,r11,0
> +	beq	cr7,L(no_null)
> +#ifdef __LITTLE_ENDIAN__
> +	addi	r3,r11,-1
> +	andc	r3,r3,r11
> +	popcntd r0,r3
> +#else
> +	cntlzd	r0,r11
> +#endif
> +	subfic	r0,r0,63
> +	li	r6,-1
> +#ifdef __LITTLE_ENDIAN__
> +	srd	r0,r6,r0
> +#else
> +	sld	r0,r6,r0
> +#endif
> +	and	r10,r0,r10
> +L(no_null):
> +#ifdef __LITTLE_ENDIAN__
> +	cntlzd	r0,r10		/* Count leading zeros before c matches.  */
> +	addi	r3,r10,-1
> +	andc	r3,r3,r10
> +	addi	r10,r11,-1
> +	andc	r10,r10,r11
> +	cmpld	cr7,r3,r10
> +	bgt	cr7,L(no_match)
> +#else
> +	addi	r3,r10,-1	/* Count trailing zeros before c matches.  */
> +	andc	r3,r3,r10
> +	popcntd	r0,r3
> +	cmpld	cr7,r11,r10
> +	bgt	cr7,L(no_match)
> +#endif
> +	srdi	r0,r0,3		/* Convert trailing zeros to bytes.  */
> +	subfic	r0,r0,7
> +	add	r9,r8,r0      /* Return address of the matching c byte
> +				 or null in case c was not found.  */
> +	li	r0,0
> +	cmpdi	cr7,r11,0     /* If r11 == 0, no null's have been found.  */
> +	beq	cr7,L(align)
> +
> +	.align	4
> +L(no_match):
> +	mr	r3,r9
> +	blr
> +
> +/* Check the first 32B in GPR's and move to vectorized loop.  */
> +	.p2align  5
> +L(vector):
> +	addi	r3, r8, 8
> +	/* Make sure 32B aligned.  */
> +	andi.	r10, r3, 31
> +	bne	cr0, L(loop)
> +	vspltisb	v0, 0
> +	/* Precompute vbpermq constant.  */
> +	vspltisb	v10, 3
> +	lvsl	v11, r0, r0
> +	vslb	v10, v11, v10
> +	MTVRD(v1, r4)
> +	li	r5, 16
> +	vspltb	v1, v1, 7
> +	/* Compare 32 bytes in each loop.  */
> +L(continue):
> +	lvx	v4, 0, r3
> +	lvx	v5, r3, r5
> +	vcmpequb	v2, v0, v4
> +	vcmpequb	v3, v0, v5
> +	vcmpequb	v6, v1, v4
> +	vcmpequb	v7, v1, v5
> +	vor	v8, v2, v3
> +	vor	v9, v6, v7
> +	vor	v11, v8, v9
> +	vcmpequb.	v11, v0, v11
> +	addi	r3, r3, 32
> +	blt	cr6, L(continue)
> +	vcmpequb.	v8, v0, v8
> +	blt	cr6, L(match)
> +
> +	/* One (or both) of the quadwords contains c/null.  */
> +	vspltisb	v8, 2
> +	vspltisb	v9, 5
> +	/* Precompute values used for comparison.  */
> +	vsl	v9, v8, v9	/* v9 = 0x4040404040404040.  */
> +	vaddubm	v8, v9, v9
> +	vsldoi	v8, v0, v8, 1	/* v8 = 0x80.  */
> +
> +	/* Check if null is in second qw.  */
> +	vcmpequb.	v11, v0, v2
> +	blt	cr6, L(secondqw)
> +
> +	/* Null found in first qw.  */
> +	addi	r8, r3, -32
> +	/* Calculate the null position.  */
> +	FIND_NULL_POS(v2)
> +	/* Check if null is in the first byte.  */
> +	vcmpequb.	v11, v0, v2
> +	blt	cr6, L(no_match)
> +	vsububm	v2, v8, v2
> +	/* Mask unwanted bytes after null.  */
> +#ifdef __LITTLE_ENDIAN__
> +	vslo	v6, v6, v2
> +	vsro	v6, v6, v2
> +#else
> +	vsro	v6, v6, v2
> +	vslo	v6, v6, v2
> +#endif
> +	vcmpequb.	v11, v0, v6
> +	blt	cr6, L(no_match)
> +	/* Found a match before null.  */
> +	CALCULATE_MATCH()
> +	add	r3, r8, r6
> +	blr
> +
> +L(secondqw):
> +	addi	r8, r3, -16
> +	FIND_NULL_POS(v3)
> +	vcmpequb.	v11, v0, v2
> +	blt	cr6, L(no_match1)
> +	vsububm	v2, v8, v2
> +	/* Mask unwanted bytes after null.  */
> +#ifdef __LITTLE_ENDIAN__
> +	vslo	v7, v7, v2
> +	vsro	v7, v7, v2
> +#else
> +	vsro	v7, v7, v2
> +	vslo	v7, v7, v2
> +#endif
> +	vcmpequb.	v11, v0, v7
> +	blt	cr6, L(no_match1)
> +	addi	r8, r8, 16
> +	vor	v6, v0, v7
> +L(no_match1):
> +	addi	r8, r8, -16
> +	vcmpequb.	v11, v0, v6
> +	blt	cr6, L(no_match)
> +	/* Found a match before null.  */
> +	CALCULATE_MATCH()
> +	add	r3, r8, r6
> +	blr
> +
> +L(match):
> +	/* One (or both) of the quadwords contains a match.  */
> +	mr	r8, r3
> +	vcmpequb.	v8, v0, v7
> +	blt	cr6, L(firstqw)
> +	/* Match found in second qw.  */
> +	addi	r8, r8, 16
> +	vor	v6, v0, v7
> +L(firstqw):
> +	addi	r8, r8, -32
> +	CALCULATE_MATCH()
> +	add	r9, r8, r6      /* Compute final length.  */
> +	b	L(continue)
> +/* We are here because strrchr was called with a null byte.  */
> +	.align	4
> +L(null_match):
> +	/* r0 has a doubleword of null bytes.  */
> +
> +	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
> +
> +	/* Move the doublewords left and right to discard the bits that are
> +	   not part of the string and bring them back as zeros.  */
> +#ifdef __LITTLE_ENDIAN__
> +	srd	r5,r5,r6
> +	sld	r5,r5,r6
> +#else
> +	sld	r5,r5,r6
> +	srd	r5,r5,r6
> +#endif
> +	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
> +				 have been found.  */
> +	bne	cr7,L(done_null)
> +
> +	andi.	r12, r8, 15
> +
> +	/* Are we now aligned to a quadword boundary?  If so, skip to
> +	   the main loop.  Otherwise, go through the alignment code.  */
> +
> +	bne	cr0, L(loop_null)
> +
> +	/* Handle WORD2 of pair.  */
> +	ldu	r12,8(r8)
> +	cmpb	r5,r12,r0
> +	cmpdi	cr7,r5,0
> +	bne	cr7,L(done_null)
> +	b	L(loop_null)  /* We branch here (rather than falling through)
> +				 to skip the nops due to heavy alignment
> +				 of the loop below.  */
> +
> +	/* Main loop to look for the end of the string.  Since it's a
> +	   small loop (< 8 instructions), align it to 32-bytes.  */
> +	.p2align  5
> +L(loop_null):
> +	/* Load two doublewords, compare and merge in a
> +	   single register for speed.  This is an attempt
> +	   to speed up the null-checking process for bigger strings.  */
> +	ld	r12,8(r8)
> +	ldu	r11,16(r8)
> +	cmpb	r5,r12,r0
> +	cmpb	r10,r11,r0
> +	or	r6,r5,r10
> +	cmpdi	cr7,r6,0
> +	beq	cr7,L(vector1)
> +
> +	/* OK, one (or both) of the doublewords contains a null byte.  Check
> +	   the first doubleword and decrement the address in case the first
> +	   doubleword really contains a null byte.  */
> +
> +	cmpdi	cr6,r5,0
> +	addi	r8,r8,-8
> +	bne	cr6,L(done_null)
> +
> +	/* The null byte must be in the second doubleword.  Adjust the address
> +	   again and move the result of cmpb to r10 so we can calculate the
> +	   pointer.  */
> +
> +	mr	r5,r10
> +	addi	r8,r8,8
> +
> +	/* r5 has the output of the cmpb instruction, that is, it contains
> +	   0xff in the same position as the null byte in the original
> +	   doubleword from the string.  Use that to calculate the pointer.  */
> +L(done_null):
> +#ifdef __LITTLE_ENDIAN__
> +	addi	r0,r5,-1
> +	andc	r0,r0,r5
> +	popcntd	r0,r0
> +#else
> +	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
> +#endif
> +	srdi	r0,r0,3	      /* Convert trailing zeros to bytes.  */
> +	add	r3,r8,r0      /* Return address of the matching null byte.  */
> +	blr
> +/* Check the first 32B in GPR's and move to vectorized loop.  */
> +	.p2align  5
> +L(vector1):
> +	addi	r3, r8, 8
> +	/* Make sure 32B aligned.  */
> +	andi.	r10, r3, 31
> +	bne	cr0, L(loop_null)
> +	vspltisb	v0, 0
> +	/* Precompute vbpermq constant.  */
> +	vspltisb	v10, 3
> +	lvsl	v11, r0, r0
> +	vslb	v10, v11, v10
> +	li	r5, 16
> +	/* Compare 32 bytes in each loop.  */
> +L(continue1):
> +	lvx	v4, 0, r3
> +	lvx	v5, r3, r5
> +	vcmpequb	v2, v0, v4
> +	vcmpequb	v3, v0, v5
> +	vor	v8, v2, v3
> +	vcmpequb.	v11, v0, v8
> +	addi	r3, r3, 32
> +	blt	cr6, L(continue1)
> +	addi	r3, r3, -32
> +	VBPERMQ(v2, v2, v10)
> +	VBPERMQ(v3, v3, v10)
> +	/* Shift each component into its correct position for merging.  */
> +#ifdef __LITTLE_ENDIAN__
> +	vsldoi	v3, v3, v3, 2
> +#else
> +	vsldoi	v2, v2, v2, 6
> +	vsldoi	v3, v3, v3, 4
> +#endif
> +	/* Merge the results and move to a GPR.  */
> +	vor	v4, v3, v2
> +	MFVRD(r5, v4)
> +#ifdef __LITTLE_ENDIAN__
> +	addi	r6, r5, -1
> +	andc	r6, r6, r5
> +	popcntd	r6, r6
> +#else
> +	cntlzd	r6, r5  /* Count leading zeros before the match.  */
> +#endif
> +	add	r3, r3, r6      /* Compute final length.  */
> +	blr
> +END (strrchr)
> +weak_alias (strrchr, rindex)
> +libc_hidden_builtin_def (strrchr)

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-02-14 11:06 [PATCH] powerpc64: strrchr optimization for power8 Rajalakshmi Srinivasaraghavan
  2017-02-20 13:10 ` Gabriel F. T. Gomes
@ 2017-02-20 13:43 ` Carlos O'Donell
  2017-02-20 16:01   ` Rajalakshmi Srinivasaraghavan
  2017-02-21 17:02 ` Carlos Eduardo Seo
  2 siblings, 1 reply; 18+ messages in thread
From: Carlos O'Donell @ 2017-02-20 13:43 UTC (permalink / raw)
  To: Rajalakshmi Srinivasaraghavan, libc-alpha

On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
> P7 code is used for <=32B strings and for > 32B vectorized loops are used.
> This shows as an average 25% improvement depending on the position of search
> character.  The performance is same for shorter strings.
> Tested on ppc64 and ppc64le.

What did you use to test the 25% improvement?

-- 
Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-02-20 13:43 ` Carlos O'Donell
@ 2017-02-20 16:01   ` Rajalakshmi Srinivasaraghavan
  2017-02-20 16:06     ` Carlos O'Donell
  0 siblings, 1 reply; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-02-20 16:01 UTC (permalink / raw)
  To: libc-alpha



On 02/20/2017 07:12 PM, Carlos O'Donell wrote:
> On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
>> P7 code is used for <=32B strings and for > 32B vectorized loops are used.
>> This shows as an average 25% improvement depending on the position of search
>> character.  The performance is same for shorter strings.
>> Tested on ppc64 and ppc64le.
> What did you use to test the 25% improvement?

This improvement is seen when compared to the power7 implementation.
The benchtest was modified to use lengths from 0 to 400 to find the
average across different lengths.


-- 
Thanks
Rajalakshmi S

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-02-20 16:01   ` Rajalakshmi Srinivasaraghavan
@ 2017-02-20 16:06     ` Carlos O'Donell
  2017-02-20 16:50       ` Rajalakshmi Srinivasaraghavan
  2017-02-28  7:32       ` Rajalakshmi Srinivasaraghavan
  0 siblings, 2 replies; 18+ messages in thread
From: Carlos O'Donell @ 2017-02-20 16:06 UTC (permalink / raw)
  To: Rajalakshmi Srinivasaraghavan, libc-alpha

On 02/20/2017 11:01 AM, Rajalakshmi Srinivasaraghavan wrote:
> 
> 
> On 02/20/2017 07:12 PM, Carlos O'Donell wrote:
>> On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
>>> P7 code is used for <=32B strings and for > 32B vectorized loops are used.
>>> This shows as an average 25% improvement depending on the position of search
>>> character.  The performance is same for shorter strings.
>>> Tested on ppc64 and ppc64le.
>> What did you use to test the 25% improvement?
> 
> This improvement is seen when compared to power7. Benchtest is
> modified to use length from 0 to 400  to find the average for
> different lengths.
 
Could you post your modifications for review and explain your
process in a little more detail? I'm curious about the changes
you made.

-- 
Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-02-20 16:06     ` Carlos O'Donell
@ 2017-02-20 16:50       ` Rajalakshmi Srinivasaraghavan
  2017-02-28  7:32       ` Rajalakshmi Srinivasaraghavan
  1 sibling, 0 replies; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-02-20 16:50 UTC (permalink / raw)
  To: Carlos O'Donell, libc-alpha

[-- Attachment #1: Type: text/plain, Size: 1289 bytes --]



On 02/20/2017 09:36 PM, Carlos O'Donell wrote:
> On 02/20/2017 11:01 AM, Rajalakshmi Srinivasaraghavan wrote:
>>
>> On 02/20/2017 07:12 PM, Carlos O'Donell wrote:
>>> On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
>>>> P7 code is used for <=32B strings and for > 32B vectorized loops are used.
>>>> This shows as an average 25% improvement depending on the position of search
>>>> character.  The performance is same for shorter strings.
>>>> Tested on ppc64 and ppc64le.
>>> What did you use to test the 25% improvement?
>> This improvement is seen when compared to power7. Benchtest is
>> modified to use length from 0 to 400  to find the average for
>> different lengths.
> Could you post your modifications for review an explain your
> process in a little more detail. I'm curious about the changes
> you made.

I modified benchtests/bench-strrchr.c to measure only the following loop
and commented out the existing 'for' loops.

   for (i = 0; i < 400; ++i)
     {
       do_test (0, i, i + 1, 0, SMALL_CHAR);
       do_test (i, i, i + 1, 0, BIG_CHAR);
     }

Then the generated benchtest output is copied to a spreadsheet to
calculate the improvement.

Attached are the benchtest results for ppc64le.
Note: The numbers vary slightly from run to run.

-- 
Thanks
Rajalakshmi S

[-- Attachment #2: bench-result --]
[-- Type: text/plain, Size: 53538 bytes --]

                    	simple_strrchr	__strrchr_power8	__strrchr_power7	__strrchr_ppc
Length    1, alignment in bytes  0:	2.34375	3.04688	2.9375	4.73438
Length    1, alignment in bytes  0:	2.39062	2.9375	2.875	4.45312
Length    2, alignment in bytes  0:	3.10938	2.8125	2.84375	4.67188
Length    2, alignment in bytes  1:	2.96875	2.89062	2.89062	4.5625
Length    3, alignment in bytes  0:	4.57812	2.85938	2.85938	4.57812
Length    3, alignment in bytes  2:	4.20312	2.9375	2.82812	4.625
Length    4, alignment in bytes  0:	4.46875	2.92188	2.85938	4.65625
Length    4, alignment in bytes  3:	4.15625	2.98438	2.84375	4.57812
Length    5, alignment in bytes  0:	4.98438	2.85938	2.85938	4.57812
Length    5, alignment in bytes  4:	4.71875	2.875	2.98438	5.23438
Length    6, alignment in bytes  0:	5.53125	2.98438	2.79688	4.625
Length    6, alignment in bytes  5:	5.34375	2.8125	2.82812	4.98438
Length    7, alignment in bytes  0:	6.28125	2.92188	2.875	4.625
Length    7, alignment in bytes  6:	6.03125	2.79688	2.78125	4.95312
Length    8, alignment in bytes  0:	7.09375	2.89062	2.8125	4.57812
Length    8, alignment in bytes  7:	6.89062	2.79688	2.78125	4.78125
Length    9, alignment in bytes  0:	7.5625	2.84375	2.78125	4.875
Length    9, alignment in bytes  0:	7.45312	2.73438	2.71875	4.89062
Length   10, alignment in bytes  0:	8.60938	2.78125	2.71875	4.82812
Length   10, alignment in bytes  1:	8.40625	2.71875	2.76562	4.9375
Length   11, alignment in bytes  0:	10.2812	2.71875	2.71875	4.9375
Length   11, alignment in bytes  2:	10.2969	2.70312	2.70312	4.82812
Length   12, alignment in bytes  0:	10.875	2.76562	2.70312	4.85938
Length   12, alignment in bytes  3:	10.8281	2.73438	2.71875	4.89062
Length   13, alignment in bytes  0:	11.4375	2.73438	2.73438	4.76562
Length   13, alignment in bytes  4:	11.4219	3.25	3.3125	5.57812
Length   14, alignment in bytes  0:	11.9375	2.73438	2.70312	4.76562
Length   14, alignment in bytes  5:	12	3.23438	3.1875	5.45312
Length   15, alignment in bytes  0:	12.4844	2.79688	2.6875	4.875
Length   15, alignment in bytes  6:	12.5312	3.26562	3.15625	5.39062
Length   16, alignment in bytes  0:	13.0938	2.75	2.73438	4.90625
Length   16, alignment in bytes  7:	13.0469	3.1875	3.15625	5.35938
Length   17, alignment in bytes  0:	13.9219	3.25	3.1875	5.42188
Length   17, alignment in bytes  0:	13.9219	3.20312	3.14062	5.4375
Length   18, alignment in bytes  0:	14.3125	3.21875	3.23438	5.45312
Length   18, alignment in bytes  1:	14.3594	3.34375	3.25	5.4375
Length   19, alignment in bytes  0:	14.8594	3.15625	3.10938	5.53125
Length   19, alignment in bytes  2:	14.8594	3.1875	3.14062	5.42188
Length   20, alignment in bytes  0:	15.4844	3.23438	3.15625	5.46875
Length   20, alignment in bytes  3:	15.4062	3.1875	3.14062	5.48438
Length   21, alignment in bytes  0:	16.1406	3.20312	3.15625	5.45312
Length   21, alignment in bytes  4:	16.1562	3.34375	3.4375	6.5
Length   22, alignment in bytes  0:	17.0781	3.23438	3.28125	5.65625
Length   22, alignment in bytes  5:	17.0938	3.14062	3.20312	6.20312
Length   23, alignment in bytes  0:	17.8281	3.20312	3.20312	5.57812
Length   23, alignment in bytes  6:	17.7812	3.07812	3.15625	6.28125
Length   24, alignment in bytes  0:	18.3281	3.21875	3.15625	5.51562
Length   24, alignment in bytes  7:	18.3906	3.10938	3.03125	6.14062
Length   25, alignment in bytes  0:	18.9062	3.09375	3.0625	6.14062
Length   25, alignment in bytes  0:	18.875	3.14062	3	5.98438
Length   26, alignment in bytes  0:	19.5	3.14062	3.07812	6.20312
Length   26, alignment in bytes  1:	19.4688	3.07812	3.03125	6.07812
Length   27, alignment in bytes  0:	19.9844	3.0625	3.04688	6.10938
Length   27, alignment in bytes  2:	20.0469	3.10938	3.09375	5.95312
Length   28, alignment in bytes  0:	20.5625	3.0625	3.01562	6.01562
Length   28, alignment in bytes  3:	20.5156	3.03125	3.04688	6.0625
Length   29, alignment in bytes  0:	21.1094	3.04688	3.03125	6
Length   29, alignment in bytes  4:	21.1562	4.07812	4.40625	7.04688
Length   30, alignment in bytes  0:	21.6562	3.15625	3.07812	6.09375
Length   30, alignment in bytes  5:	21.6719	3.76562	3.89062	6.73438
Length   31, alignment in bytes  0:	22.25	3.09375	3.0625	6.20312
Length   31, alignment in bytes  6:	22.25	3.73438	3.85938	6.79688
Length   32, alignment in bytes  0:	22.7344	3.04688	3.10938	6.23438
Length   32, alignment in bytes  7:	22.7812	3.625	3.75	6.64062
Length   33, alignment in bytes  0:	23.4062	3.73438	3.85938	6.70312
Length   33, alignment in bytes  0:	23.3438	3.64062	3.85938	6.60938
Length   34, alignment in bytes  0:	23.8438	3.64062	3.70312	6.82812
Length   34, alignment in bytes  1:	23.9219	3.67188	3.71875	6.75
Length   35, alignment in bytes  0:	24.5	3.60938	3.67188	6.54688
Length   35, alignment in bytes  2:	24.4375	3.73438	3.73438	6.67188
Length   36, alignment in bytes  0:	24.9688	3.57812	3.73438	6.625
Length   36, alignment in bytes  3:	25.0312	3.60938	3.73438	6.70312
Length   37, alignment in bytes  0:	25.6094	3.67188	3.73438	6.59375
Length   37, alignment in bytes  4:	25.5312	3.65625	4.15625	7.60938
Length   38, alignment in bytes  0:	26.0625	3.78125	3.75	6.89062
Length   38, alignment in bytes  5:	26.1562	3.59375	3.875	7.35938
Length   39, alignment in bytes  0:	26.7188	3.70312	3.71875	6.95312
Length   39, alignment in bytes  6:	26.6562	3.8125	3.625	7.5625
Length   40, alignment in bytes  0:	27.1562	3.70312	3.70312	6.78125
Length   40, alignment in bytes  7:	27.2188	3.57812	3.625	7.29688
Length   41, alignment in bytes  0:	27.8594	3.65625	3.60938	7.35938
Length   41, alignment in bytes  0:	27.7812	3.625	3.42188	7.3125
Length   42, alignment in bytes  0:	28.2812	3.75	3.375	7.23438
Length   42, alignment in bytes  1:	28.3125	3.6875	3.46875	7.1875
Length   43, alignment in bytes  0:	28.9375	3.67188	3.45312	7.23438
Length   43, alignment in bytes  2:	28.9219	3.73438	3.54688	7.26562
Length   44, alignment in bytes  0:	29.4219	3.65625	3.48438	7.29688
Length   44, alignment in bytes  3:	29.375	3.64062	3.46875	7.3125
Length   45, alignment in bytes  0:	30.0312	3.73438	3.48438	7.28125
Length   45, alignment in bytes  4:	30.0781	3.64062	4.39062	8.26562
Length   46, alignment in bytes  0:	30.5625	3.64062	3.6875	7.54688
Length   46, alignment in bytes  5:	30.5	3.70312	4.3125	8.125
Length   47, alignment in bytes  0:	31.0625	3.73438	3.51562	7.39062
Length   47, alignment in bytes  6:	31.1562	3.625	4.17188	8.09375
Length   48, alignment in bytes  0:	31.7344	3.73438	3.5	7.39062
Length   48, alignment in bytes  7:	31.6875	3.78125	4.17188	8.23438
Length   49, alignment in bytes  0:	32.1719	3.60938	4.17188	7.95312
Length   49, alignment in bytes  0:	32.1562	3.70312	4.17188	8.03125
Length   50, alignment in bytes  0:	32.75	3.67188	4.0625	7.82812
Length   50, alignment in bytes  1:	32.8438	3.6875	4.0625	8.03125
Length   51, alignment in bytes  0:	33.3906	3.625	4.14062	7.95312
Length   51, alignment in bytes  2:	33.3125	3.65625	4.21875	7.9375
Length   52, alignment in bytes  0:	33.8281	3.64062	4.25	7.92188
Length   52, alignment in bytes  3:	33.8281	3.70312	4.10938	7.89062
Length   53, alignment in bytes  0:	34.4375	3.73438	4.28125	7.89062
Length   53, alignment in bytes  4:	34.5	3.67188	4.28125	8.6875
Length   54, alignment in bytes  0:	35.0312	3.67188	4.10938	8.03125
Length   54, alignment in bytes  5:	35	3.65625	4.09375	8.54688
Length   55, alignment in bytes  0:	35.5938	3.65625	4.15625	8.15625
Length   55, alignment in bytes  6:	35.625	3.78125	3.9375	8.54688
Length   56, alignment in bytes  0:	36.125	3.65625	4.3125	8.10938
Length   56, alignment in bytes  7:	36.0781	3.625	3.90625	8.5625
Length   57, alignment in bytes  0:	36.5781	3.57812	4	8.46875
Length   57, alignment in bytes  0:	36.5938	3.65625	3.89062	8.39062
Length   58, alignment in bytes  0:	37.2188	3.76562	3.96875	8.4375
Length   58, alignment in bytes  1:	37.2812	3.625	3.90625	8.53125
Length   59, alignment in bytes  0:	37.8594	3.73438	3.82812	8.53125
Length   59, alignment in bytes  2:	37.8125	3.67188	3.96875	8.40625
Length   60, alignment in bytes  0:	38.2969	3.6875	3.89062	8.42188
Length   60, alignment in bytes  3:	38.2656	3.70312	3.90625	8.5
Length   61, alignment in bytes  0:	38.8125	3.73438	3.9375	8.60938
Length   61, alignment in bytes  4:	38.8594	4.6875	5.14062	9.28125
Length   62, alignment in bytes  0:	39.4531	3.78125	3.89062	8.64062
Length   62, alignment in bytes  5:	39.4844	4.78125	4.71875	9.09375
Length   63, alignment in bytes  0:	40.0938	3.67188	3.875	8.53125
Length   63, alignment in bytes  6:	40.0469	4.29688	4.64062	9.17188
Length   64, alignment in bytes  0:	40.5625	3.84375	3.875	8.59375
Length   64, alignment in bytes  7:	40.5156	4.26562	4.57812	9.29688
Length   65, alignment in bytes  0:	41.0156	4.1875	4.57812	9.01562
Length   65, alignment in bytes  0:	41	4.23438	4.54688	9.125
Length   66, alignment in bytes  0:	41.6094	4.29688	4.79688	9.14062
Length   66, alignment in bytes  1:	41.625	4.32812	4.60938	8.96875
Length   67, alignment in bytes  0:	42.2344	4.42188	4.79688	9
Length   67, alignment in bytes  2:	42.2656	4.54688	4.65625	8.98438
Length   68, alignment in bytes  0:	42.8438	4.57812	4.5625	9.03125
Length   68, alignment in bytes  3:	42.875	4.4375	4.5625	8.98438
Length   69, alignment in bytes  0:	43.375	4.46875	4.75	9.09375
Length   69, alignment in bytes  4:	43.3281	4.375	4.51562	10.0312
Length   70, alignment in bytes  0:	43.8594	4.42188	4.82812	9.125
Length   70, alignment in bytes  5:	43.8125	4.5	4.48438	9.85938
Length   71, alignment in bytes  0:	44.3438	4.46875	4.625	9.17188
Length   71, alignment in bytes  6:	44.3438	4.25	4.35938	9.8125
Length   72, alignment in bytes  0:	44.9062	4.23438	4.67188	9.14062
Length   72, alignment in bytes  7:	44.9375	4.1875	4.39062	9.57812
Length   73, alignment in bytes  0:	45.5	4.42188	4.3125	9.78125
Length   73, alignment in bytes  0:	45.5312	4.29688	4.3125	9.75
Length   74, alignment in bytes  0:	46.1094	4.40625	4.34375	9.70312
Length   74, alignment in bytes  1:	46.1562	4.39062	4.375	9.70312
Length   75, alignment in bytes  0:	46.7188	4.21875	4.40625	9.6875
Length   75, alignment in bytes  2:	46.75	4.25	4.35938	9.70312
Length   76, alignment in bytes  0:	47.3125	4.17188	4.28125	9.70312
Length   76, alignment in bytes  3:	47.2969	4.4375	4.35938	9.90625
Length   77, alignment in bytes  0:	47.8281	4.51562	4.34375	9.65625
Length   77, alignment in bytes  4:	47.7969	4.54688	5.625	10.3281
Length   78, alignment in bytes  0:	48.3594	4.39062	4.32812	10.0156
Length   78, alignment in bytes  5:	48.3438	4.35938	5.23438	10.4062
Length   79, alignment in bytes  0:	48.8906	4.4375	4.32812	9.79688
Length   79, alignment in bytes  6:	48.875	4.51562	5.20312	10.4531
Length   80, alignment in bytes  0:	49.4219	4.32812	4.34375	9.84375
Length   80, alignment in bytes  7:	49.4219	4.29688	5.20312	10.4531
Length   81, alignment in bytes  0:	49.9688	4.21875	5.14062	10.2656
Length   81, alignment in bytes  0:	49.9375	4.21875	5.15625	10.3438
Length   82, alignment in bytes  0:	50.4844	4.28125	5.0625	10.3594
Length   82, alignment in bytes  1:	50.4844	4.25	5.04688	10.3438
Length   83, alignment in bytes  0:	51.0625	4.26562	5.10938	10.3594
Length   83, alignment in bytes  2:	51.0312	4.20312	4.98438	10.3594
Length   84, alignment in bytes  0:	51.5781	4.25	5.0625	10.4844
Length   84, alignment in bytes  3:	51.5938	4.26562	5.125	10.3438
Length   85, alignment in bytes  0:	52.1562	4.23438	4.96875	10.3281
Length   85, alignment in bytes  4:	52.1562	4.5	5.07812	11.0781
Length   86, alignment in bytes  0:	52.7344	4.48438	5.07812	10.4375
Length   86, alignment in bytes  5:	52.75	4.35938	4.89062	11.0312
Length   87, alignment in bytes  0:	53.2969	4.40625	5.10938	10.6094
Length   87, alignment in bytes  6:	53.3281	4.4375	4.84375	11.1094
Length   88, alignment in bytes  0:	53.875	4.45312	5.0625	10.5469
Length   88, alignment in bytes  7:	53.8906	4.35938	4.71875	11.0781
Length   89, alignment in bytes  0:	54.4688	4.21875	4.76562	11
Length   89, alignment in bytes  0:	54.4844	4.23438	4.85938	11
Length   90, alignment in bytes  0:	55.0781	4.375	4.82812	10.9375
Length   90, alignment in bytes  1:	55.0938	4.375	4.73438	11
Length   91, alignment in bytes  0:	55.6406	4.45312	4.70312	11.0312
Length   91, alignment in bytes  2:	55.625	4.35938	4.73438	10.9375
Length   92, alignment in bytes  0:	56.2031	4.3125	4.82812	11
Length   92, alignment in bytes  3:	56.1719	4.34375	4.73438	11
Length   93, alignment in bytes  0:	56.6875	4.20312	4.85938	10.9688
Length   93, alignment in bytes  4:	56.625	5.32812	6.25	14.2812
Length   94, alignment in bytes  0:	57.1562	4.46875	4.70312	10.9688
Length   94, alignment in bytes  5:	57.1406	4.9375	5.67188	14.3281
Length   95, alignment in bytes  0:	57.6406	4.42188	4.95312	10.9844
Length   95, alignment in bytes  6:	57.6562	4.70312	5.59375	14.3594
Length   96, alignment in bytes  0:	58.2031	4.1875	4.875	11.1094
Length   96, alignment in bytes  7:	58.25	4.78125	5.57812	14.4219
Length   97, alignment in bytes  0:	58.8438	4.64062	5.5625	14.3125
Length   97, alignment in bytes  0:	58.875	4.67188	5.4375	14.375
Length   98, alignment in bytes  0:	59.5	4.625	5.53125	14.3438
Length   98, alignment in bytes  1:	59.5156	4.71875	5.42188	14.2656
Length   99, alignment in bytes  0:	60.0781	4.75	5.35938	14.3125
Length   99, alignment in bytes  2:	60.0156	4.6875	5.375	14.2656
Length  100, alignment in bytes  0:	60.5156	4.70312	5.4375	14.375
Length  100, alignment in bytes  3:	60.4531	4.71875	5.40625	14.4844
Length  101, alignment in bytes  0:	60.9688	4.67188	5.39062	14.25
Length  101, alignment in bytes  4:	60.9375	4.6875	5.5625	15.3281
Length  102, alignment in bytes  0:	61.5938	4.67188	5.46875	14.7031
Length  102, alignment in bytes  5:	61.6406	4.6875	5.34375	15.25
Length  103, alignment in bytes  0:	62.2812	4.67188	5.42188	14.5781
Length  103, alignment in bytes  6:	62.3125	4.625	5.25	15.3281
Length  104, alignment in bytes  0:	62.8281	4.625	5.42188	14.5625
Length  104, alignment in bytes  7:	62.7656	4.625	5.1875	15.3281
Length  105, alignment in bytes  0:	63.2188	4.67188	5.21875	15.2969
Length  105, alignment in bytes  0:	63.1719	4.67188	5.21875	15.2656
Length  106, alignment in bytes  0:	63.7969	4.67188	5.21875	15.2656
Length  106, alignment in bytes  1:	63.8438	4.65625	5.20312	15.2344
Length  107, alignment in bytes  0:	111.891	4.78125	5.35938	15.6094
Length  107, alignment in bytes  2:	66.8281	4.85938	5.40625	15.9531
Length  108, alignment in bytes  0:	68.875	4.98438	5.53125	16.3594
Length  108, alignment in bytes  3:	70.4375	5.01562	5.70312	16.8125
Length  109, alignment in bytes  0:	73.0781	5.23438	5.82812	17.2031
Length  109, alignment in bytes  4:	75.1875	5.35938	7.875	18.4219
Length  110, alignment in bytes  0:	77.875	5.625	6.42188	18.5781
Length  110, alignment in bytes  5:	80.5156	5.76562	7.48438	19.5781
Length  111, alignment in bytes  0:	83.8438	5.92188	6.70312	19.5938
Length  111, alignment in bytes  6:	86.8438	6.09375	8.14062	21.1406
Length  112, alignment in bytes  0:	90.9688	6.3125	7.21875	21.1562
Length  112, alignment in bytes  7:	94.9531	6.78125	8.60938	22.8438
Length  113, alignment in bytes  0:	100.016	7	8.96875	24
Length  113, alignment in bytes  0:	105.156	7.375	9.28125	60.8594
Length  114, alignment in bytes  0:	112.859	7.90625	9.8125	26.9531
Length  114, alignment in bytes  1:	120.344	8.45312	10.4844	28.5156
Length  115, alignment in bytes  0:	123.125	8.3125	10.5	28.2031
Length  115, alignment in bytes  2:	123.234	8.375	10.5625	28.2812
Length  116, alignment in bytes  0:	124.391	8.375	10.5	28.4531
Length  116, alignment in bytes  3:	124.422	8.48438	10.5625	28.4531
Length  117, alignment in bytes  0:	125.344	8.45312	10.5938	28.4844
Length  117, alignment in bytes  4:	125.188	8.20312	10.9531	29.5781
Length  118, alignment in bytes  0:	126.078	8.45312	10.9531	28.4844
Length  118, alignment in bytes  5:	126.172	8.48438	10.4531	29.4531
Length  119, alignment in bytes  0:	127.328	8.375	10.7812	28.4375
Length  119, alignment in bytes  6:	127.406	8.3125	10.3906	29.5469
Length  120, alignment in bytes  0:	128.281	8.3125	10.5	28.4375
Length  120, alignment in bytes  7:	128.094	8.46875	10.3438	29.4219
Length  121, alignment in bytes  0:	129.078	8.28125	10.3594	29.3438
Length  121, alignment in bytes  0:	129.219	8.40625	10.1094	29.4844
Length  122, alignment in bytes  0:	130.359	8.28125	10.2344	29.4219
Length  122, alignment in bytes  1:	130.312	8.375	10.25	29.4375
Length  123, alignment in bytes  0:	131.125	8.39062	10.0781	29.5
Length  123, alignment in bytes  2:	131.062	8.34375	10.1562	29.4062
Length  124, alignment in bytes  0:	132.234	8.28125	10.375	29.4531
Length  124, alignment in bytes  3:	132.359	8.34375	10.125	29.5625
Length  125, alignment in bytes  0:	133.234	8.375	10.2344	29.7812
Length  125, alignment in bytes  4:	133.016	9.375	12.4531	30.4844
Length  126, alignment in bytes  0:	134.141	8.5	10.0625	29.6562
Length  126, alignment in bytes  5:	134.328	9.17188	11.6406	30.7969
Length  127, alignment in bytes  0:	135.25	8.46875	10.0938	29.5
Length  127, alignment in bytes  6:	135.016	9.29688	11.2812	30.6719
Length  128, alignment in bytes  0:	136.078	8.5625	10.0312	29.4531
Length  128, alignment in bytes  7:	136.312	9.35938	11.7656	30.5156
Length  129, alignment in bytes  0:	137.203	9.32812	11.375	30.6562
Length  129, alignment in bytes  0:	137	9.29688	11.5938	30.5781
Length  130, alignment in bytes  0:	138.219	9.07812	11.5781	30.7188
Length  130, alignment in bytes  1:	138.312	9.17188	11.2812	30.6406
Length  131, alignment in bytes  0:	139.062	9.48438	11.375	30.5469
Length  131, alignment in bytes  2:	139.062	9.3125	11.5156	30.6406
Length  132, alignment in bytes  0:	140.266	9.07812	11.5781	30.6406
Length  132, alignment in bytes  3:	140.156	9.14062	11.6094	30.7188
Length  133, alignment in bytes  0:	140.969	9.10938	11.6562	30.6875
Length  133, alignment in bytes  4:	141.25	8.95312	11.875	31.5469
Length  134, alignment in bytes  0:	142.25	9.04688	11.6562	30.7812
Length  134, alignment in bytes  5:	141.984	9.21875	11.0938	31.5
Length  135, alignment in bytes  0:	143.141	9.03125	11.625	30.7344
Length  135, alignment in bytes  6:	143.297	9.0625	11.1875	31.8125
Length  136, alignment in bytes  0:	144	9.07812	11.1719	30.4844
Length  136, alignment in bytes  7:	144.109	9.125	10.9375	31.7031
Length  137, alignment in bytes  0:	145.219	9.09375	11.0625	31.75
Length  137, alignment in bytes  0:	144.953	9.32812	11.0469	31.4688
Length  138, alignment in bytes  0:	146.188	9.10938	10.6875	31.7344
Length  138, alignment in bytes  1:	146.188	9.04688	10.7188	31.8438
Length  139, alignment in bytes  0:	146.922	9.09375	10.75	31.5469
Length  139, alignment in bytes  2:	147.234	8.98438	10.8594	31.6094
Length  140, alignment in bytes  0:	148.125	9.10938	10.8906	31.7031
Length  140, alignment in bytes  3:	147.953	9.32812	10.7344	31.5625
Length  141, alignment in bytes  0:	149.219	9.26562	10.875	31.7969
Length  141, alignment in bytes  4:	148.984	9.29688	12.9688	32.625
Length  142, alignment in bytes  0:	150.234	9.34375	10.9062	31.8594
Length  142, alignment in bytes  5:	150.016	9.28125	12.375	32.6094
Length  143, alignment in bytes  0:	151.062	9.34375	11.0156	31.5625
Length  143, alignment in bytes  6:	151.172	9.34375	12.0469	32.6719
Length  144, alignment in bytes  0:	151.906	9.20312	10.9375	31.6719
Length  144, alignment in bytes  7:	152.203	9.14062	12.0625	32.7656
Length  145, alignment in bytes  0:	152.984	9.26562	12.0938	32.7188
Length  145, alignment in bytes  0:	153.062	8.98438	12.0469	32.5781
Length  146, alignment in bytes  0:	154.125	9.14062	12.0781	32.5781
Length  146, alignment in bytes  1:	153.906	9.125	12.3906	32.5781
Length  147, alignment in bytes  0:	155.188	9.15625	12.3438	32.625
Length  147, alignment in bytes  2:	154.906	9.10938	12.1562	32.5781
Length  148, alignment in bytes  0:	156.188	9.3125	12.0781	32.6562
Length  148, alignment in bytes  3:	155.891	9.46875	12.3906	32.625
Length  149, alignment in bytes  0:	157.172	9.17188	12.3438	32.6562
Length  149, alignment in bytes  4:	156.938	9.17188	12.4844	33.6094
Length  150, alignment in bytes  0:	158.094	9.40625	12.4688	32.9062
Length  150, alignment in bytes  5:	158.016	9.32812	12.1875	33.625
Length  151, alignment in bytes  0:	159.016	9.07812	12.125	32.6562
Length  151, alignment in bytes  6:	159.062	9.125	11.7969	33.5
Length  152, alignment in bytes  0:	159.984	9.07812	12.0781	32.6406
Length  152, alignment in bytes  7:	160.062	9.35938	12	33.6406
Length  153, alignment in bytes  0:	160.938	9.34375	11.7969	33.4844
Length  153, alignment in bytes  0:	161.078	9.28125	11.7969	33.5938
Length  154, alignment in bytes  0:	161.969	9.07812	11.8281	33.4531
Length  154, alignment in bytes  1:	162.016	9.3125	11.5312	33.5312
Length  155, alignment in bytes  0:	162.953	8.95312	11.6875	33.3906
Length  155, alignment in bytes  2:	163	9.32812	11.7656	33.5938
Length  156, alignment in bytes  0:	163.938	9.09375	11.6875	33.5469
Length  156, alignment in bytes  3:	164	9.29688	12.0156	33.5312
Length  157, alignment in bytes  0:	165.016	9.04688	11.9375	33.7656
Length  157, alignment in bytes  4:	164.953	10.1562	13.5625	34.4219
Length  158, alignment in bytes  0:	166.031	9.375	11.9062	33.5938
Length  158, alignment in bytes  5:	165.844	9.85938	12.9844	34.75
Length  159, alignment in bytes  0:	167.062	8.9375	11.6719	33.9844
Length  159, alignment in bytes  6:	166.812	10.0156	12.8594	34.4531
Length  160, alignment in bytes  0:	168.094	9.32812	11.625	33.5938
Length  160, alignment in bytes  7:	167.859	9.78125	12.7188	34.5
Length  161, alignment in bytes  0:	169.016	9.875	12.8594	34.5781
Length  161, alignment in bytes  0:	168.938	10.1094	12.5312	34.5625
Length  162, alignment in bytes  0:	169.859	9.875	12.7656	34.2969
Length  162, alignment in bytes  1:	170.094	9.89062	13.0469	34.7969
Length  163, alignment in bytes  0:	170.812	9.90625	13.0781	34.5
Length  163, alignment in bytes  2:	171.047	9.84375	12.7031	34.5625
Length  164, alignment in bytes  0:	171.906	9.92188	12.5938	34.6719
Length  164, alignment in bytes  3:	171.875	9.625	13.1094	34.4844
Length  165, alignment in bytes  0:	173.047	9.89062	12.9844	34.4062
Length  165, alignment in bytes  4:	172.781	9.78125	13.5	35.5312
Length  166, alignment in bytes  0:	174.031	9.8125	12.6562	34.5781
Length  166, alignment in bytes  5:	173.891	10.1406	12.8906	35.6406
Length  167, alignment in bytes  0:	174.812	10.0312	13.125	34.5625
Length  167, alignment in bytes  6:	174.891	9.78125	12.9062	35.625
Length  168, alignment in bytes  0:	176.062	10.1562	12.6875	34.9531
Length  168, alignment in bytes  7:	175.781	9.84375	12.9531	35.5156
Length  169, alignment in bytes  0:	176.938	9.95312	12.9062	35.6406
Length  169, alignment in bytes  0:	176.953	9.84375	12.9531	35.5469
Length  170, alignment in bytes  0:	177.766	9.70312	12.9531	35.4531
Length  170, alignment in bytes  1:	177.969	10.125	13.0312	35.5312
Length  171, alignment in bytes  0:	178.938	10	13.0312	35.5156
Length  171, alignment in bytes  2:	178.781	9.82812	13.0312	35.625
Length  172, alignment in bytes  0:	179.984	9.875	12.9531	35.5312
Length  172, alignment in bytes  3:	179.922	10.1562	13.1719	35.5156
Length  173, alignment in bytes  0:	180.734	9.92188	13.125	35.3281
Length  173, alignment in bytes  4:	180.953	9.82812	15.0156	36.75
Length  174, alignment in bytes  0:	181.953	9.89062	13.0156	35.7344
Length  174, alignment in bytes  5:	181.984	9.8125	14.0312	36.6719
Length  175, alignment in bytes  0:	182.891	9.875	12.8594	35.6562
Length  175, alignment in bytes  6:	182.75	9.92188	13.7031	36.5
Length  176, alignment in bytes  0:	183.906	9.8125	13.1719	35.7031
Length  176, alignment in bytes  7:	183.969	9.70312	13.6562	36.6719
Length  177, alignment in bytes  0:	184.75	9.84375	13.4375	36.5469
Length  177, alignment in bytes  0:	184.766	10.1094	13.6094	36.6875
Length  178, alignment in bytes  0:	185.953	9.92188	13.5625	36.7031
Length  178, alignment in bytes  1:	185.891	9.8125	13.5	36.8281
Length  179, alignment in bytes  0:	186.734	9.82812	13.6094	36.4375
Length  179, alignment in bytes  2:	186.797	9.85938	13.5625	36.5
Length  180, alignment in bytes  0:	187.953	9.75	13.7031	36.8125
Length  180, alignment in bytes  3:	187.859	9.82812	13.5156	36.4844
Length  181, alignment in bytes  0:	188.703	9.82812	13.4688	36.6719
Length  181, alignment in bytes  4:	188.797	9.8125	14.1562	37.8281
Length  182, alignment in bytes  0:	189.922	9.82812	13.7812	36.8594
Length  182, alignment in bytes  5:	189.844	9.6875	13.375	37.6094
Length  183, alignment in bytes  0:	190.719	9.79688	13.5781	36.5938
Length  183, alignment in bytes  6:	190.781	9.82812	13.2969	37.8281
Length  184, alignment in bytes  0:	191.875	9.8125	13.5781	36.5156
Length  184, alignment in bytes  7:	191.891	9.71875	13.3906	37.7969
Length  185, alignment in bytes  0:	192.766	10.0938	13.1094	37.9531
Length  185, alignment in bytes  0:	192.703	10.1094	13.125	37.6406
Length  186, alignment in bytes  0:	193.734	10.25	13.4062	37.625
Length  186, alignment in bytes  1:	193.875	9.78125	13.125	37.875
Length  187, alignment in bytes  0:	194.875	9.82812	13.2656	37.8438
Length  187, alignment in bytes  2:	194.734	10.0938	13.125	37.6719
Length  188, alignment in bytes  0:	195.672	9.89062	13.3125	37.7656
Length  188, alignment in bytes  3:	195.703	9.89062	13.4219	37.7812
Length  189, alignment in bytes  0:	196.797	9.92188	13.2812	37.8438
Length  189, alignment in bytes  4:	196.703	11.8906	15.1406	38.8125
Length  190, alignment in bytes  0:	197.656	10.3438	13.3594	37.5312
Length  190, alignment in bytes  5:	197.688	11.2031	14.6406	38.875
Length  191, alignment in bytes  0:	198.734	10.2656	13.3281	37.7656
Length  191, alignment in bytes  6:	198.844	11.2344	14.375	38.875
Length  192, alignment in bytes  0:	199.875	10.5	13.2344	37.9531
Length  192, alignment in bytes  7:	199.812	11.1719	14.3594	38.7188
Length  193, alignment in bytes  0:	200.734	10.7188	14.375	38.8125
Length  193, alignment in bytes  0:	200.688	10.8281	14.1406	38.7344
Length  194, alignment in bytes  0:	201.641	10.7188	14.2031	38.8594
Length  194, alignment in bytes  1:	201.625	10.7969	14.375	38.8125
Length  195, alignment in bytes  0:	202.688	10.7031	14.1562	38.8125
Length  195, alignment in bytes  2:	202.734	10.7656	14.3125	38.7812
Length  196, alignment in bytes  0:	203.812	10.8125	14.3125	38.8594
Length  196, alignment in bytes  3:	203.812	10.7031	14.4531	38.7969
Length  197, alignment in bytes  0:	204.844	10.7812	14.375	38.9375
Length  197, alignment in bytes  4:	204.781	10.6875	14.7188	39.9375
Length  198, alignment in bytes  0:	205.781	10.7031	14.5625	38.7188
Length  198, alignment in bytes  5:	205.688	10.7031	14.1719	40
Length  199, alignment in bytes  0:	206.688	10.7031	14.25	38.9844
Length  199, alignment in bytes  6:	206.625	10.7969	14.1094	39.625
Length  200, alignment in bytes  0:	207.594	10.6875	14.5156	38.7969
Length  200, alignment in bytes  7:	207.609	10.7188	14.3594	39.7656
Length  201, alignment in bytes  0:	208.609	10.7969	14	39.7031
Length  201, alignment in bytes  0:	208.625	10.7656	13.9062	39.7188
Length  202, alignment in bytes  0:	209.625	10.75	13.9844	39.7344
Length  202, alignment in bytes  1:	209.641	10.75	14.1094	39.75
Length  203, alignment in bytes  0:	210.641	10.8125	14.0938	39.75
Length  203, alignment in bytes  2:	210.656	10.7812	13.9531	39.7031
Length  204, alignment in bytes  0:	211.672	10.8281	14.0625	39.8438
Length  204, alignment in bytes  3:	211.672	10.6719	13.9531	39.7188
Length  205, alignment in bytes  0:	212.625	11.0156	14.0156	39.6719
Length  205, alignment in bytes  4:	212.672	10.75	16.25	40.6875
Length  206, alignment in bytes  0:	213.672	10.8438	14.4844	39.7188
Length  206, alignment in bytes  5:	213.672	10.75	16.3906	40.8594
Length  207, alignment in bytes  0:	214.641	10.7031	14.4375	39.9375
Length  207, alignment in bytes  6:	214.672	10.7656	15.8438	40.7656
Length  208, alignment in bytes  0:	215.625	10.7969	14.5156	39.9688
Length  208, alignment in bytes  7:	215.625	10.7344	15.5938	40.7031
Length  209, alignment in bytes  0:	216.578	10.7969	15.5781	40.8594
Length  209, alignment in bytes  0:	216.578	10.8438	15.5312	40.6562
Length  210, alignment in bytes  0:	217.578	10.7969	15.2656	40.7031
Length  210, alignment in bytes  1:	217.578	10.7656	15.2344	40.7969
Length  211, alignment in bytes  0:	218.609	10.75	15.125	40.9375
Length  211, alignment in bytes  2:	218.609	10.75	15.2656	40.9844
Length  212, alignment in bytes  0:	219.641	10.7344	15.2188	40.9688
Length  212, alignment in bytes  3:	219.656	10.7344	15.3125	41.0781
Length  213, alignment in bytes  0:	220.688	10.7812	15.2344	40.7656
Length  213, alignment in bytes  4:	220.703	10.875	15.5156	41.9531
Length  214, alignment in bytes  0:	221.641	10.7812	15.4688	40.8906
Length  214, alignment in bytes  5:	221.656	10.8125	15.1719	41.9375
Length  215, alignment in bytes  0:	222.578	10.7031	15.2188	40.7969
Length  215, alignment in bytes  6:	222.531	10.7344	15.0938	41.7812
Length  216, alignment in bytes  0:	223.516	10.6562	15.2031	40.9844
Length  216, alignment in bytes  7:	223.531	10.7188	15.3281	41.8906
Length  217, alignment in bytes  0:	224.562	10.7344	15.0938	42.1719
Length  217, alignment in bytes  0:	224.641	10.7188	14.9531	42.1406
Length  218, alignment in bytes  0:	225.672	10.6875	15.0625	42.1094
Length  218, alignment in bytes  1:	225.609	10.7812	15.0781	42.1094
Length  219, alignment in bytes  0:	226.594	10.7344	14.9531	42
Length  219, alignment in bytes  2:	226.531	10.75	15.1875	41.9219
Length  220, alignment in bytes  0:	227.516	10.7188	15.0312	41.875
Length  220, alignment in bytes  3:	227.516	10.7969	14.8594	41.9688
Length  221, alignment in bytes  0:	228.547	10.6875	15	42.0312
Length  221, alignment in bytes  4:	228.594	12.25	16.7656	43.2656
Length  222, alignment in bytes  0:	229.625	11.0156	15.5	42.125
Length  222, alignment in bytes  5:	229.562	12.4375	16.5938	42.9844
Length  223, alignment in bytes  0:	230.547	11.0625	15.5469	41.875
Length  223, alignment in bytes  6:	230.484	12	16.8906	43.1875
Length  224, alignment in bytes  0:	231.516	11.1406	15.5625	41.8594
Length  224, alignment in bytes  7:	231.562	12.2656	16.3906	43.3125
Length  225, alignment in bytes  0:	232.594	11.5781	16.625	43.4062
Length  225, alignment in bytes  0:	232.531	11.7344	16.0625	43.0312
Length  226, alignment in bytes  0:	233.516	11.5781	16.0781	43.0469
Length  226, alignment in bytes  1:	233.5	11.5625	16	43.0938
Length  227, alignment in bytes  0:	234.531	11.8594	16.0625	43.125
Length  227, alignment in bytes  2:	234.516	11.4688	15.8125	42.4688
Length  228, alignment in bytes  0:	231.844	11.3125	15.9375	42.4219
Length  228, alignment in bytes  3:	230.062	11.4688	15.5938	41.7656
Length  229, alignment in bytes  0:	229.094	11.4062	15.6875	41.6719
Length  229, alignment in bytes  4:	225.719	11	15.4375	42.1719
Length  230, alignment in bytes  0:	226.484	11.0469	15.8125	41.2344
Length  230, alignment in bytes  5:	223.109	11.0469	15.125	41.5
Length  231, alignment in bytes  0:	224.031	10.9062	15.2031	40.7344
Length  231, alignment in bytes  6:	220.703	10.7969	14.6719	40.9219
Length  232, alignment in bytes  0:	221.641	10.7344	15.125	39.875
Length  232, alignment in bytes  7:	218.344	10.4531	14.4062	40.4062
Length  233, alignment in bytes  0:	219.25	10.5625	14.4688	39.8125
Length  233, alignment in bytes  0:	216.078	10.5469	14.1406	39.5625
Length  234, alignment in bytes  0:	216.922	10.4219	14.0469	39.1719
Length  234, alignment in bytes  1:	213.812	10.4688	13.9375	38.9531
Length  235, alignment in bytes  0:	214.703	10.2188	13.875	38.5312
Length  235, alignment in bytes  2:	211.734	10.1094	13.9219	38.4844
Length  236, alignment in bytes  0:	212.672	10.2969	13.7344	38.0938
Length  236, alignment in bytes  3:	209.75	9.9375	13.7031	38.125
Length  237, alignment in bytes  0:	210.516	9.9375	13.4375	37.6719
Length  237, alignment in bytes  4:	207.547	9.90625	15.2812	38.3281
Length  238, alignment in bytes  0:	208.406	9.75	13.6094	37.0156
Length  238, alignment in bytes  5:	205.688	9.70312	14.6562	38.0469
Length  239, alignment in bytes  0:	206.562	9.89062	13.6094	36.6875
Length  239, alignment in bytes  6:	203.672	9.76562	14.4219	37.2812
Length  240, alignment in bytes  0:	204.469	9.53125	13.3906	36.5
Length  240, alignment in bytes  7:	201.938	9.39062	14.1875	36.9062
Length  241, alignment in bytes  0:	202.688	9.39062	13.9062	36.9531
Length  241, alignment in bytes  0:	199.953	9.39062	13.7969	36.2656
Length  242, alignment in bytes  0:	200.859	9.53125	13.8438	36.5312
Length  242, alignment in bytes  1:	198.344	9.21875	13.5156	36.0156
Length  243, alignment in bytes  0:	198.938	9.14062	13.7188	35.8438
Length  243, alignment in bytes  2:	196.578	9	13.4531	35.5
Length  244, alignment in bytes  0:	197.359	9.10938	13.4531	35.5312
Length  244, alignment in bytes  3:	195.656	9.26562	13.3438	34.9844
Length  245, alignment in bytes  0:	195.719	8.90625	13.2344	35.0625
Length  245, alignment in bytes  4:	195.375	8.90625	13.25	35.25
Length  246, alignment in bytes  0:	193.969	8.90625	13.0781	34.5156
Length  246, alignment in bytes  5:	194	8.84375	13.0625	35.1562
Length  247, alignment in bytes  0:	192.266	8.95312	12.9375	34.2969
Length  247, alignment in bytes  6:	192.453	8.73438	12.7969	34.9219
Length  248, alignment in bytes  0:	190.625	8.65625	12.8438	33.7344
Length  248, alignment in bytes  7:	190.906	8.65625	12.6406	34.6406
Length  249, alignment in bytes  0:	189.172	8.5625	12.4375	34.1094
Length  249, alignment in bytes  0:	189.328	8.57812	12.4219	34.125
Length  250, alignment in bytes  0:	189.359	8.42188	12.375	33.7031
Length  250, alignment in bytes  1:	187.797	8.42188	12.3125	33.6875
Length  251, alignment in bytes  0:	188.344	8.54688	12.25	33.6094
Length  251, alignment in bytes  2:	186.375	8.5	12.1406	33.5938
Length  252, alignment in bytes  0:	186.859	8.40625	12.1406	33.3125
Length  252, alignment in bytes  3:	184.969	8.26562	11.9688	33.0469
Length  253, alignment in bytes  0:	185.406	8.29688	12.1094	32.875
Length  253, alignment in bytes  4:	184.344	9.09375	13.0156	33.3125
Length  254, alignment in bytes  0:	184	8.4375	12.2656	32.4375
Length  254, alignment in bytes  5:	184.234	8.9375	12.5781	33
Length  255, alignment in bytes  0:	182.703	8.3125	12.1406	32.2188
Length  255, alignment in bytes  6:	182.719	8.76562	12.5938	32.8438
Length  256, alignment in bytes  0:	181.469	8.32812	11.9219	31.9062
Length  256, alignment in bytes  7:	181.266	8.625	12.4375	32.5469
Length  257, alignment in bytes  0:	181.422	8.48438	12.0625	32.1719
Length  257, alignment in bytes  0:	180.062	8.54688	12.1094	32.2031
Length  258, alignment in bytes  0:	180.688	8.40625	12.0938	32.0938
Length  258, alignment in bytes  1:	178.938	8.32812	12	31.9219
Length  259, alignment in bytes  0:	179.406	8.32812	12	31.9219
Length  259, alignment in bytes  2:	177.766	8.23438	11.8594	31.4375
Length  260, alignment in bytes  0:	178.328	8.15625	11.8125	31.4375
Length  260, alignment in bytes  3:	178.125	8.07812	11.8438	31.25
Length  261, alignment in bytes  0:	176.938	8.17188	11.7344	31.1406
Length  261, alignment in bytes  4:	177.094	8	11.8125	31.875
Length  262, alignment in bytes  0:	175.656	8.15625	11.6719	30.75
Length  262, alignment in bytes  5:	175.672	8.07812	11.6562	31.4062
Length  263, alignment in bytes  0:	176.516	8.0625	11.6094	30.5938
Length  263, alignment in bytes  6:	174.562	8.07812	11.5625	31.2812
Length  264, alignment in bytes  0:	175.047	8.03125	11.5625	30.4062
Length  264, alignment in bytes  7:	173.625	7.84375	11.2656	30.875
Length  265, alignment in bytes  0:	174.156	7.96875	11.2969	30.9062
Length  265, alignment in bytes  0:	173.953	7.73438	11.2812	30.5312
Length  266, alignment in bytes  0:	172.766	7.76562	11.2031	30.5
Length  266, alignment in bytes  1:	172.969	7.89062	11.1875	30.5156
Length  267, alignment in bytes  0:	171.875	7.54688	11.0938	30.1875
Length  267, alignment in bytes  2:	171.688	7.59375	11.125	30.3438
Length  268, alignment in bytes  0:	172.234	7.85938	11.0938	30.0938
Length  268, alignment in bytes  3:	170.672	7.54688	10.9531	29.8594
Length  269, alignment in bytes  0:	171.406	7.5625	11.125	29.8906
Length  269, alignment in bytes  4:	169.906	7.5	11.8281	30.2812
Length  270, alignment in bytes  0:	170.047	7.60938	11.1094	29.6406
Length  270, alignment in bytes  5:	170.078	7.64062	11.75	30.2344
Length  271, alignment in bytes  0:	169.203	7.46875	10.875	29.3125
Length  271, alignment in bytes  6:	169.25	7.35938	11.3125	29.9844
Length  272, alignment in bytes  0:	169.266	7.5	10.9688	29.125
Length  272, alignment in bytes  7:	167.953	7.46875	11.25	29.75
Length  273, alignment in bytes  0:	168.547	7.34375	11.4531	29.5938
Length  273, alignment in bytes  0:	167.047	7.28125	11.3438	29.2969
Length  274, alignment in bytes  0:	167.75	7.40625	11.2969	29.3125
Length  274, alignment in bytes  1:	167.766	7.25	11.0312	29.1875
Length  275, alignment in bytes  0:	166.641	7.09375	11.2188	29.1562
Length  275, alignment in bytes  2:	166.547	7.375	11.2656	29.0781
Length  276, alignment in bytes  0:	166.172	7.29688	11.1719	28.8125
Length  276, alignment in bytes  3:	165.578	7.3125	11.125	28.7031
Length  277, alignment in bytes  0:	166.297	7.28125	11.0781	28.7656
Length  277, alignment in bytes  4:	164.797	7.25	10.6406	29.125
Length  278, alignment in bytes  0:	165.359	6.98438	11	28.6406
Length  278, alignment in bytes  5:	165.297	6.98438	10.5469	29.0312
Length  279, alignment in bytes  0:	164.203	7.15625	10.9844	28.3281
Length  279, alignment in bytes  6:	164.156	7	10.5156	28.8906
Length  280, alignment in bytes  0:	164.016	7.01562	10.875	28.0469
Length  280, alignment in bytes  7:	163.25	7.0625	10.375	28.6719
Length  281, alignment in bytes  0:	163.906	7.0625	10.3438	28.5312
Length  281, alignment in bytes  0:	162.547	6.875	10.1719	28.3281
Length  282, alignment in bytes  0:	163.125	6.82812	10.2188	28.3125
Length  282, alignment in bytes  1:	163.094	7.0625	10.25	28.3438
Length  283, alignment in bytes  0:	162.109	6.9375	10.125	28.1875
Length  283, alignment in bytes  2:	162.047	6.85938	10.1562	28.1875
Length  284, alignment in bytes  0:	162.516	6.78125	10.1719	28.0938
Length  284, alignment in bytes  3:	162.469	6.82812	10.0938	28.1406
Length  285, alignment in bytes  0:	163.031	7	10.1562	28.0469
Length  285, alignment in bytes  4:	163.062	7.78125	11.3594	28.6875
Length  286, alignment in bytes  0:	163.703	7.125	10.3438	28.2031
Length  286, alignment in bytes  5:	163.75	7.53125	11.1406	28.5469
Length  287, alignment in bytes  0:	164.406	7.28125	10.1875	28.125
Length  287, alignment in bytes  6:	164.422	7.39062	11.0938	28.5312
Length  288, alignment in bytes  0:	164.969	7.20312	10.2969	28.1094
Length  288, alignment in bytes  7:	164.875	7.46875	11.0781	28.5469
Length  289, alignment in bytes  0:	165.328	7.34375	10.9219	28.7031
Length  289, alignment in bytes  0:	165.281	7.26562	11.2656	28.6406
Length  290, alignment in bytes  0:	165.812	7.32812	11.4844	28.7812
Length  290, alignment in bytes  1:	165.844	7.29688	11.4062	28.5312
Length  291, alignment in bytes  0:	166.516	7.1875	11.0469	28.6406
Length  291, alignment in bytes  2:	166.531	7.26562	11.0625	28.6406
Length  292, alignment in bytes  0:	167.203	7.32812	11.0469	28.5156
Length  292, alignment in bytes  3:	167.203	7.26562	11.0781	28.6875
Length  293, alignment in bytes  0:	167.719	7.28125	11.0625	28.7031
Length  293, alignment in bytes  4:	167.594	7.23438	10.7969	29.2656
Length  294, alignment in bytes  0:	168.031	7.3125	11.5469	28.7969
Length  294, alignment in bytes  5:	168.047	7.26562	10.8906	29.2344
Length  295, alignment in bytes  0:	168.688	7.25	11.75	28.6875
Length  295, alignment in bytes  6:	168.781	7.32812	10.7656	29.0938
Length  296, alignment in bytes  0:	169.422	7.28125	11.125	28.6406
Length  296, alignment in bytes  7:	169.406	7.4375	10.8594	29.2656
Length  297, alignment in bytes  0:	169.859	7.21875	10.9375	29.3906
Length  297, alignment in bytes  0:	169.734	7.20312	11.2969	29.2344
Length  298, alignment in bytes  0:	170.25	7.29688	10.9062	29.2188
Length  298, alignment in bytes  1:	170.328	7.28125	11	29.1562
Length  299, alignment in bytes  0:	171	7.26562	10.7812	29.2344
Length  299, alignment in bytes  2:	171.109	7.23438	10.6875	29.1875
Length  300, alignment in bytes  0:	171.594	7.26562	10.7812	29.2344
Length  300, alignment in bytes  3:	171.469	7.25	10.8125	29.25
Length  301, alignment in bytes  0:	171.938	7.25	10.8906	29.1406
Length  301, alignment in bytes  4:	171.969	7.29688	11.875	29.7812
Length  302, alignment in bytes  0:	172.641	7.21875	11.25	29.1406
Length  302, alignment in bytes  5:	172.75	7.29688	11.5938	29.7812
Length  303, alignment in bytes  0:	173.25	7.23438	11.4375	29.2656
Length  303, alignment in bytes  6:	173.094	7.26562	11.7188	29.7969
Length  304, alignment in bytes  0:	173.578	7.28125	11.1875	29.2656
Length  304, alignment in bytes  7:	173.688	7.20312	11.7344	29.7188
Length  305, alignment in bytes  0:	174.375	7.21875	11.625	29.6406
Length  305, alignment in bytes  0:	174.406	7.1875	13.4062	29.7969
Length  306, alignment in bytes  0:	174.812	7.26562	11.6562	29.8906
Length  306, alignment in bytes  1:	174.703	7.26562	11.6719	29.6719
Length  307, alignment in bytes  0:	175.328	7.29688	11.6094	29.7031
Length  307, alignment in bytes  2:	175.484	7.4375	11.5938	29.6875
Length  308, alignment in bytes  0:	176.078	7.21875	11.6406	29.8594
Length  308, alignment in bytes  3:	175.891	7.34375	11.6562	29.7188
Length  309, alignment in bytes  0:	176.359	7.26562	11.6094	29.7344
Length  309, alignment in bytes  4:	176.453	7.26562	13.3125	30.1875
Length  310, alignment in bytes  0:	177.156	7.25	11.625	29.7188
Length  310, alignment in bytes  5:	177.094	7.23438	11.4219	30.3125
Length  311, alignment in bytes  0:	177.469	7.26562	11.7188	29.8438
Length  311, alignment in bytes  6:	177.531	7.28125	13.5	30.3125
Length  312, alignment in bytes  0:	178.266	7.23438	11.5938	29.7031
Length  312, alignment in bytes  7:	178.25	7.21875	13.5	30.2812
Length  313, alignment in bytes  0:	178.625	7.23438	11.25	30.3281
Length  313, alignment in bytes  0:	178.609	7.26562	11.4219	30.3438
Length  314, alignment in bytes  0:	179.312	7.20312	13.4531	30.2344
Length  314, alignment in bytes  1:	179.406	7.20312	11.4062	30.2812
Length  315, alignment in bytes  0:	179.797	7.28125	11.4375	30.3125
Length  315, alignment in bytes  2:	179.688	7.28125	11.3906	30.1875
Length  316, alignment in bytes  0:	180.391	7.23438	11.4844	30.2969
Length  316, alignment in bytes  3:	180.5	7.26562	11.3594	30.2969
Length  317, alignment in bytes  0:	180.922	7.20312	11.3906	30.375
Length  317, alignment in bytes  4:	180.797	8.20312	11.8281	30.8594
Length  318, alignment in bytes  0:	181.516	7.51562	11.5625	30.2812
Length  318, alignment in bytes  5:	181.625	7.98438	11.7344	30.9219
Length  319, alignment in bytes  0:	182.016	7.46875	11.6875	30.2969
Length  319, alignment in bytes  6:	181.922	7.95312	11.7188	30.8281
Length  320, alignment in bytes  0:	182.688	7.46875	11.625	30.1875
Length  320, alignment in bytes  7:	182.703	7.90625	11.7031	30.7656
Length  321, alignment in bytes  0:	183.062	7.89062	11.6875	30.9375
Length  321, alignment in bytes  0:	183.078	7.79688	11.6875	30.8125
Length  322, alignment in bytes  0:	183.828	7.78125	11.6719	30.8281
Length  322, alignment in bytes  1:	183.719	7.76562	11.6875	30.9531
Length  323, alignment in bytes  0:	184.156	7.79688	11.7031	30.8906
Length  323, alignment in bytes  2:	184.281	7.78125	11.6562	30.75
Length  324, alignment in bytes  0:	184.922	7.71875	11.6719	30.8906
Length  324, alignment in bytes  3:	184.75	7.79688	11.75	30.9531
Length  325, alignment in bytes  0:	185.344	7.84375	11.7188	30.8906
Length  325, alignment in bytes  4:	185.5	7.78125	11.8594	31.4062
Length  326, alignment in bytes  0:	185.906	7.79688	11.7656	31.1719
Length  326, alignment in bytes  5:	185.797	7.79688	11.8906	31.4219
Length  327, alignment in bytes  0:	186.562	7.73438	11.9375	30.875
Length  327, alignment in bytes  6:	186.531	7.82812	11.8438	31.4219
Length  328, alignment in bytes  0:	186.922	7.78125	11.8125	30.7812
Length  328, alignment in bytes  7:	187.047	7.6875	11.8438	31.375
Length  329, alignment in bytes  0:	187.688	7.85938	11.75	31.5
Length  329, alignment in bytes  0:	187.484	7.82812	11.8281	31.5156
Length  330, alignment in bytes  0:	188.172	7.78125	11.7344	31.5
Length  330, alignment in bytes  1:	188.25	7.67188	11.8125	31.5469
Length  331, alignment in bytes  0:	188.594	7.8125	11.7969	31.4375
Length  331, alignment in bytes  2:	188.703	7.78125	11.7656	31.4531
Length  332, alignment in bytes  0:	189.359	7.82812	11.7344	31.4375
Length  332, alignment in bytes  3:	189.188	7.79688	11.7344	31.4844
Length  333, alignment in bytes  0:	189.812	7.79688	11.7344	31.4688
Length  333, alignment in bytes  4:	189.906	7.70312	16.9375	32.1406
Length  334, alignment in bytes  0:	190.266	7.8125	11.7969	31.5156
Length  334, alignment in bytes  5:	190.391	7.76562	16.8281	31.9062
Length  335, alignment in bytes  0:	190.953	7.79688	11.8438	31.5
Length  335, alignment in bytes  6:	190.812	7.8125	17.0625	31.9688
Length  336, alignment in bytes  0:	191.594	7.78125	11.8438	31.4062
Length  336, alignment in bytes  7:	191.453	7.75	17.0781	32.0938
Length  337, alignment in bytes  0:	191.969	7.73438	16.3594	31.9531
Length  337, alignment in bytes  0:	192.141	7.82812	16.4844	32.0312
Length  338, alignment in bytes  0:	192.469	7.75	16.5312	31.9375
Length  338, alignment in bytes  1:	192.688	7.76562	16.4844	31.9844
Length  339, alignment in bytes  0:	193.094	7.82812	16.4219	32.1094
Length  339, alignment in bytes  2:	193.125	7.76562	16.4531	31.9688
Length  340, alignment in bytes  0:	549.391	7.73438	16.5312	32.0625
Length  340, alignment in bytes  3:	193.812	7.75	16.6562	31.9844
Length  341, alignment in bytes  0:	194.203	7.78125	16.6562	32
Length  341, alignment in bytes  4:	194.234	7.875	16.5625	32.4844
Length  342, alignment in bytes  0:	194.875	7.79688	16.75	32.0781
Length  342, alignment in bytes  5:	194.688	7.84375	16.4844	32.5469
Length  343, alignment in bytes  0:	195.484	7.84375	16.75	32
Length  343, alignment in bytes  6:	195.344	7.90625	16.5	32.5938
Length  344, alignment in bytes  0:	195.891	7.78125	16.8125	31.9844
Length  344, alignment in bytes  7:	196.016	7.75	16.5312	32.6094
Length  345, alignment in bytes  0:	196.344	7.78125	16.4688	32.5156
Length  345, alignment in bytes  0:	196.578	7.79688	16.3438	32.4688
Length  346, alignment in bytes  0:	196.953	7.76562	16.5938	32.5156
Length  346, alignment in bytes  1:	197.062	7.6875	16.4375	32.5625
Length  347, alignment in bytes  0:	197.562	7.75	16.5156	32.6094
Length  347, alignment in bytes  2:	197.547	7.8125	16.4844	32.5156
Length  348, alignment in bytes  0:	198.219	7.78125	16.5625	32.6719
Length  348, alignment in bytes  3:	198.047	7.70312	16.4844	32.4844
Length  349, alignment in bytes  0:	198.797	7.75	16.5312	32.5469
Length  349, alignment in bytes  4:	198.594	8.57812	17.375	33.0781
Length  350, alignment in bytes  0:	199.328	7.95312	16.75	32.5156
Length  350, alignment in bytes  5:	199.234	8.48438	17.1719	33.2188
Length  351, alignment in bytes  0:	199.797	7.98438	16.6562	32.5625
Length  351, alignment in bytes  6:	199.844	8.32812	17.1562	33.2344
Length  352, alignment in bytes  0:	200.312	8.0625	16.8281	32.4688
Length  352, alignment in bytes  7:	200.438	8.3125	17.2812	33.2031
Length  353, alignment in bytes  0:	200.828	8.20312	17.0781	33.0156
Length  353, alignment in bytes  0:	201.016	8.14062	17.2031	33.125
Length  354, alignment in bytes  0:	201.359	8.21875	17.1562	33.0938
Length  354, alignment in bytes  1:	201.562	8.17188	17.1406	33.1719
Length  355, alignment in bytes  0:	201.906	8.17188	17.2344	33.0625
Length  355, alignment in bytes  2:	202.109	8.17188	17.0938	33.1562
Length  356, alignment in bytes  0:	202.5	8.29688	17.2812	33.125
Length  356, alignment in bytes  3:	202.641	8.1875	17.2188	33.0156
Length  357, alignment in bytes  0:	203.094	8.14062	17.2969	33.125
Length  357, alignment in bytes  4:	203.141	8.20312	17.0781	33.6094
Length  358, alignment in bytes  0:	203.672	8.10938	17.2344	33.1562
Length  358, alignment in bytes  5:	203.672	8.25	17	33.5469
Length  359, alignment in bytes  0:	441.5	8.14062	17.3281	33.2969
Length  359, alignment in bytes  6:	204.188	8.15625	17.0156	33.5625
Length  360, alignment in bytes  0:	204.828	8.21875	17.25	33.2031
Length  360, alignment in bytes  7:	204.766	8.14062	17.1406	33.5156
Length  361, alignment in bytes  0:	205.391	8.23438	17.0781	33.7969
Length  361, alignment in bytes  0:	205.312	8.17188	17.0156	33.5469
Length  362, alignment in bytes  0:	205.922	8.125	17.0469	33.8125
Length  362, alignment in bytes  1:	205.906	8.17188	16.875	33.7031
Length  363, alignment in bytes  0:	226.562	8.17188	17.0469	33.7344
Length  363, alignment in bytes  2:	206.516	8.15625	17.0156	33.7031
Length  364, alignment in bytes  0:	206.953	8.25	17.0781	33.5625
Length  364, alignment in bytes  3:	207.078	8.14062	17.0625	33.6562
Length  365, alignment in bytes  0:	207.484	8.28125	16.9688	33.7031
Length  365, alignment in bytes  4:	207.656	8.15625	17.5781	34.1719
Length  366, alignment in bytes  0:	208.016	8.17188	17.4219	33.6562
Length  366, alignment in bytes  5:	208.188	8.20312	17.5625	34.1875
Length  367, alignment in bytes  0:	208.562	8.10938	17.1562	33.5781
Length  367, alignment in bytes  6:	208.75	8.1875	17.5469	34.2031
Length  368, alignment in bytes  0:	209.188	8.15625	17.4062	33.7656
Length  368, alignment in bytes  7:	209.266	8.20312	17.6562	34.2969
Length  369, alignment in bytes  0:	209.781	8.25	17.6719	34.1406
Length  369, alignment in bytes  0:	209.781	8.14062	17.7812	34.2969
Length  370, alignment in bytes  0:	210.391	8.1875	17.625	34.125
Length  370, alignment in bytes  1:	210.281	8.1875	17.625	34.2344
Length  371, alignment in bytes  0:	210.969	8.125	17.625	34.2188
Length  371, alignment in bytes  2:	210.812	8.21875	17.625	34.1719
Length  372, alignment in bytes  0:	211.531	8.14062	17.6719	34.2188
Length  372, alignment in bytes  3:	211.391	8.20312	17.6562	34.1094
Length  373, alignment in bytes  0:	212.047	8.14062	17.6094	34.2344
Length  373, alignment in bytes  4:	212.016	8.1875	17.5781	34.7656
Length  374, alignment in bytes  0:	212.531	8.17188	17.8438	34.3281
Length  374, alignment in bytes  5:	212.609	8.23438	17.6094	34.6875
Length  375, alignment in bytes  0:	213.031	8.1875	17.6719	34.0938
Length  375, alignment in bytes  6:	213.188	8.14062	17.5938	34.7969
Length  376, alignment in bytes  0:	213.625	8.32812	17.7031	34.1406
Length  376, alignment in bytes  7:	213.703	8.15625	17.5625	34.8438
Length  377, alignment in bytes  0:	214.25	8.15625	17.4375	34.8281
Length  377, alignment in bytes  0:	214.172	8.14062	17.5781	34.8125
Length  378, alignment in bytes  0:	214.875	8.17188	17.5625	34.7656
Length  378, alignment in bytes  1:	214.719	8.17188	17.5	34.7344
Length  379, alignment in bytes  0:	215.391	8.15625	17.5938	34.875
Length  379, alignment in bytes  2:	215.328	8.1875	17.4844	34.75
Length  380, alignment in bytes  0:	215.859	8.125	17.5312	34.875
Length  380, alignment in bytes  3:	215.953	8.14062	17.5156	34.7969
Length  381, alignment in bytes  0:	216.375	8.14062	17.4219	34.7344
Length  381, alignment in bytes  4:	216.516	9.15625	18.2344	35.3438
Length  382, alignment in bytes  0:	216.969	8.34375	17.5156	34.6562
Length  382, alignment in bytes  5:	216.969	9.09375	18.1562	35.2969
Length  383, alignment in bytes  0:	217.625	8.35938	17.5156	34.8125
Length  383, alignment in bytes  6:	217.484	8.89062	18.1875	35.2344
Length  384, alignment in bytes  0:	218.141	8.40625	17.6406	35.0781
Length  384, alignment in bytes  7:	218.141	8.875	17.9375	35.3594
Length  385, alignment in bytes  0:	218.594	8.79688	18.1875	35.2969
Length  385, alignment in bytes  0:	218.734	8.67188	18.125	35.3906
Length  386, alignment in bytes  0:	219.219	8.70312	17.9844	35.3125
Length  386, alignment in bytes  1:	219.188	8.76562	18.25	35.3125
Length  387, alignment in bytes  0:	219.859	8.6875	18.0938	35.3906
Length  387, alignment in bytes  2:	219.766	8.70312	18.0156	35.2812
Length  388, alignment in bytes  0:	220.297	8.85938	18.0938	35.2969
Length  388, alignment in bytes  3:	220.406	8.71875	18.0469	35.4219
Length  389, alignment in bytes  0:	220.875	8.71875	17.9688	35.2969
Length  389, alignment in bytes  4:	220.875	8.75	18.0156	35.8906
Length  390, alignment in bytes  0:	221.516	8.73438	18.125	35.6719
Length  390, alignment in bytes  5:	221.406	8.73438	17.9219	35.8906
Length  391, alignment in bytes  0:	221.969	8.70312	18.1875	35.4531
Length  391, alignment in bytes  6:	222.062	8.70312	17.9688	35.8906
Length  392, alignment in bytes  0:	222.5	8.78125	18.2031	35.5938
Length  392, alignment in bytes  7:	222.531	8.70312	18.0938	35.9375
Length  393, alignment in bytes  0:	223.172	8.67188	17.8906	36
Length  393, alignment in bytes  0:	223.062	8.73438	17.9844	35.7812
Length  394, alignment in bytes  0:	223.656	8.6875	17.9844	35.9688
Length  394, alignment in bytes  1:	223.734	8.65625	17.9844	35.8438
Length  395, alignment in bytes  0:	224.188	8.71875	18.0312	35.8438
Length  395, alignment in bytes  2:	224.219	8.70312	17.9531	35.9375
Length  396, alignment in bytes  0:	224.859	8.6875	18.0312	35.875
Length  396, alignment in bytes  3:	224.734	8.71875	18.0156	35.7812
Length  397, alignment in bytes  0:	225.312	8.70312	17.9219	35.875
Length  397, alignment in bytes  4:	225.391	8.6875	18.5312	36.4844
Length  398, alignment in bytes  0:	225.875	8.71875	17.8281	36.3594
Length  398, alignment in bytes  5:	225.828	8.75	18.5469	36.4219
Length  399, alignment in bytes  0:	226.453	8.71875	18.3281	36.1562
Length  399, alignment in bytes  6:	226.469	8.65625	18.5625	36.4062
Length  400, alignment in bytes  0:	226.969	8.73438	18	36.1562
Length  400, alignment in bytes  7:	226.969	8.71875	18.5312	36.5938

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-02-14 11:06 [PATCH] powerpc64: strrchr optimization for power8 Rajalakshmi Srinivasaraghavan
  2017-02-20 13:10 ` Gabriel F. T. Gomes
  2017-02-20 13:43 ` Carlos O'Donell
@ 2017-02-21 17:02 ` Carlos Eduardo Seo
  2 siblings, 0 replies; 18+ messages in thread
From: Carlos Eduardo Seo @ 2017-02-21 17:02 UTC (permalink / raw)
  To: Rajalakshmi Srinivasaraghavan, libc-alpha



On 2/14/17, 9:05 AM, "Rajalakshmi Srinivasaraghavan" <libc-alpha-owner@sourceware.org on behalf of raji@linux.vnet.ibm.com> wrote:

    Changes from previous version [1]
    
      - Comments correction and alignment changes.
    
    --
    P7 code is used for <=32B strings and for > 32B vectorized loops are used.
    This shows as an average 25% improvement depending on the position of search
    character.  The performance is same for shorter strings.
    Tested on ppc64 and ppc64le.
    
LGTM.

--
Carlos Eduardo Seo
Software Engineer - Linux on Power Toolchain
cseo@linux.vnet.ibm.com
 



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-02-20 16:06     ` Carlos O'Donell
  2017-02-20 16:50       ` Rajalakshmi Srinivasaraghavan
@ 2017-02-28  7:32       ` Rajalakshmi Srinivasaraghavan
  2017-03-09  6:14         ` Rajalakshmi Srinivasaraghavan
  1 sibling, 1 reply; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-02-28  7:32 UTC (permalink / raw)
  To: libc-alpha



On 02/20/2017 09:36 PM, Carlos O'Donell wrote:
> On 02/20/2017 11:01 AM, Rajalakshmi Srinivasaraghavan wrote:
>>
>>
>> On 02/20/2017 07:12 PM, Carlos O'Donell wrote:
>>> On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
>>>> P7 code is used for <=32B strings and for > 32B vectorized loops are used.
>>>> This shows as an average 25% improvement depending on the position of search
>>>> character.  The performance is same for shorter strings.
>>>> Tested on ppc64 and ppc64le.
>>> What did you use to test the 25% improvement?
>>
>> This improvement is seen when compared to power7. Benchtest is
>> modified to use length from 0 to 400  to find the average for
>> different lengths.
>
> Could you post your modifications for review an explain your
> process in a little more detail. I'm curious about the changes
> you made.

Carlos,
Posted benchtest modification here:
https://sourceware.org/ml/libc-alpha/2017-02/msg00380.html
-- 
Thanks
Rajalakshmi S

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-02-28  7:32       ` Rajalakshmi Srinivasaraghavan
@ 2017-03-09  6:14         ` Rajalakshmi Srinivasaraghavan
  2017-03-17 15:38           ` Carlos O'Donell
  0 siblings, 1 reply; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-03-09  6:14 UTC (permalink / raw)
  To: libc-alpha, Carlos O'Donell



On 02/28/2017 01:02 PM, Rajalakshmi Srinivasaraghavan wrote:
>
>
> On 02/20/2017 09:36 PM, Carlos O'Donell wrote:
>> On 02/20/2017 11:01 AM, Rajalakshmi Srinivasaraghavan wrote:
>>>
>>>
>>> On 02/20/2017 07:12 PM, Carlos O'Donell wrote:
>>>> On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
>>>>> P7 code is used for <=32B strings and for > 32B vectorized loops
>>>>> are used.
>>>>> This shows as an average 25% improvement depending on the position
>>>>> of search
>>>>> character.  The performance is same for shorter strings.
>>>>> Tested on ppc64 and ppc64le.
>>>> What did you use to test the 25% improvement?
>>>
>>> This improvement is seen when compared to power7. Benchtest is
>>> modified to use length from 0 to 400  to find the average for
>>> different lengths.
>>
>> Could you post your modifications for review an explain your
>> process in a little more detail. I'm curious about the changes
>> you made.
>
> Carlos,
> Posted benchtest modification here:
> https://sourceware.org/ml/libc-alpha/2017-02/msg00380.html

Carlos,

Do you have further comments?

-- 
Thanks
Rajalakshmi S

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-03-09  6:14         ` Rajalakshmi Srinivasaraghavan
@ 2017-03-17 15:38           ` Carlos O'Donell
  2017-03-20  8:39             ` Rajalakshmi Srinivasaraghavan
                               ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Carlos O'Donell @ 2017-03-17 15:38 UTC (permalink / raw)
  To: Rajalakshmi Srinivasaraghavan, libc-alpha

On 03/09/2017 01:14 AM, Rajalakshmi Srinivasaraghavan wrote:
> 
> 
> On 02/28/2017 01:02 PM, Rajalakshmi Srinivasaraghavan wrote:
>>
>>
>> On 02/20/2017 09:36 PM, Carlos O'Donell wrote:
>>> On 02/20/2017 11:01 AM, Rajalakshmi Srinivasaraghavan wrote:
>>>>
>>>>
>>>> On 02/20/2017 07:12 PM, Carlos O'Donell wrote:
>>>>> On 02/14/2017 06:05 AM, Rajalakshmi Srinivasaraghavan wrote:
>>>>>> P7 code is used for <=32B strings and for > 32B vectorized loops
>>>>>> are used.
>>>>>> This shows as an average 25% improvement depending on the position
>>>>>> of search
>>>>>> character.  The performance is same for shorter strings.
>>>>>> Tested on ppc64 and ppc64le.
>>>>> What did you use to test the 25% improvement?
>>>>
>>>> This improvement is seen when compared to power7. Benchtest is
>>>> modified to use length from 0 to 400  to find the average for
>>>> different lengths.
>>>
>>> Could you post your modifications for review an explain your
>>> process in a little more detail. I'm curious about the changes
>>> you made.
>>
>> Carlos,
>> Posted benchtest modification here:
>> https://sourceware.org/ml/libc-alpha/2017-02/msg00380.html
> 
> Carlos,
> 
> Do you have further comments?
 
This is exactly what I was interested in seeing, and I see Siddhesh
has approved your commit to benchtests to increase the string lengths
used in the analysis.

When I review these changes I look at:

(a) What microbenchmark did you use?

- Can we include it in glibc?

  * We did, your improvements should be going into master so others
    can reproduce them.

(b) What assumptions did you make and were they valid?

Increasing the microbenchmarks to measure up to 512 bytes is probably
a good thing to give broad coverage over the performance from small
to large strings that are multiples of most cache lines (and places
where prefetching might start helping).

Does IBM internally have any good data about the low, median,
average, and high lengths of strings that are being used with the
strrchr API? Such gathered statistical data would allow us to tune
the microbenchmark.

Knowing the mean value of string lengths would let us decide where
to place most of optimization efforts. I don't know that we have any
good references to academic literature here.

Your lack of such references in your patch means you don't know either,
but given that you indicate low string size performance is no worse,
this patch looks fine.

In summary:

- You assume applications will be using strings > 32 bytes, and that's
  not an entirely unreasonable assumption to make.

- You show that performance for strings <= 32B remains the same and
  that it improves for longer strings.

- You contribute the microbenchmark changes that allowed you to measure
  these numbers.

That's exactly what I want to see from a good contribution.

Now I plotted the power8 performance and there is a big bump in the middle,
any idea why?

https://docs.google.com/a/redhat.com/spreadsheets/d/16kW90bXH7nC8Ak6Xyoe4cxVIvFPwjVDcO-7qsZs0iVc/pubhtml

-- 
Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-03-17 15:38           ` Carlos O'Donell
@ 2017-03-20  8:39             ` Rajalakshmi Srinivasaraghavan
  2017-03-20 16:34               ` Carlos O'Donell
  2017-03-21  5:15             ` Rajalakshmi Srinivasaraghavan
  2017-04-03 15:30             ` Tulio Magno Quites Machado Filho
  2 siblings, 1 reply; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-03-20  8:39 UTC (permalink / raw)
  To: libc-alpha, Carlos O'Donell



On 03/17/2017 09:08 PM, Carlos O'Donell wrote:
> Now I plotted the power8 performance and there is a big bump in the middle,
> any idea why?
>
> https://docs.google.com/a/redhat.com/spreadsheets/d/16kW90bXH7nC8Ak6Xyoe4cxVIvFPwjVDcO-7qsZs0iVc/pubhtml

I am not able to access this.

-- 
Thanks
Rajalakshmi S

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-03-20  8:39             ` Rajalakshmi Srinivasaraghavan
@ 2017-03-20 16:34               ` Carlos O'Donell
  0 siblings, 0 replies; 18+ messages in thread
From: Carlos O'Donell @ 2017-03-20 16:34 UTC (permalink / raw)
  To: Rajalakshmi Srinivasaraghavan, libc-alpha

On 03/20/2017 04:39 AM, Rajalakshmi Srinivasaraghavan wrote:
> 
> 
> On 03/17/2017 09:08 PM, Carlos O'Donell wrote:
>> Now I plotted the power8 performance and there is a big bump in the middle,
>> any idea why?
>>
>> https://docs.google.com/a/redhat.com/spreadsheets/d/16kW90bXH7nC8Ak6Xyoe4cxVIvFPwjVDcO-7qsZs0iVc/pubhtml
> 
> I am not able to access this.
> 

This should work.

https://docs.google.com/spreadsheets/d/1e2QHzIvMEp_71z0NqFbhRxY27BzscmV23UeE0NawQKE/edit?usp=sharing

-- 
Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-03-17 15:38           ` Carlos O'Donell
  2017-03-20  8:39             ` Rajalakshmi Srinivasaraghavan
@ 2017-03-21  5:15             ` Rajalakshmi Srinivasaraghavan
  2017-04-03 15:30             ` Tulio Magno Quites Machado Filho
  2 siblings, 0 replies; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-03-21  5:15 UTC (permalink / raw)
  To: Carlos O'Donell, libc-alpha



On 03/17/2017 09:08 PM, Carlos O'Donell wrote:
> Knowing the mean value of string lengths would let us decide where
> to place most of optimization efforts. I don't know that we have any
> good references to academic literature here.
>
No, I don't have details about strrchr call statistics. This
optimization is to make use of POWER8 capabilities in general.

> Now I plotted the power8 performance and there is a big bump in the middle,
> any idea why?

I can see a sudden increase for sizes 104-118 and around 222
in the shared sheet. However, there is no special logic in the code
related to these sizes. I tried to check whether this happens on my
test P8 ppc64le system, but I could not reproduce it.
-- 
Thanks
Rajalakshmi S

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-03-17 15:38           ` Carlos O'Donell
  2017-03-20  8:39             ` Rajalakshmi Srinivasaraghavan
  2017-03-21  5:15             ` Rajalakshmi Srinivasaraghavan
@ 2017-04-03 15:30             ` Tulio Magno Quites Machado Filho
  2017-04-17 15:20               ` Carlos O'Donell
  2 siblings, 1 reply; 18+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2017-04-03 15:30 UTC (permalink / raw)
  To: Carlos O'Donell, Rajalakshmi Srinivasaraghavan, libc-alpha

Carlos O'Donell <carlos@redhat.com> writes:

> Now I plotted the power8 performance and there is a big bump in the middle,
> any idea why?

I remember that we discussed this on #glibc and you noticed the same behavior
on simple_strrchr and we agreed this could be raw hardware behavior.

Do you think this question is still blocking this patch?

-- 
Tulio Magno

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-04-03 15:30             ` Tulio Magno Quites Machado Filho
@ 2017-04-17 15:20               ` Carlos O'Donell
  0 siblings, 0 replies; 18+ messages in thread
From: Carlos O'Donell @ 2017-04-17 15:20 UTC (permalink / raw)
  To: Tulio Magno Quites Machado Filho, Rajalakshmi Srinivasaraghavan,
	libc-alpha

On 04/03/2017 11:29 AM, Tulio Magno Quites Machado Filho wrote:
> Carlos O'Donell <carlos@redhat.com> writes:
> 
>> Now I plotted the power8 performance and there is a big bump in the middle,
>> any idea why?
> 
> I remember that we discussed this on #glibc and you noticed the same behavior
> on simple_strrchr and we agreed this could be raw hardware behavior.
> 
> Do you think this question is still blocking this patch?
 
My question does not block the patch.

At this point the patch looks good to me.

You just have an unexplained issue with performance, but it appears to impact
_all_ the algorithms, simple, and the new POWER8 one, so it must be some semantic
of the hardware showing up in the algorithm. It's odd to see that consistent bump.

-- 
Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-02-09  5:01 Rajalakshmi Srinivasaraghavan
  2017-02-09 15:26 ` Gabriel F. T. Gomes
@ 2017-02-13 16:18 ` Peter Bergner
  1 sibling, 0 replies; 18+ messages in thread
From: Peter Bergner @ 2017-02-13 16:18 UTC (permalink / raw)
  To: Rajalakshmi Srinivasaraghavan, libc-alpha

On 2/8/17 11:00 PM, Rajalakshmi Srinivasaraghavan wrote:

> +	/* r4 is changed now ,if its passed as more chars
> +	   check for null again */

Not sure without looking at the code closer, but should this
read like the following???

/* r4 is changed now.  If it's passed more chars, then check for
    null again.  */



> +	/* if there are more than one 0xff in r11, find the first pos of ff
> +	   in r11 and fill r10 with 0 from that position */

First word should be capitalized and there is no final '.' and two
spaces at the end of the sentence.  I would also write out position
rather than using "pos" like you did at the end of the sentence.


> +	vsl	v9, v8, v9	/* v9 = 0x4040404040404040. */

Two spaces after the '.'.

Peter



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] powerpc64: strrchr optimization for power8
  2017-02-09  5:01 Rajalakshmi Srinivasaraghavan
@ 2017-02-09 15:26 ` Gabriel F. T. Gomes
  2017-02-13 16:18 ` Peter Bergner
  1 sibling, 0 replies; 18+ messages in thread
From: Gabriel F. T. Gomes @ 2017-02-09 15:26 UTC (permalink / raw)
  To: Rajalakshmi Srinivasaraghavan; +Cc: libc-alpha

Hi,

I have a few cosmetic comments...

On Thu,  9 Feb 2017 10:30:54 +0530
Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> wrote:

> +/* int [r3] strrchr (char *s [r3], int c [r4])  */
      ~~~
Should it be char *, instead?

> +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
> +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
> +#define VBPERMQ(t,a,b)  .long (0x1000054c \
> +                        | ((t)<<(32-11)) \
> +                        | ((a)<<(32-16)) \
> +                        | ((b)<<(32-21)) )
   ~~~~~~~~~~~~~~~~~~~~~~~~
Eight spaces should be replaced with tabs.

> +#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
> +#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
> +#define VADDUQM(t,a,b)  .long (0x10000100 \
> +                        | ((t)<<(32-11)) \
> +                        | ((a)<<(32-16)) \
> +                        | ((b)<<(32-21)) )
   ~~~~~~~~~~~~~~~~~~~~~~~~
Likewise.

> +	/* r4 is changed now ,if its passed as more chars
                            ^
now, if

> +	li	r5, 16
> +	vspltb	v1, v1, 7
> +        /* Compare 32 bytes in each loop.  */
   ~~~~~~~~
Eight spaces should be replaced with tabs.


> +	blt	cr6, L(match)
> +
> +        /* One (or both) of the quadwords contains c/null.  */
   ~~~~~~~~
Likewise.

> +
> +L(match):
> +        /* One (or both) of the quadwords contains a match.  */
   ~~~~~~~~
Likewise.

> +	vslb	v10, v11, v10
> +	li	r5, 16
> +        /* Compare 32 bytes in each loop.  */
   ~~~~~~~~
Likewise.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH] powerpc64: strrchr optimization for power8
@ 2017-02-09  5:01 Rajalakshmi Srinivasaraghavan
  2017-02-09 15:26 ` Gabriel F. T. Gomes
  2017-02-13 16:18 ` Peter Bergner
  0 siblings, 2 replies; 18+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-02-09  5:01 UTC (permalink / raw)
  To: libc-alpha; +Cc: Rajalakshmi Srinivasaraghavan

The P7 code is used for strings <= 32B; vectorized loops are used for
strings > 32B.  This shows an average 25% improvement, depending on the
position of the search character.  Performance is the same for shorter strings.
Tested on ppc64 and ppc64le.

2017-02-06  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/multiarch/Makefile
	(sysdep_routines): Add strrchr-power8.
	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
	(strrchr): Add __strrchr_power8 to list of strrchr functions.
	* sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S: New file.
	* sysdeps/powerpc/powerpc64/multiarch/strrchr.c
	(strrchr): Add __strrchr_power8 to ifunc list.
	* sysdeps/powerpc/powerpc64/power8/strrchr.S: New file.
---
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   3 +-
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   3 +
 .../powerpc/powerpc64/multiarch/strrchr-power8.S   |  39 ++
 sysdeps/powerpc/powerpc64/multiarch/strrchr.c      |   3 +
 sysdeps/powerpc/powerpc64/power8/strrchr.S         | 464 +++++++++++++++++++++
 5 files changed, 511 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strrchr.S

diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index f5889a3..0fc0ebc 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -14,7 +14,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   strchrnul-power8 strchrnul-power7 strchrnul-ppc64 \
 		   strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \
 		   stpcpy-power7 stpcpy-ppc64 \
-		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
+		   strrchr-power8 strrchr-power7 strrchr-ppc64 \
+		   strncat-power7 strncat-ppc64 \
 		   strncpy-power7 strncpy-ppc64 \
 		   stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
 		   strcmp-power9 strcmp-power8 strcmp-power7 strcmp-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 209aec5..d77c47f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -281,6 +281,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strrchr.c.  */
   IFUNC_IMPL (i, name, strrchr,
 	      IFUNC_IMPL_ADD (array, i, strrchr,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strrchr_power8)
+	      IFUNC_IMPL_ADD (array, i, strrchr,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __strrchr_power7)
 	      IFUNC_IMPL_ADD (array, i, strrchr, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
new file mode 100644
index 0000000..23365a1
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr-power8.S
@@ -0,0 +1,39 @@
+/* Optimized strrchr implementation for POWER8.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef ENTRY
+#define ENTRY(name)						\
+  .section ".text";						\
+  ENTRY_2(__strrchr_power8)					\
+  .align ALIGNARG(2);						\
+  BODY_LABEL(__strrchr_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strrchr_power8)
+
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strrchr_power8)					\
+  END_2(__strrchr_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strrchr.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
index dc1d3d0..0f94c9d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
@@ -25,11 +25,14 @@
 
 extern __typeof (strrchr) __strrchr_ppc attribute_hidden;
 extern __typeof (strrchr) __strrchr_power7 attribute_hidden;
+extern __typeof (strrchr) __strrchr_power8 attribute_hidden;
 #undef strrchr
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc_redirected (__redirect_strrchr, strrchr,
+		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+		       ? __strrchr_power8 :
 		       (hwcap & PPC_FEATURE_HAS_VSX)
 		       ? __strrchr_power7
 		       : __strrchr_ppc);
diff --git a/sysdeps/powerpc/powerpc64/power8/strrchr.S b/sysdeps/powerpc/powerpc64/power8/strrchr.S
new file mode 100644
index 0000000..f29fc4e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strrchr.S
@@ -0,0 +1,464 @@
+/* Optimized strrchr implementation for PowerPC64/POWER8.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* int [r3] strrchr (char *s [r3], int c [r4])  */
+/* TODO: change these to the actual instructions when the minimum required
+   binutils allows it.  */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b)  .long (0x1000054c \
+                        | ((t)<<(32-11)) \
+                        | ((a)<<(32-16)) \
+                        | ((b)<<(32-21)) )
+#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VADDUQM(t,a,b)  .long (0x10000100 \
+                        | ((t)<<(32-11)) \
+                        | ((a)<<(32-16)) \
+                        | ((b)<<(32-21)) )
+#ifdef __LITTLE_ENDIAN__
+/* Find the match position from v6 and place result in r6.  */
+# define CALCULATE_MATCH() \
+	VBPERMQ(v6, v6, v10); \
+	vsldoi	v6, v6, v6, 6; \
+	MFVRD(r7, v6); \
+	cntlzd	r6, r7; \
+	subfic	r6, r6, 15;
+/*
+ * Find the first null position to mask bytes after null.
+ * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
+ * Result placed at v2.
+ */
+# define FIND_NULL_POS(reg) \
+	vspltisb	v11, -1; \
+	VADDUQM(v11, reg, v11); \
+	vandc	v11, v11, reg; \
+	VPOPCNTD(v2, v11); \
+	vspltb	v11, v2, 15; \
+	vcmpequb.	v11, v11, v9; \
+	blt	cr6, 1f; \
+	vsldoi	v9, v0, v9, 1; \
+	vslo	v2, v2, v9; \
+1: \
+	vsumsws	v2, v2, v0;
+#else
+# define CALCULATE_MATCH() \
+	VBPERMQ(v6, v6, v10); \
+	MFVRD(r7, v6); \
+	addi	r6, r7, -1; \
+	andc	r6, r6, r7; \
+	popcntd	r6, r6; \
+	subfic	r6, r6, 15;
+# define FIND_NULL_POS(reg) \
+	VCLZD(v2, reg); \
+	vspltb	v11, v2, 7; \
+	vcmpequb.	v11, v11, v9; \
+	blt	cr6, 1f; \
+	vsldoi	v9, v0, v9, 1; \
+	vsro	v2, v2, v9; \
+1: \
+	vsumsws	v2, v2, v0;
+#endif	/* !__LITTLE_ENDIAN__ */
+	.machine  power7
+ENTRY (strrchr)
+	CALL_MCOUNT 2
+	dcbt	0,r3
+	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
+	cmpdi	cr7,r4,0
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
+	li	r9,0	      /* Used to store last occurrence.  */
+	li	r0,0	      /* Doubleword with null chars to use
+				 with cmpb.  */
+
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+
+	beq	cr7,L(null_match)
+
+	/* Replicate byte to doubleword.  */
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
+	insrdi	r4,r4,32,0
+
+	/* r4 is changed now ,if its passed as more chars
+	   check for null again */
+	cmpdi	cr7,r4,0
+	beq	cr7,L(null_match)
+	/* Now r4 has a doubleword of c bytes and r0 has
+	   a doubleword of null bytes.  */
+
+	cmpb	r10,r12,r4     /* Compare each byte against c byte.  */
+	cmpb	r11,r12,r0     /* Compare each byte against null byte.  */
+
+	/* Move the doublewords left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+#else
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+#endif
+	or	r5,r10,r11    /* OR the results to speed things up.  */
+	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
+				 have been found.  */
+	bne	cr7,L(done)
+
+L(align):
+	andi.	r12, r8, 15
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bne	cr0, L(loop)
+
+	/* Handle WORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb	r10,r12,r4
+	cmpb	r11,r12,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done)
+	b	L(loop)	      /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+	.p2align  5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu	r7,16(r8)
+	cmpb	r10,r12,r4
+	cmpb	r11,r12,r0
+	cmpb	r6,r7,r4
+	cmpb	r7,r7,r0
+	or	r12,r10,r11
+	or	r5,r6,r7
+	or	r5,r12,r5
+	cmpdi	cr7,r5,0
+	beq	cr7,L(vector)
+
+	/* OK, one (or both) of the doublewords contains a c/null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a c/null byte.  */
+	cmpdi	cr6,r12,0
+	addi	r8,r8,-8
+	bne	cr6,L(done)
+
+	/* The c/null byte must be in the second doubleword.  Adjust the
+	   address again and move the result of cmpb to r10 so we can calculate
+	   the pointer.  */
+
+	mr	r10,r6
+	mr	r11,r7
+	addi	r8,r8,8
+
+	/* r10/r11 have the output of the cmpb instructions, that is,
+	   0xff in the same position as the c/null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+
+L(done):
+	/* if there are more than one 0xff in r11, find the first pos of ff
+	   in r11 and fill r10 with 0 from that position */
+	cmpdi	cr7,r11,0
+	beq	cr7,L(no_null)
+#ifdef __LITTLE_ENDIAN__
+	addi	r3,r11,-1
+	andc	r3,r3,r11
+	popcntd r0,r3
+#else
+	cntlzd	r0,r11
+#endif
+	subfic	r0,r0,63
+	li	r6,-1
+#ifdef __LITTLE_ENDIAN__
+	srd	r0,r6,r0
+#else
+	sld	r0,r6,r0
+#endif
+	and	r10,r0,r10
+L(no_null):
+#ifdef __LITTLE_ENDIAN__
+	cntlzd	r0,r10		/* Count leading zeros before c matches.  */
+	addi	r3,r10,-1
+	andc	r3,r3,r10
+	addi	r10,r11,-1
+	andc	r10,r10,r11
+	cmpld	cr7,r3,r10
+	bgt	cr7,L(no_match)
+#else
+	addi	r3,r10,-1	/* Count trailing zeros before c matches.  */
+	andc	r3,r3,r10
+	popcntd	r0,r3
+	cmpld	cr7,r11,r10
+	bgt	cr7,L(no_match)
+#endif
+	srdi	r0,r0,3		/* Convert trailing zeros to bytes.  */
+	subfic	r0,r0,7
+	add	r9,r8,r0      /* Return address of the matching c byte
+				 or null in case c was not found.  */
+	li	r0,0
+	cmpdi	cr7,r11,0     /* If r11 == 0, no null's have been found.  */
+	beq	cr7,L(align)
+
+	.align	4
+L(no_match):
+	mr	r3,r9
+	blr
+
+/* Check the first 32B in GPR's and move to vectorized loop.  */
+	.p2align  5
+L(vector):
+	addi	r3, r8, 8
+	/* Make sure 32B aligned.  */
+	andi.	r10, r3, 31
+	bne	cr0, L(loop)
+	vspltisb	v0, 0
+	/* Precompute vbpermq constant.  */
+	vspltisb	v10, 3
+	lvsl	v11, r0, r0
+	vslb	v10, v11, v10
+	MTVRD(v1, r4)
+	li	r5, 16
+	vspltb	v1, v1, 7
+        /* Compare 32 bytes in each loop.  */
+L(continue):
+	lvx	v4, 0, r3
+	lvx	v5, r3, r5
+	vcmpequb	v2, v0, v4
+	vcmpequb	v3, v0, v5
+	vcmpequb	v6, v1, v4
+	vcmpequb	v7, v1, v5
+	vor	v8, v2, v3
+	vor	v9, v6, v7
+	vor	v11, v8, v9
+	vcmpequb.	v11, v0, v11
+	addi	r3, r3, 32
+	blt	cr6, L(continue)
+	vcmpequb.	v8, v0, v8
+	blt	cr6, L(match)
+
+        /* One (or both) of the quadwords contains c/null.  */
+	vspltisb	v8, 2
+	vspltisb	v9, 5
+	/* Precompute values used for comparison.  */
+	vsl	v9, v8, v9	/* v9 = 0x4040404040404040. */
+	vaddubm	v8, v9, v9
+	vsldoi	v8, v0, v8, 1	/* v8 = 0x80.  */
+
+	/* Check if null is in second qw.  */
+	vcmpequb.	v11, v0, v2
+	blt	cr6, L(secondqw)
+
+	/* Null found in first qw.  */
+	addi	r8, r3, -32
+	/* Calculate the null position.  */
+	FIND_NULL_POS(v2)
+	/* Check if null is in the first byte.  */
+	vcmpequb.	v11, v0, v2
+	blt	cr6, L(no_match)
+	vsububm	v2, v8, v2
+	/* Mask unwanted bytes after null.  */
+#ifdef __LITTLE_ENDIAN__
+	vslo	v6, v6, v2
+	vsro	v6, v6, v2
+#else
+	vsro	v6, v6, v2
+	vslo	v6, v6, v2
+#endif
+	vcmpequb.	v11, v0, v6
+	blt	cr6, L(no_match)
+	/* Found a match before null.  */
+	CALCULATE_MATCH()
+	add	r3, r8, r6
+	blr
+
+L(secondqw):
+	addi	r8, r3, -16
+	FIND_NULL_POS(v3)
+	vcmpequb.	v11, v0, v2
+	blt	cr6, L(no_match1)
+	vsububm	v2, v8, v2
+	/* Mask unwanted bytes after null.  */
+#ifdef __LITTLE_ENDIAN__
+	vslo	v7, v7, v2
+	vsro	v7, v7, v2
+#else
+	vsro	v7, v7, v2
+	vslo	v7, v7, v2
+#endif
+	vcmpequb.	v11, v0, v7
+	blt	cr6, L(no_match1)
+	addi	r8, r8, 16
+	vor	v6, v0, v7
+L(no_match1):
+	addi	r8, r8, -16
+	vcmpequb.	v11, v0, v6
+	blt	cr6, L(no_match)
+	/* Found a match before null.  */
+	CALCULATE_MATCH()
+	add	r3, r8, r6
+	blr
+
+L(match):
+        /* One (or both) of the quadwords contains a match.  */
+	mr	r8, r3
+	vcmpequb.	v8, v0, v7
+	blt	cr6, L(firstqw)
+	/* Match found in second qw.  */
+	addi	r8, r8, 16
+	vor	v6, v0, v7
+L(firstqw):
+	addi	r8, r8, -32
+	CALCULATE_MATCH()
+	add	r9, r8, r6      /* Compute final length.  */
+	b	L(continue)
+/* We are here because strrchr was called with a null byte.  */
+	.align	4
+L(null_match):
+	/* r0 has a doubleword of null bytes.  */
+
+	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
+
+	/* Move the doublewords left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r5,r5,r6
+	sld	r5,r5,r6
+#else
+	sld	r5,r5,r6
+	srd	r5,r5,r6
+#endif
+	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
+				 have been found.  */
+	bne	cr7,L(done_null)
+
+	andi.	r12, r8, 15
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bne	cr0, L(loop_null)
+
+	/* Handle WORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb	r5,r12,r0
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done_null)
+	b	L(loop_null)  /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	/* Main loop to look for the end of the string.  Since it's a
+	   small loop (< 8 instructions), align it to 32-bytes.  */
+	.p2align  5
+L(loop_null):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu	r11,16(r8)
+	cmpb	r5,r12,r0
+	cmpb	r10,r11,r0
+	or	r6,r5,r10
+	cmpdi	cr7,r6,0
+	beq	cr7,L(vector1)
+
+	/* OK, one (or both) of the doublewords contains a null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a null byte.  */
+
+	cmpdi	cr6,r5,0
+	addi	r8,r8,-8
+	bne	cr6,L(done_null)
+
+	/* The null byte must be in the second doubleword.  Adjust the address
+	   again and move the result of cmpb from r10 to r5 so we can calculate
+	   the pointer.  */
+
+	mr	r5,r10
+	addi	r8,r8,8
+
+	/* r5 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+	addi	r0,r5,-1
+	andc	r0,r0,r5
+	popcntd	r0,r0
+#else
+	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
+#endif
+	srdi	r0,r0,3	      /* Convert the bit count to a byte offset.  */
+	add	r3,r8,r0      /* Return address of the matching null byte.  */
+	blr
+/* Check the first 32B in GPR's and move to vectorized loop.  */
+	.p2align  5
+L(vector1):
+	addi	r3, r8, 8
+	/* Make sure 32B aligned.  */
+	andi.	r10, r3, 31
+	bne	cr0, L(loop_null)
+	vspltisb	v0, 0
+	/* Precompute vbpermq constant.  */
+	vspltisb	v10, 3
+	lvsl	v11, r0, r0
+	vslb	v10, v11, v10
+	li	r5, 16
+        /* Compare 32 bytes in each loop.  */
+L(continue1):
+	lvx	v4, 0, r3
+	lvx	v5, r3, r5
+	vcmpequb	v2, v0, v4
+	vcmpequb	v3, v0, v5
+	vor	v8, v2, v3
+	vcmpequb.	v11, v0, v8
+	addi	r3, r3, 32
+	blt	cr6, L(continue1)
+	addi	r3, r3, -32
+	VBPERMQ(v2, v2, v10)
+	VBPERMQ(v3, v3, v10)
+	/* Shift each component into its correct position for merging.  */
+#ifdef __LITTLE_ENDIAN__
+	vsldoi	v3, v3, v3, 2
+#else
+	vsldoi	v2, v2, v2, 6
+	vsldoi	v3, v3, v3, 4
+#endif
+	/* Merge the results and move to a GPR.  */
+	vor	v4, v3, v2
+	MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+	addi	r6, r5, -1
+	andc	r6, r6, r5
+	popcntd	r6, r6
+#else
+	cntlzd	r6, r5  /* Count leading zeros before the match.  */
+#endif
+	add	r3, r3, r6      /* Compute address of the null byte.  */
+	blr
+END (strrchr)
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
-- 
2.7.4

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2017-04-17 15:20 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-02-14 11:06 [PATCH] powerpc64: strrchr optimization for power8 Rajalakshmi Srinivasaraghavan
2017-02-20 13:10 ` Gabriel F. T. Gomes
2017-02-20 13:43 ` Carlos O'Donell
2017-02-20 16:01   ` Rajalakshmi Srinivasaraghavan
2017-02-20 16:06     ` Carlos O'Donell
2017-02-20 16:50       ` Rajalakshmi Srinivasaraghavan
2017-02-28  7:32       ` Rajalakshmi Srinivasaraghavan
2017-03-09  6:14         ` Rajalakshmi Srinivasaraghavan
2017-03-17 15:38           ` Carlos O'Donell
2017-03-20  8:39             ` Rajalakshmi Srinivasaraghavan
2017-03-20 16:34               ` Carlos O'Donell
2017-03-21  5:15             ` Rajalakshmi Srinivasaraghavan
2017-04-03 15:30             ` Tulio Magno Quites Machado Filho
2017-04-17 15:20               ` Carlos O'Donell
2017-02-21 17:02 ` Carlos Eduardo Seo
  -- strict thread matches above, loose matches on Subject: below --
2017-02-09  5:01 Rajalakshmi Srinivasaraghavan
2017-02-09 15:26 ` Gabriel F. T. Gomes
2017-02-13 16:18 ` Peter Bergner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).