public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] powerpc: Add optimized P8 strspn
@ 2016-03-25 17:10 Paul E. Murphy
  2016-03-28 17:55 ` Adhemerval Zanella
  0 siblings, 1 reply; 2+ messages in thread
From: Paul E. Murphy @ 2016-03-25 17:10 UTC (permalink / raw)
  To: libc-alpha; +Cc: Tulio Magno Quites Machado Filho

[-- Attachment #1: Type: text/plain, Size: 13734 bytes --]

Tested on PPC64 and PPC64LE.  Benchmarks from PPC64LE
attached.

----8<----
This utilizes vectors and bitmasks.  For a small needle and a large
haystack, the performance improvement is up to 8x.  For short
strings (0-4B), the cost of computing the bitmask dominates,
and it is a tad slower.

2016-03-24  Paul E. Murphy  <murphyp@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/multiarch/Makefile:
	(sysdep_routines): Add new strspn targets.
	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c:
	(__libc_ifunc_impl_list): Add strspn.
	* sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S:
	New file.
	* sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S:
	Likewise.
	* sysdeps/powerpc/powerpc64/multiarch/strspn.c:
	Likewise.
	* sysdeps/powerpc/powerpc64/power8/strspn.S:
	Likewise.
---
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   3 +-
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   8 +
 .../powerpc/powerpc64/multiarch/strspn-power8.S    |  44 +++++
 sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S |  44 +++++
 sysdeps/powerpc/powerpc64/multiarch/strspn.c       |  37 +++++
 sysdeps/powerpc/powerpc64/power8/strspn.S          | 179 +++++++++++++++++++++
 6 files changed, 314 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strspn.c
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strspn.S

diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 3b0e3a0..7ed56bf 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -19,7 +19,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   strcmp-power8 strcmp-power7 strcmp-ppc64 \
 		   strcat-power8 strcat-power7 strcat-ppc64 \
 		   memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
-		   strncpy-power8 strstr-power7 strstr-ppc64
+		   strncpy-power8 strstr-power7 strstr-ppc64 \
+		   strspn-power8 strspn-ppc64
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 11a8215..3e1f099 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -322,6 +322,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strcat, 1,
 			     __strcat_ppc))
 
+  /* Support sysdeps/powerpc/powerpc64/multiarch/strspn.c.  */
+  IFUNC_IMPL (i, name, strspn,
+             IFUNC_IMPL_ADD (array, i, strspn,
+                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
+                             __strspn_power8)
+             IFUNC_IMPL_ADD (array, i, strspn, 1,
+                             __strspn_ppc))
+
   /* Support sysdeps/powerpc/powerpc64/multiarch/strstr.c.  */
   IFUNC_IMPL (i, name, strstr,
              IFUNC_IMPL_ADD (array, i, strstr,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S
new file mode 100644
index 0000000..0beff3c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S
@@ -0,0 +1,44 @@
+/* Optimized strspn implementation for POWER8.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strspn_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strspn_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strspn_power8)
+
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strspn_power8)					\
+  END_2(__strspn_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#endif
+
+#include <sysdeps/powerpc/powerpc64/power8/strspn.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S
new file mode 100644
index 0000000..4e870a9
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S
@@ -0,0 +1,44 @@
+/* Optimized strspn implementation for PowerPC64.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strspn_ppc)						\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strspn_ppc):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strspn_ppc)
+
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strspn_ppc)					\
+  END_2(__strspn_ppc)
+
+#undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+#endif
+
+#include <sysdeps/powerpc/powerpc64/strspn.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
new file mode 100644
index 0000000..8769de3
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
@@ -0,0 +1,37 @@
+/* Multiple versions of strspn. PowerPC64 version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <string.h>
+# include <shlib-compat.h>
+# include "init-arch.h"
+
+#undef strspn
+extern __typeof (strspn) __libc_strspn;
+
+extern __typeof (strspn) __strspn_ppc attribute_hidden;
+extern __typeof (strspn) __strspn_power8 attribute_hidden;
+
+libc_ifunc (__libc_strspn,
+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+	    ? __strspn_power8
+	    : __strspn_ppc);
+
+weak_alias (__libc_strspn, strspn)
+libc_hidden_builtin_def (strspn)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strspn.S b/sysdeps/powerpc/powerpc64/power8/strspn.S
new file mode 100644
index 0000000..dd1838e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strspn.S
@@ -0,0 +1,179 @@
+/* Optimized strspn implementation for Power8.
+
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* size_t [r3] strspn (const char *string [r3],
+                       const char *needleAccept [r4])  */
+
+/* This takes a novel approach by computing a 256 bit mask whereby
+   each set bit implies the byte is "accepted". P8 vector hardware
+   has extremely efficient hardware for selecting bits from a mask.
+
+   One might ask "why not use bpermd for short strings"?  It is
+   so slow that its performance about matches the generic PPC64
+   variant without any fancy masking, with the added expense of
+   making the mask. That was the first variant of this.  */
+
+
+
+#include "sysdep.h"
+
+/* Simple macro to use VSX instructions in overlapping VR's.  */
+#define XXVR(insn, vrt, vra, vrb) \
+	insn 32+vrt, 32+vra, 32+vrb
+
+/* ISA 2.07B instructions are not all defined for older binutils.
+   Macros are defined below for these newer instructions in order
+   to maintain compatibility.  */
+
+/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs.  */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+			      | ((t)<<(32-11))	\
+			      | ((a)<<(32-16))	\
+			      | ((b)<<(32-21)) )
+
+	/* This can be updated to power8 once the minimum version of
+	   binutils supports power8 and the above instructions.  */
+	.machine power7
+EALIGN(strspn, 4, 0)
+	CALL_MCOUNT 2
+
+	/* Generate useful constants for later on.  */
+	vspltisb v1, 7
+	vspltisb v2, -1
+	vslb	v1, v1, v1	/* 0x80 to swap high bit for vbpermq.  */
+	vspltisb v10, 0
+	vsldoi	v4, v10, v2, 2	/* 0xFFFF into vr4.  */
+	XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches.  */
+
+	/* Prepare to compute 256b mask.  */
+	addi	r4, r4, -1
+	li	r5, 0
+	li	r6, 0
+	li	r7, 0
+	li	r8, 0
+	li	r11, 1
+	sldi	r11, r11, 63
+
+	/* Start interleaved Mask computation.
+	   This will eventually or 1's into ignored bits from vbpermq.  */
+	lvsr	v11, r0, r3
+	vspltb  v11, v11, 0	/* Splat shift constant.  */
+
+	/* Build a 256b mask in r5-r8.  */
+	.align 4
+L(next_needle):
+	lbzu	r9, 1(r4)
+
+	cmpldi	cr0, r9, 0
+	cmpldi	cr1, r9, 128
+
+	/* This is a little tricky. srd only uses the first 7 bits,
+	   and if bit 7 is set, value is always 0. So, we can
+	   effectively shift 128b in this case.  */
+	xori	r12, r9,  0x40	/* Invert bit 6.  */
+	srd	r10, r11, r9	/* Mask for bits 0-63.  */
+	srd	r12, r11, r12	/* Mask for bits 64-127.  */
+
+	beq	cr0, L(start_cmp)
+
+	/* Now, or the value into the correct GPR.  */
+	bge cr1,L(needle_gt128)
+	or	r5, r5, r10	/* 0 - 63.  */
+	or	r6, r6, r12	/* 64 - 127.  */
+	b L(next_needle)
+
+	.align 4
+L(needle_gt128):
+	or	r7, r7, r10	/* 128 - 191.  */
+	or	r8, r8, r12	/* 192 - 255.  */
+	b L(next_needle)
+
+
+	.align 4
+L(start_cmp):
+	/* Move and merge bitmap into 2 VRs.  bpermd is slower on P8.  */
+	mr	r0, r3		/* Save r3 for final length computation.  */
+	MTVRD (v5, r5)
+	MTVRD (v6, r6)
+	MTVRD (v7, r7)
+	MTVRD (v8, r8)
+
+	/* Continue interleaved mask generation.  */
+#ifdef __LITTLE_ENDIAN__
+	vsrw	v11, v2, v11	/* Note, shift ignores higher order bits. */
+	vsplth  v11, v11, 0	/* Only care about the high 16 bits of v10.  */
+#else
+	vslw	v11, v2, v11	/* Note, shift ignores higher order bits. */
+	vsplth  v11, v11, 1	/* Only care about the low 16 bits of v10.  */
+#endif
+	lvx	v0, r0, r3	/* Note, unaligned load ignores lower bits.  */
+
+	/* Do the merging of the bitmask.  */
+	XXVR(xxmrghd, v5, v5, v6)
+	XXVR(xxmrghd, v6, v7, v8)
+
+	/* Finish mask generation.  */
+	vand	v11, v11, v4	/* Throwaway bits not in the mask.  */
+
+	/* Compare the first 1-16B, while masking unwanted bytes.  */
+	clrrdi  r3, r3, 4	/* Note,  counts from qw boundaries.  */
+	vxor	v9, v0, v1	/* Swap high bit.  */
+	VBPERMQ (v8, v5, v0)
+	VBPERMQ (v7, v6, v9)
+	vor	v7, v7, v8
+	vor	v7, v7, v11	/* Ignore non-participating bytes.  */
+	vcmpequh. v8, v7, v4
+	bnl	cr6, L(done)
+
+	addi	r3, r3, 16
+
+	.align 4
+L(vec):
+	lvx	v0, r0, r3
+	addi	r3, r3, 16
+	vxor	v9, v0, v1	/* Swap high bit.  */
+	VBPERMQ (v8, v5, v0)
+	VBPERMQ (v7, v6, v9)
+	vor	v7, v7, v8
+	vcmpequh. v8, v7, v4
+	blt	cr6, L(vec)
+
+	addi	r3, r3, -16
+L(done):
+	subf	r3, r0, r3
+	MFVRD (r10, v7)
+
+#ifdef __LITTLE_ENDIAN__
+	addi	r0,  r10, 1	/* Count the trailing 1's.  */
+	andc	r10, r10, r0
+	popcntd	r10, r10
+#else
+	xori	r10, r10, 0xffff /* Count leading 1's by inverting.  */
+	addi	r3,  r3,  -48	/* Account for the extra leading zeros.  */
+	cntlzd  r10, r10
+#endif
+
+	add	r3, r3, r10
+	blr
+
+END(strspn)
+libc_hidden_builtin_def (strspn)
-- 
2.4.3


[-- Attachment #2: strspn_p8_ppc64le.txt --]
[-- Type: text/plain, Size: 10405 bytes --]

                                	simple_strspn	stupid_strspn	__strspn_power8	__strspn_ppc
Length  512, alignment  0, acc len  1:	247.391	389.087	19.3916	105.12
Length  512, alignment  1, acc len  1:	247.315	388.841	18.984	105.935
Length  512, alignment  0, acc len  2:	880.533	948.645	19.1783	105.738
Length  512, alignment  2, acc len  2:	860.911	985.322	19.9538	105.526
Length  512, alignment  0, acc len  3:	1044.72	755.482	20.268	104.639
Length  512, alignment  3, acc len  3:	1060.35	768.662	20.3379	104.779
Length  512, alignment  0, acc len  4:	1173.23	876.323	21.2709	104.715
Length  512, alignment  4, acc len  4:	1252.11	1002.27	21.2775	105.202
Length  512, alignment  0, acc len  5:	1542.62	1212.08	21.7363	104.929
Length  512, alignment  5, acc len  5:	1423.67	1150.12	21.8966	104.819
Length  512, alignment  0, acc len  6:	1664.98	1289.91	22.1532	104.807
Length  512, alignment  6, acc len  6:	1673.9	1237.8	22.2289	105.133
Length  512, alignment  0, acc len  7:	1970.66	1544.57	22.4462	106.011
Length  512, alignment  7, acc len  7:	1941.12	1539.02	23.0592	106.484
Length  512, alignment  0, acc len  8:	2050.92	1670.56	23.7757	106.8
Length  512, alignment  0, acc len  8:	2028.78	1739.79	24.0438	106.467
Length  512, alignment  0, acc len  9:	2226.59	1919.62	24.7029	106.571
Length  512, alignment  1, acc len  9:	2239.91	1835.85	23.9311	106.894
Length  512, alignment  0, acc len 10:	2174.27	1851.18	24.6958	107.03
Length  512, alignment  2, acc len 10:	2410.4	2078.24	25.1727	106.569
Length  512, alignment  0, acc len 11:	2417.44	2128.1	25.8387	107.324
Length  512, alignment  0, acc len 12:	2641.18	2432.16	25.8485	107.404
Length  512, alignment  4, acc len 12:	2594.5	2442.14	26.3134	107.075
Length  512, alignment  0, acc len 13:	2729.18	2608.35	26.1373	108.354
Length  512, alignment  5, acc len 13:	2664.77	2488.01	26.2367	107.889
Length  512, alignment  0, acc len 14:	2820.07	2678.45	30.2259	108.009
Length  512, alignment  6, acc len 14:	2891.68	2684.13	26.4849	108.113
Length  512, alignment  0, acc len 15:	2996.96	2885.29	27.8608	107.727
Length  512, alignment  7, acc len 15:	3064.74	2966.17	27.0803	108.58
Length  512, alignment  0, acc len 16:	3076.52	2958.13	29.0371	108.314
Length  512, alignment  0, acc len 16:	3107.33	2990.83	28.3449	108.064
Length  512, alignment  0, acc len 17:	3177.32	3059.33	31.2363	108.987
Length  512, alignment  1, acc len 17:	3172.46	3069.73	28.1314	107.973
Length  512, alignment  0, acc len 18:	3211.05	3118.81	29.9118	109.198
Length  512, alignment  2, acc len 18:	3333.02	3233.41	30.426	108.701
Length  512, alignment  0, acc len 19:	3103.66	3081.45	31.1606	110.374
Length  512, alignment  3, acc len 19:	3404.47	3395.08	30.2296	109.722
Length  512, alignment  0, acc len 20:	3400.39	3245.21	31.6207	110.818
Length  512, alignment  4, acc len 20:	3524.71	3356.27	31.1228	108.766
Length  512, alignment  0, acc len 21:	3568.52	3460.77	32.1403	110.513
Length  512, alignment  5, acc len 21:	3579.43	3445.33	32.3676	110.26
Length  512, alignment  0, acc len 22:	3731.93	3574.64	32.6562	112.131
Length  512, alignment  6, acc len 22:	3593.7	3513.75	31.712	113.662
Length  512, alignment  0, acc len 23:	3899.12	3798.32	33.3421	113.752
Length  512, alignment  7, acc len 23:	3780.45	3649.02	32.6627	114.075
Length  512, alignment  0, acc len 24:	3705.25	3579.57	35.6307	114.15
Length  512, alignment  0, acc len 24:	3967.54	3866.18	33.8688	114.265
Length  512, alignment  0, acc len 25:	3744.28	3648.79	32.7681	114.713
Length  512, alignment  1, acc len 25:	3785.42	3661.69	36.2923	114.425
Length  512, alignment  0, acc len 26:	4142.5	4029.88	35.6899	114.899
Length  512, alignment  2, acc len 26:	4057.13	3934.84	34.8831	114.915
Length  512, alignment  0, acc len 27:	4027.67	3887.32	38.4689	115.313
Length  512, alignment  3, acc len 27:	3916.26	3798.77	38.7165	115.693
Length  512, alignment  0, acc len 28:	4329.61	4174.72	36.6861	115.864
Length  512, alignment  4, acc len 28:	4369.4	4230.17	39.0696	114.393
Length  512, alignment  0, acc len 29:	3995.37	3851.98	38.5862	115.957
Length  512, alignment  5, acc len 29:	4234.66	4109.18	41.4066	116.188
Length  512, alignment  0, acc len 30:	4511.06	4332.48	40.3588	116.472
Length  512, alignment  6, acc len 30:	4170.88	4022.48	41.0703	116.598
Length  512, alignment  0, acc len 31:	4521.69	4365.26	40.7933	116.902
Length  512, alignment  7, acc len 31:	4628.83	4492.56	42.9545	117.202
Length   32, alignment  0, acc len  4:	52.5355	43.6484	5.23536	15.9326
Length   32, alignment  1, acc len  4:	45.552	41.3702	5.22929	15.8095
Length   64, alignment  0, acc len  4:	103.975	76.2126	5.92029	21.5882
Length   64, alignment  2, acc len  4:	125.545	93.3049	6.13565	21.5875
Length  128, alignment  0, acc len  4:	239.404	164.031	7.58818	34.6127
Length  128, alignment  3, acc len  4:	208.7	159.147	7.44646	34.6478
Length  256, alignment  0, acc len  4:	525.345	370.674	10.3231	58.2144
Length  256, alignment  4, acc len  4:	579.087	395.787	11.0268	58.2586
Length  512, alignment  0, acc len  4:	1249.25	935.88	21.3232	105.863
Length  512, alignment  5, acc len  4:	1243.73	1020.16	20.9738	106.294
Length 1024, alignment  0, acc len  4:	2710.92	2332.64	32.3364	200.645
Length 1024, alignment  6, acc len  4:	2714.92	2309.63	32.6725	200.986
Length 2048, alignment  0, acc len  4:	5974.44	5372.36	54.7526	391.385
Length 2048, alignment  7, acc len  4:	6030.32	5367.72	54.8706	392.65
Length   64, alignment  1, acc len 10:	250.607	186.599	10.8846	23.0727
Length   64, alignment  2, acc len 10:	230.08	171.582	10.9436	23.0379
Length   64, alignment  3, acc len 10:	284.557	212.213	10.6227	23.1637
Length   64, alignment  4, acc len 10:	247.059	207.956	10.5483	23.0075
Length   64, alignment  5, acc len 10:	263.605	206.827	11.8137	23.2043
Length   64, alignment  6, acc len 10:	296.242	228.421	9.1267	22.6872
Length   64, alignment  7, acc len 10:	212.138	180.413	10.3011	22.9391
Length    0, alignment  0, acc len  6:	3.32213	8.07111	6.22917	5.96161
Length    1, alignment  0, acc len  6:	4.56817	8.48249	6.25301	5.99805
Length    2, alignment  0, acc len  6:	6.18119	10.7659	6.22741	6.29288
Length    3, alignment  0, acc len  6:	8.85657	12.3108	5.95951	6.44751
Length    4, alignment  0, acc len  6:	12.0155	13.8204	6.33045	6.99398
Length    5, alignment  0, acc len  6:	13.8749	14.895	8.15747	6.76193
Length    6, alignment  0, acc len  6:	12.4949	15.5357	6.31256	7.07944
Length    7, alignment  0, acc len  6:	14.3167	15.2466	6.17634	6.87319
Length    8, alignment  0, acc len  6:	24.751	30.125	6.22883	7.99929
Length    9, alignment  0, acc len  6:	21.4579	22.0981	5.84266	7.41211
Length   10, alignment  0, acc len  6:	23.5194	24.6926	6.13078	7.81329
Length   11, alignment  0, acc len  6:	30.3735	25.1774	5.96374	7.81649
Length   12, alignment  0, acc len  6:	26.2157	26.4195	6.26471	8.14905
Length   13, alignment  0, acc len  6:	36.1732	29.8355	6.2092	8.42183
Length   14, alignment  0, acc len  6:	30.1253	26.2416	6.15966	8.53726
Length   15, alignment  0, acc len  6:	33.7244	30.8948	6.23129	8.36203
Length   16, alignment  0, acc len  6:	31.2174	29.0324	6.0853	9.27288
Length   17, alignment  0, acc len  6:	36.2489	30.9087	6.59999	8.95024
Length   18, alignment  0, acc len  6:	51.8956	37.8179	6.35872	9.23441
Length   19, alignment  0, acc len  6:	41.3922	35.1824	6.07694	9.34331
Length   20, alignment  0, acc len  6:	48.9254	36.5903	6.02779	9.89817
Length   21, alignment  0, acc len  6:	43.7562	41.805	6.17979	9.96868
Length   22, alignment  0, acc len  6:	44.7326	37.0931	6.89179	10.0459
Length   23, alignment  0, acc len  6:	38.5002	38.7758	6.23219	10.57
Length   24, alignment  0, acc len  6:	52.4731	44.8537	6.1358	14.4954
Length   25, alignment  0, acc len  6:	57.187	42.8237	6.63698	14.7604
Length   26, alignment  0, acc len  6:	68.3347	53.6213	6.2175	14.6173
Length   27, alignment  0, acc len  6:	74.0379	57.1295	6.19542	14.6373
Length   28, alignment  0, acc len  6:	65.2933	62.7405	6.0781	15.8798
Length   29, alignment  0, acc len  6:	65.5747	56.0641	6.25383	15.4522
Length   30, alignment  0, acc len  6:	75.6583	56.2281	6.36321	15.9754
Length   31, alignment  0, acc len  6:	84.8686	62.3259	6.12682	15.3354
Length   32, alignment  0, acc len  6:	73.9963	56.9503	6.6889	15.8259
Length   33, alignment  0, acc len  6:	69.3866	56.496	6.39783	15.9334
Length   34, alignment  0, acc len  6:	67.1967	59.0674	6.42574	16.1852
Length   35, alignment  0, acc len  6:	85.8922	69.1943	6.68884	16.971
Length   36, alignment  0, acc len  6:	72.5091	58.7165	9.30953	16.5378
Length   37, alignment  0, acc len  6:	104.197	75.3247	6.94711	16.6234
Length   38, alignment  0, acc len  6:	89.9286	62.3453	6.45051	16.8367
Length   39, alignment  0, acc len  6:	80.9778	68.4045	6.42049	16.747
Length   40, alignment  0, acc len  6:	84.8193	66.9211	6.96988	17.2986
Length   41, alignment  0, acc len  6:	109.166	75.6623	6.42093	17.6362
Length   42, alignment  0, acc len  6:	94.7032	72.7409	6.41623	17.8529
Length   43, alignment  0, acc len  6:	98.0899	75.8267	6.42737	17.7268
Length   44, alignment  0, acc len  6:	103.917	71.6253	6.48582	18.3325
Length   45, alignment  0, acc len  6:	86.8204	73.0092	6.94889	18.01
Length   46, alignment  0, acc len  6:	97.1165	82.2894	6.39182	18.2828
Length   47, alignment  0, acc len  6:	102.259	83.406	6.38933	18.1787
Length   48, alignment  0, acc len  6:	121.315	87.8436	6.81386	18.677
Length   49, alignment  0, acc len  6:	103.611	85.5284	6.9292	19.1381
Length   50, alignment  0, acc len  6:	121.671	87.2411	6.96069	19.1947
Length   51, alignment  0, acc len  6:	121.456	85.9471	7.10546	19.0865
Length   52, alignment  0, acc len  6:	123.801	86.2739	6.88835	19.4765
Length   53, alignment  0, acc len  6:	153.225	110.283	6.80725	19.6336
Length   54, alignment  0, acc len  6:	114.041	108.634	6.80306	20.1115
Length   55, alignment  0, acc len  6:	120.472	91.8054	6.85198	20.5753
Length   56, alignment  0, acc len  6:	137.04	92.8001	7.16925	20.1679
Length   57, alignment  0, acc len  6:	111.641	85.7518	6.87663	20.5458
Length   58, alignment  0, acc len  6:	146.098	115.752	7.0804	20.6941
Length   59, alignment  0, acc len  6:	144.115	105.135	6.807	21.4629
Length   60, alignment  0, acc len  6:	146.968	101.309	7.01036	21.1251
Length   61, alignment  0, acc len  6:	154.694	104.987	7.03012	21.0183
Length   62, alignment  0, acc len  6:	138.684	100.28	7.02394	21.3737
Length   63, alignment  0, acc len  6:	132.218	108.357	6.82063	22.4299


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] powerpc: Add optimized P8 strspn
  2016-03-25 17:10 [PATCH] powerpc: Add optimized P8 strspn Paul E. Murphy
@ 2016-03-28 17:55 ` Adhemerval Zanella
  0 siblings, 0 replies; 2+ messages in thread
From: Adhemerval Zanella @ 2016-03-28 17:55 UTC (permalink / raw)
  To: libc-alpha

LGTM with some comments below:

On 25-03-2016 14:01, Paul E. Murphy wrote:
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S
> @@ -0,0 +1,44 @@
> +/* Optimized strspn implementation for POWER8.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#if IS_IN (libc)

AFAIK strspn is not built either for the loader or for another library,
so I think there is no need for this preprocessor check.

> +
> +#undef EALIGN
> +#define EALIGN(name, alignt, words)				\
> +  .section ".text";						\
> +  ENTRY_2(__strspn_power8)					\
> +  .align ALIGNARG(alignt);					\
> +  EALIGN_W_##words;						\
> +  BODY_LABEL(__strspn_power8):					\
> +  cfi_startproc;						\
> +  LOCALENTRY(__strspn_power8)
> +
> +#undef END
> +#define END(name)						\
> +  cfi_endproc;							\
> +  TRACEBACK(__strspn_power8)					\
> +  END_2(__strspn_power8)
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#endif
> +
> +#include <sysdeps/powerpc/powerpc64/power8/strspn.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S
> new file mode 100644
> index 0000000..4e870a9
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.S
> @@ -0,0 +1,44 @@
> +/* Optimized strspn implementation for POWER8.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#if IS_IN (libc)

Same as before.

> +
> +#undef EALIGN
> +#define EALIGN(name, alignt, words)				\
> +  .section ".text";						\
> +  ENTRY_2(__strspn_ppc)						\
> +  .align ALIGNARG(alignt);					\
> +  EALIGN_W_##words;						\
> +  BODY_LABEL(__strspn_ppc):					\
> +  cfi_startproc;						\
> +  LOCALENTRY(__strspn_ppc)
> +
> +#undef END
> +#define END(name)						\
> +  cfi_endproc;							\
> +  TRACEBACK(__strspn_ppc)					\
> +  END_2(__strspn_ppc)
> +
> +#undef libc_hidden_builtin_def
> +# define libc_hidden_builtin_def(name)
> +
> +#endif
> +
> +#include <sysdeps/powerpc/powerpc64/strspn.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
> new file mode 100644
> index 0000000..8769de3
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
> @@ -0,0 +1,37 @@
> +/* Multiple versions of strspn. PowerPC64 version.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)

Same as before.

> +# include <string.h>
> +# include <shlib-compat.h>
> +# include "init-arch.h"
> +
> +#undef strspn
> +extern __typeof (strspn) __libc_strspn;
> +
> +extern __typeof (strspn) __strspn_ppc attribute_hidden;
> +extern __typeof (strspn) __strspn_power8 attribute_hidden;
> +
> +libc_ifunc (__libc_strspn,
> +	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> +	    ? __strspn_power8
> +	    : __strspn_ppc);
> +
> +weak_alias (__libc_strspn, strspn)
> +libc_hidden_builtin_def (strspn)
> +#endif
> diff --git a/sysdeps/powerpc/powerpc64/power8/strspn.S b/sysdeps/powerpc/powerpc64/power8/strspn.S
> new file mode 100644
> index 0000000..dd1838e
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power8/strspn.S
> @@ -0,0 +1,179 @@
> +/* Optimized strspn implementation for Power8.
> +
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* size_t [r3] strspn (const char *string [r3],
> +                       const char *needleAccept [r4]  */
> +
> +/* This takes a novel approach by computing a 256 bit mask whereby
> +   each set bit implies the byte is "accepted". P8 vector hardware
> +   has extremely efficient hardware for selecting bits from a mask.
> +
> +   One might ask "why not use bpermd for short strings"?  It is
> +   so slow that its performance about matches the generic PPC64
> +   variant without any fancy masking, with the added expense of
> +   making the mask. That was the first variant of this.  */
> +
> +
> +
> +#include "sysdep.h"
> +
> +/* Use a VSX instruction on VRs; VR n overlaps VSX register 32+n.  */
> +#define XXVR(insn, vrt, vra, vrb) \
> +	insn 32+vrt, 32+vra, 32+vrb
> +
> +/* ISA 2.07B instructions are not all defined for older binutils.
> +   The macros below emit their encodings directly with .long in
> +   order to maintain compatibility.  */
> +
> +/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs.  */
> +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
> +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
> +
> +#define VBPERMQ(t,a,b) .long (0x1000054c \
> +			      | ((t)<<(32-11))	\
> +			      | ((a)<<(32-16))	\
> +			      | ((b)<<(32-21)) )
> +
> +	/* This can be updated to power8 once the minimum version of
> +	   binutils supports power8 and the above instructions.  */
> +	.machine power7
> +EALIGN(strspn, 4, 0)
> +	CALL_MCOUNT 2
> +
> +	/* Generate useful constants for later on.  */
> +	vspltisb v1, 7
> +	vspltisb v2, -1		/* All ones.  */
> +	vslb	v1, v1, v1	/* 0x80 to swap high bit for vbpermq.  */
> +	vspltisb v10, 0		/* All zeros.  */
> +	vsldoi	v4, v10, v2, 2	/* 0xFFFF into vr4.  */
> +	XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches.  */
> +
> +	/* Prepare to compute 256b mask.  */
> +	addi	r4, r4, -1	/* Bias for the lbzu update form below.  */
> +	li	r5, 0
> +	li	r6, 0
> +	li	r7, 0
> +	li	r8, 0
> +	li	r11, 1
> +	sldi	r11, r11, 63	/* Only the most significant bit set.  */
> +
> +	/* Start interleaved Mask computation.
> +	   This will eventually or 1's into ignored bits from vbpermq.  */
> +	lvsr	v11, r0, r3
> +	vspltb  v11, v11, 0	/* Splat shift constant.  */
> +
> +	/* Build a 256b mask in r5-r8.  */
> +	.align 4
> +L(next_needle):
> +	lbzu	r9, 1(r4)	/* Next accept byte; r4 advances by one.  */
> +
> +	cmpldi	cr0, r9, 0	/* End of the accept string?  */
> +	cmpldi	cr1, r9, 128	/* Selects r5/r6 versus r7/r8 below.  */
> +
> +	/* This is a little tricky. srd only uses the first 7 bits,
> +	   and if bit 7 is set, value is always 0. So, we can
> +	   effectively shift 128b in this case.  */
> +	xori	r12, r9,  0x40	/* Invert bit 6.  */
> +	srd	r10, r11, r9	/* Mask for bits 0-63.  */
> +	srd	r12, r11, r12	/* Mask for bits 64-127.  */
> +
> +	beq	cr0, L(start_cmp)
> +
> +	/* Now, or the value into the correct GPR.  */
> +	bge cr1,L(needle_gt128)	/* Taken for bytes 128 and above.  */
> +	or	r5, r5, r10	/* 0 - 63.  */
> +	or	r6, r6, r12	/* 64 - 127.  */
> +	b L(next_needle)
> +
> +	.align 4
> +L(needle_gt128):
> +	or	r7, r7, r10	/* 128 - 191.  */
> +	or	r8, r8, r12	/* 192 - 255.  */
> +	b L(next_needle)
> +
> +
> +	.align 4
> +L(start_cmp):
> +	/* Move and merge bitmap into 2 VRs.  bpermd is slower on P8.  */
> +	mr	r0, r3		/* Save r3 for final length computation.  */
> +	MTVRD (v5, r5)
> +	MTVRD (v6, r6)
> +	MTVRD (v7, r7)
> +	MTVRD (v8, r8)
> +
> +	/* Continue interleaved mask generation.  */
> +#ifdef __LITTLE_ENDIAN__
> +	vsrw	v11, v2, v11	/* Note, shift ignores higher order bits. */
> +	vsplth  v11, v11, 0	/* Only care about the high 16 bits of v11.  */
> +#else
> +	vslw	v11, v2, v11	/* Note, shift ignores higher order bits. */
> +	vsplth  v11, v11, 1	/* Only care about the low 16 bits of v11.  */
> +#endif
> +	lvx	v0, r0, r3	/* Note, unaligned load ignores lower bits.  */
> +
> +	/* Do the merging of the bitmask.  */
> +	XXVR(xxmrghd, v5, v5, v6)	/* v5 = mask for bytes 0 - 127.  */
> +	XXVR(xxmrghd, v6, v7, v8)	/* v6 = mask for bytes 128 - 255.  */
> +
> +	/* Finish mask generation.  */
> +	vand	v11, v11, v4	/* Throwaway bits not in the mask.  */
> +
> +	/* Compare the first 1-16B, while masking unwanted bytes.  */
> +	clrrdi  r3, r3, 4	/* Note,  counts from qw boundaries.  */
> +	vxor	v9, v0, v1	/* Swap high bit.  */
> +	VBPERMQ (v8, v5, v0)
> +	VBPERMQ (v7, v6, v9)
> +	vor	v7, v7, v8
> +	vor	v7, v7, v11	/* Ignore non-participating bytes.  */
> +	vcmpequh. v8, v7, v4
> +	bnl	cr6, L(done)	/* Stop unless v7 equals v4 everywhere.  */
> +
> +	addi	r3, r3, 16	/* Step to the next quadword.  */
> +
> +	.align 4
> +L(vec):
> +	lvx	v0, r0, r3
> +	addi	r3, r3, 16	/* Post-increment; undone after the loop.  */
> +	vxor	v9, v0, v1	/* Swap high bit.  */
> +	VBPERMQ (v8, v5, v0)
> +	VBPERMQ (v7, v6, v9)
> +	vor	v7, v7, v8
> +	vcmpequh. v8, v7, v4
> +	blt	cr6, L(vec)	/* Loop while v7 equals v4 everywhere.  */
> +
> +	addi	r3, r3, -16	/* Back to the quadword that stopped us.  */
> +L(done):
> +	subf	r3, r0, r3	/* Quadword offset from the saved start.  */
> +	MFVRD (r10, v7)		/* Move the match bits to a GPR.  */
> +
> +#ifdef __LITTLE_ENDIAN__
> +	addi	r0,  r10, 1	/* Count the trailing 1's.  */
> +	andc	r10, r10, r0
> +	popcntd	r10, r10
> +#else
> +	xori	r10, r10, 0xffff /* Count leading 1's by inverting.  */
> +	addi	r3,  r3,  -48	/* Account for the extra leading zeros.  */
> +	cntlzd  r10, r10
> +#endif
> +
> +	add	r3, r3, r10	/* Result: offset plus matched bytes.  */
> +	blr
> +
> +END(strspn)
> +libc_hidden_builtin_def (strspn)
> 

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2016-03-28 17:55 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-25 17:10 [PATCH] powerpc: Add optimized P8 strspn Paul E. Murphy
2016-03-28 17:55 ` Adhemerval Zanella

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).