public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH 4/4] Remove powerpc64 strspn, strcspn, and strpbrk implementation
  2016-03-31 14:01 [PATCH 0/4] Improve generic strspn/strcspn/strpbrk Adhemerval Zanella
@ 2016-03-31 14:01 ` Adhemerval Zanella
  2016-03-31 14:01 ` [PATCH 2/4] Improve generic strspn performance Adhemerval Zanella
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 12+ messages in thread
From: Adhemerval Zanella @ 2016-03-31 14:01 UTC (permalink / raw)
  To: libc-alpha

This patch removes the powerpc64 optimized strspn, strcspn, and
strpbrk assembly implementation now that the default C one
implements the same strategy.  On internal glibc benchtests
current implementations shows similar performance with -O2.

Tested on powerpc64le (POWER8).

	* sysdeps/powerpc/powerpc64/strcspn.S: Remove file.
	* sysdeps/powerpc/powerpc64/strpbrk.S: Remove file.
	* sysdeps/powerpc/powerpc64/strspn.S: Remove file.
---
 ChangeLog                           |   4 +
 sysdeps/powerpc/powerpc64/strcspn.S | 127 -------------------------------
 sysdeps/powerpc/powerpc64/strpbrk.S | 135 ---------------------------------
 sysdeps/powerpc/powerpc64/strspn.S  | 144 ------------------------------------
 4 files changed, 4 insertions(+), 406 deletions(-)
 delete mode 100644 sysdeps/powerpc/powerpc64/strcspn.S
 delete mode 100644 sysdeps/powerpc/powerpc64/strpbrk.S
 delete mode 100644 sysdeps/powerpc/powerpc64/strspn.S

diff --git a/sysdeps/powerpc/powerpc64/strcspn.S b/sysdeps/powerpc/powerpc64/strcspn.S
deleted file mode 100644
index 31e619d..0000000
--- a/sysdeps/powerpc/powerpc64/strcspn.S
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Optimized strcspn implementation for PowerPC64.
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-/* size_t [r3] strcspn (const char [r4] *s, const char [r5] *reject)  */
-
-EALIGN (strcspn, 4, 0)
-	CALL_MCOUNT 3
-
-	/* The idea to speed up the algorithm is to create a lookup table
-	   for fast check if input character should be considered.  For ASCII
-	   or ISO-8859-X character sets it has 256 positions.  */
-
-	/* PPC64 ELF ABI stack is aligned to 16 bytes.  */
-	addi 	r9,r1,-256
-	/* Clear the table with 0 values  */
-	li	r6, 0
-	li	r8, 4
-	mtctr	r8
-	mr	r10, r9
-	.align 	4
-L(zerohash):
-	std	r6, 0(r10)
-	std	r6, 8(r10)
-	std	r6, 16(r10)
-	std	r6, 24(r10)
-	std	r6, 32(r10)
-	std	r6, 40(r10)
-	std	r6, 48(r10)
-	std	r6, 56(r10)
-	addi	r10, r10, 64
-	bdnz	L(zerohash)
-
-	lbz	r10,0(r4)
-	cmpdi	cr7,r10,0	/* reject[0] == '\0' ?  */
-	li	r8,1
-	beq     cr7,L(finish_table)  /* If reject[0] == '\0' skip  */
-
-	/* Initialize the table as:
-	   for (i=0; reject[i]; i++
-	     table[reject[i]]] = 1  */
-	.align	4
-L(init_table):
-	stbx	r8,r9,r10
-	lbzu	r10,1(r4)
-	cmpdi	cr7,r10,0           /* If reject[0] == '\0' finish  */
-	bne	cr7,L(init_table)
-L(finish_table):
-	/* set table[0] = 1  */
-	li 	r10,1
-	stb	r10,0(r9)
-	li	r10,0
-	b	L(mainloop)
-
-	/* Unrool the loop 4 times and check using the table as:
-	   i = 0;
-	   while (1)
-	     {
-	       if (table[input[i++]] == 1)
-	         return i - 1;
-	       if (table[input[i++]] == 1)
-	         return i - 1;
-	       if (table[input[i++]] == 1)
-	         return i - 1;
-	       if (table[input[i++]] == 1)
-	         return i - 1;
-	     }  */
-	.align 4
-L(unroll):
-	lbz	r8,1(r3)
-	addi	r10,r10,4
-	lbzx	r8,r9,r8
-	cmpwi	r7,r8,1
-	beq	cr7,L(end)
-	lbz	r8,2(r3)
-	addi	r3,r3,4
-	lbzx	r8,r9,r8
-	cmpwi	cr7,r8,1
-	beq	cr7,L(end2)
-	lbz	r8,3(r7)
-	lbzx	r8,r9,r8
-	cmpwi	cr7,r8,1
-	beq	cr7,L(end3)
-L(mainloop):
-	lbz	r8,0(r3)
-	mr	r7,r3
-	addi	r6,r10,1
-	addi	r4,r10,2
-	addi	r5,r10,3
-	lbzx	r8,r9,8
-	cmpwi	cr7,r8,1
-	bne	cr7,L(unroll)
-	mr	r3,r10
-	blr
-
-	.align 4
-L(end):
-	mr	r3,r6
-	blr
-
-	.align 4
-L(end2):
-	mr	r3,r4
-	blr
-
-	.align 4
-L(end3):
-	mr	r3,r5
-	blr
-END (strcspn)
-libc_hidden_builtin_def (strcspn)
diff --git a/sysdeps/powerpc/powerpc64/strpbrk.S b/sysdeps/powerpc/powerpc64/strpbrk.S
deleted file mode 100644
index 5e9d1a6..0000000
--- a/sysdeps/powerpc/powerpc64/strpbrk.S
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Optimized strpbrk implementation for PowerPC64.
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-/* char [r3] *strpbrk(const char [r4] *s, const char [r5] *accept)  */
-
-EALIGN (strpbrk, 4, 0)
-	CALL_MCOUNT 3
-
-	lbz	r10,0(r4)
-	cmpdi	cr7,r10,0	/* accept[0] == '\0' ?  */
-	beq	cr7,L(nullfound)
-
-	/* The idea to speed up the algorithm is to create a lookup table
-	   for fast check if input character should be considered.  For ASCII
-	   or ISO-8859-X character sets it has 256 positions.  */
-
-	/* PPC64 ELF ABI stack is aligned to 16 bytes.  */
-	addi 	r9,r1,-256
-	/* Clear the table with 0 values  */
-	li	r6, 0
-	li	r7, 4
-	mtctr	r7
-	mr	r8, r9
-	.align 	4
-L(zerohash):
-	std	r6, 0(r8)
-	std	r6, 8(r8)
-	std	r6, 16(r8)
-	std	r6, 24(r8)
-	std	r6, 32(r8)
-	std	r6, 40(r8)
-	std	r6, 48(r8)
-	std	r6, 56(r8)
-	addi	r8, r8, 64
-	bdnz	L(zerohash)
-
-	/* Initialize the table as:
-	   for (i=0; accept[i]; i++
-	     table[accept[i]]] = 1  */
-	li      r0,1
-	.align 4
-L(init_table):
-	stbx	r0,r9,r10
-	lbzu	r10,1(r4)
-	cmpdi	r0,r10,0
-	bne	cr0,L(init_table)
-L(finish_table):
-	/* set table[0] = 1  */
-	li	r4,1
-	stb	r4,0(r9)
-	b	L(mainloop)
-
-	/* Unrool the loop 4 times and check using the table as:
-	   i = 0;
-	   while (1)
-	     {
-	       if (table[input[i++]] == 1)
-	         return (s[i -1] ? s + i - 1: NULL);
-	       if (table[input[i++]] == 1)
-	         return (s[i -1] ? s + i - 1: NULL);
-	       if (table[input[i++]] == 1)
-	         return (s[i -1] ? s + i - 1: NULL);
-	       if (table[input[i++]] == 1)
-	         return (s[i -1] ? s + i - 1: NULL);
-	     }  */
-	.align 4
-L(unroll):
-	lbz	r0,1(r3)
-	lbzx	r8,r9,r0
-	cmpwi	cr6,r8,1
-	beq	cr6,L(checkend2)
-	lbz	r10,2(r3)
-	lbzx	r4,r9,r10
-	cmpwi	cr7,r4,1
-	beq	cr7,L(checkend3)
-	lbz	r12,3(r3)
-	addi	r3,r3,4
-	lbzx	r11,r9,r12
-	cmpwi	cr0,r11,1
-	beq	cr0,L(checkend)
-L(mainloop):
-	lbz	r12,0(r3)
-	addi	r11,r3,1
-	addi	r5,r3,2
-	addi	r7,r3,3
-	lbzx	r6,r9,r12
-	cmpwi	cr1,r6,1
-	bne	cr1,L(unroll)
-	cmpdi	cr0,r12,0
-	beq	cr0,L(nullfound)
-L(end):
-	blr
-
-	.align 4
-L(checkend):
-	cmpdi	cr1,r12,0
-	mr	r3,r7
-	bne	cr1,L(end)
-L(nullfound):
-	/* return NULL  */
-	li 3,0
-	blr
-
-	.align 4
-L(checkend2):
-	cmpdi	cr7,r0,0
-	mr	r3,r11
-	beq	cr7,L(nullfound)
-	blr
-
-	.align 4
-L(checkend3):
-	cmpdi	cr6,r10,0
-	mr	r3,r5
-	beq	cr6,L(nullfound)
-	blr
-END (strpbrk)
-libc_hidden_builtin_def (strpbrk)
diff --git a/sysdeps/powerpc/powerpc64/strspn.S b/sysdeps/powerpc/powerpc64/strspn.S
deleted file mode 100644
index cf10da1..0000000
--- a/sysdeps/powerpc/powerpc64/strspn.S
+++ /dev/null
@@ -1,144 +0,0 @@
-/* Optimized strspn implementation for PowerPC64.
-
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* size_t [r3] strspn (const char *string [r3],
-                       const char *needleAccept [r4]  */
-
-/* Performance gains are grabbed through following techniques:
-
-   > hashing of needle.
-   > hashing avoids scanning of duplicate entries in needle
-     across the string.
-   > unrolling when scanning for character in string
-     across hash table.  */
-
-/* Algorithm is as below:
-   1. A empty hash table/dictionary is created comprising of
-      256 ascii character set
-   2. When hash entry is found in needle , the hash index
-      is initialized to 1
-   3. The string is scanned until end and for every character,
-      its corresponding hash index is compared.
-   4. initial length of string (count) until first hit of
-      accept needle to be found is set to 0
-   4. If hash index is set to 1 for the index of string,
-      count is returned.
-   5. Otherwise count is incremented and scanning continues
-      until end of string.  */
-
-#include <sysdep.h>
-
-EALIGN(strspn, 4, 0)
-	CALL_MCOUNT 3
-
-	/* PPC64 ELF ABI stack is aligned to 16 bytes.  */
-	addi 	r9,r1,-256
-	/* Clear the table with 0 values  */
-	li	r6, 0
-	li	r8, 4
-	mtctr	r8
-	mr	r10, r9
-	.align 	4
-L(zerohash):
-	std	r6, 0(r10)
-	std	r6, 8(r10)
-	std	r6, 16(r10)
-	std	r6, 24(r10)
-	std	r6, 32(r10)
-	std	r6, 40(r10)
-	std	r6, 48(r10)
-	std	r6, 56(r10)
-	addi	r10, r10, 64
-	bdnz	L(zerohash)
-
-	lbz	r10,0(r4)
-	li r8, 1		/* r8=1, marker into hash if found in
-				   needle  */
-	cmpdi cr7, r10, 0	/* accept needle is NULL  */
-	beq cr7, L(skipHashing)	/* if needle is NULL, skip hashing  */
-
-	.align 4		/* align section to 16 byte boundary  */
-L(hashing):
-	stbx r8, r9, r10	/* update hash with marker for the pivot of
-				   the needle  */
-	lbzu r10, 1(r4)		/* load needle into r10 and update to next  */
-	cmpdi cr7, r10, 0	/* if needle is has reached NULL, continue  */
-	bne cr7, L(hashing)	/* loop to hash the needle  */
-
-L(skipHashing):
-	li r10, 0		/* load counter = 0  */
-	b L(beginScan)
-
-	.align 4		/* align section to 16 byte boundary  */
-L(scanUnroll):
-	lbzx r8, r9, r8		/* load r8 with hash value at index  */
-	cmpwi cr7, r8, 0	/* if we hit marker in hash, we have found
-				   accept needle  */
-	beq cr7, L(ret1stIndex)	/* we have hit accept needle, return the
-				   count  */
-
-	lbz r8, 1(r3)		/* load string[1] into r8  */
-	addi r10, r10, 4	/* increment counter  */
-	lbzx r8, r9, r8		/* load r8 with hash value at index  */
-	cmpwi cr7, r8, 0	/* if we hit marker in hash, we have found
-				   accept needle  */
-	beq cr7, L(ret2ndIndex)	/* we have hit accept needle, return the
-				   count  */
-
-	lbz r8, 2(r3)		/* load string[2] into r8  */
-	lbzx r8, r9, r8		/* load r8 with hash value at index  */
-	cmpwi cr7, r8, 0	/* if we hit marker in hash, we have found
-				   accept needle  */
-	beq cr7, L(ret3rdIndex)	/* we have hit accept needle, return the
-				   count  */
-
-	lbz r8, 3(r3)		/* load string[3] into r8  */
-	lbzx r8, r9, r8		/* load r8 with hash value at index  */
-	addi r3, r3, 4		/* unroll factor , increment string by 4  */
-	cmpwi cr7, r8, 0	/* if we hit marker in hash, we have found
-				   accept needle  */
-	beq cr7,L(ret4thIndex)	/* we have hit accept needle, return the
-				   count  */
-
-L(beginScan):
-	lbz r8, 0(r3)		/* load string[0] into r8  */
-	addi r6, r10, 1		/* place holder for counter + 1  */
-	addi r5, r10, 2		/* place holder for counter + 2  */
-	addi r4, r10, 3		/* place holder for counter + 3  */
-	cmpdi cr7, r8, 0	/* if we hit marker in hash, we have found
-				   accept needle  */
-	bne cr7, L(scanUnroll)	/* continue scanning  */
-
-L(ret1stIndex):
-	mr r3, r10		/* update r3 for return  */
-	blr			/* return  */
-
-L(ret2ndIndex):
-	mr r3, r6		/* update r3 for return  */
-	blr			/* return  */
-
-L(ret3rdIndex):
-	mr r3, r5		/* update r3 for return  */
-	blr			/* return  */
-
-L(ret4thIndex):
-	mr r3, r4		/* update r3 for return  */
-	blr			/* done  */
-END(strspn)
-libc_hidden_builtin_def (strspn)
-- 
1.9.1

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 0/4] Improve generic strspn/strcspn/strpbrk
@ 2016-03-31 14:01 Adhemerval Zanella
  2016-03-31 14:01 ` [PATCH 4/4] Remove powerpc64 strspn, strcspn, and strpbrk implementation Adhemerval Zanella
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Adhemerval Zanella @ 2016-03-31 14:01 UTC (permalink / raw)
  To: libc-alpha

Changes from previous version:

 * Add some optimization on strspn implementation from Wilco
   comments.

 * Move the inline __str{spn,cspn,brk}_* symbol from the string2.h
   header to string-inlike and added compatibility version since
   they will not be generated anymore.

---


This is a followup on previous Wilco patch [1] to optimize strcspn
that also optimizes the generic strspn and strpbrk.  I used the
same strategy Wilco has used on strcspn for strspn and rewrote
strpbrk to just use strcspn instead.

I also tried to play with compiler options to check if it could
omit the memset call generation when initializing the lookup
table, but without success.  This is a similar strategy I used
on powerpc64 str{c}spn optimization.

Wilco initial approach was to remove the __strcspn_c{1,2,3}
inline function in string2.h header, however they are part of
ABI (to support compilers that do not inline the calls) and it
is not safe to remove then.  I have added it back, although the
strcspn new macro does not uses them and I also used the same
strategy for both strspn and strpbrk.

Performance-wise the algorithm is similar with current optimized
assembly one already in GLIBC (x86 and powerpc).  In fact, for
powerpc64 the algorithm performance is similar to assembly
routines which lead me to remove them.  i686 default one
is slight faster, while the SSE4.1 variant shows much better
performance (through the use of SIMD instructions).

[1] https://sourceware.org/ml/libc-alpha/2016-01/msg00173.html

Adhemerval Zanella (3):
  Improve generic strspn performance
  Improve generic strpbrk performance
  Remove powerpc64 strspn, strcspn, and strpbrk implementation

Wilco Dijkstra (1):
  Improve generic strcspn performance

 ChangeLog                           |  46 ++++++++
 string/Versions                     |   2 +
 string/bits/string2.h               | 208 ++----------------------------------
 string/strcspn.c                    |  44 ++++++--
 string/string-inlines.c             |  97 +++++++++++++++++
 string/strpbrk.c                    |  12 +--
 string/strspn.c                     |  54 +++++++---
 sysdeps/i386/string-inlines.c       |  19 +---
 sysdeps/powerpc/powerpc64/strcspn.S | 127 ----------------------
 sysdeps/powerpc/powerpc64/strpbrk.S | 135 -----------------------
 sysdeps/powerpc/powerpc64/strspn.S  | 144 -------------------------
 11 files changed, 233 insertions(+), 655 deletions(-)
 delete mode 100644 sysdeps/powerpc/powerpc64/strcspn.S
 delete mode 100644 sysdeps/powerpc/powerpc64/strpbrk.S
 delete mode 100644 sysdeps/powerpc/powerpc64/strspn.S

-- 
1.9.1

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 2/4] Improve generic strspn performance
  2016-03-31 14:01 [PATCH 0/4] Improve generic strspn/strcspn/strpbrk Adhemerval Zanella
  2016-03-31 14:01 ` [PATCH 4/4] Remove powerpc64 strspn, strcspn, and strpbrk implementation Adhemerval Zanella
@ 2016-03-31 14:01 ` Adhemerval Zanella
  2016-03-31 14:01 ` [PATCH 3/4] Improve generic strpbrk performance Adhemerval Zanella
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 12+ messages in thread
From: Adhemerval Zanella @ 2016-03-31 14:01 UTC (permalink / raw)
  To: libc-alpha

As for strcspn, this patch improves strspn performance using a much
faster algorithm.  It first constructs a 256-entry table based on
the accept string and then uses it as a lookup table for the
input string.  As for strcspn optimization, it is generally at least
10 times faster than the existing implementation on bench-strspn
on a few AArch64 implementations.

Also the string/bits/string2.h inlines make no longer sense, as current
implementation will already implement most of the optimizations.

Tested on x86_64, i686, and aarch64.

	* string/strspn.c (strcspn): Rewrite function.
	* string/bits/string2.h (strspn): Use __builtin_strcspn.
	(__strspn_c1): Remove inline function.
	(__strspn_c2): Likewise.
	(__strspn_c3): Likewise.
	* string/string-inlines.c
	[SHLIB_COMPAT(libc, GLIBC_2_1_1, GLIBC_2_24)] (__strspn_c1): Add
	compatibility symbol.
	[SHLIB_COMPAT(libc, GLIBC_2_1_1, GLIBC_2_24)] (__strspn_c2):
	Likewise.
	[SHLIB_COMPAT(libc, GLIBC_2_1_1, GLIBC_2_24)] (__strspn_c3):
	Likewise.
---
 ChangeLog               | 15 ++++++++++
 string/bits/string2.h   | 74 ++-----------------------------------------------
 string/string-inlines.c | 36 ++++++++++++++++++++++++
 string/strspn.c         | 54 ++++++++++++++++++++++++++----------
 4 files changed, 94 insertions(+), 85 deletions(-)

diff --git a/string/bits/string2.h b/string/bits/string2.h
index a8df0db..75a66a1 100644
--- a/string/bits/string2.h
+++ b/string/bits/string2.h
@@ -914,78 +914,10 @@ __stpcpy_small (char *__dest,
 
 /* Return the length of the initial segment of S which
    consists entirely of characters in ACCEPT.  */
-#if !defined _HAVE_STRING_ARCH_strspn || defined _FORCE_INLINES
-# ifndef _HAVE_STRING_ARCH_strspn
-#  if __GNUC_PREREQ (3, 2)
-#   define strspn(s, accept) \
-  __extension__								      \
-  ({ char __a0, __a1, __a2;						      \
-     (__builtin_constant_p (accept) && __string2_1bptr_p (accept)	      \
-      ? ((__builtin_constant_p (s) && __string2_1bptr_p (s))		      \
-	 ? __builtin_strspn (s, accept)					      \
-	 : ((__a0 = ((const char *) (accept))[0], __a0 == '\0')		      \
-	    ? ((void) (s), (size_t) 0)					      \
-	    : ((__a1 = ((const char *) (accept))[1], __a1 == '\0')	      \
-	       ? __strspn_c1 (s, __a0)					      \
-	       : ((__a2 = ((const char *) (accept))[2], __a2 == '\0')	      \
-		  ? __strspn_c2 (s, __a0, __a1)				      \
-		  : (((const char *) (accept))[3] == '\0'		      \
-		     ? __strspn_c3 (s, __a0, __a1, __a2)		      \
-		     : __builtin_strspn (s, accept))))))		      \
-      : __builtin_strspn (s, accept)); })
-#  else
-#   define strspn(s, accept) \
-  __extension__								      \
-  ({ char __a0, __a1, __a2;						      \
-     (__builtin_constant_p (accept) && __string2_1bptr_p (accept)	      \
-      ? ((__a0 = ((const char *) (accept))[0], __a0 == '\0')		      \
-	 ? ((void) (s), (size_t) 0)					      \
-	 : ((__a1 = ((const char *) (accept))[1], __a1 == '\0')		      \
-	    ? __strspn_c1 (s, __a0)					      \
-	    : ((__a2 = ((const char *) (accept))[2], __a2 == '\0')	      \
-	       ? __strspn_c2 (s, __a0, __a1)				      \
-	       : (((const char *) (accept))[3] == '\0'			      \
-		  ? __strspn_c3 (s, __a0, __a1, __a2)			      \
-		  : strspn (s, accept)))))				      \
-      : strspn (s, accept)); })
-#  endif
+#ifndef _HAVE_STRING_ARCH_strspn
+# if __GNUC_PREREQ (3, 2)
+#  define strspn(s, accept) __builtin_strspn (s, accept)
 # endif
-
-__STRING_INLINE size_t __strspn_c1 (const char *__s, int __accept);
-__STRING_INLINE size_t
-__strspn_c1 (const char *__s, int __accept)
-{
-  size_t __result = 0;
-  /* Please note that __accept never can be '\0'.  */
-  while (__s[__result] == __accept)
-    ++__result;
-  return __result;
-}
-
-__STRING_INLINE size_t __strspn_c2 (const char *__s, int __accept1,
-				    int __accept2);
-__STRING_INLINE size_t
-__strspn_c2 (const char *__s, int __accept1, int __accept2)
-{
-  size_t __result = 0;
-  /* Please note that __accept1 and __accept2 never can be '\0'.  */
-  while (__s[__result] == __accept1 || __s[__result] == __accept2)
-    ++__result;
-  return __result;
-}
-
-__STRING_INLINE size_t __strspn_c3 (const char *__s, int __accept1,
-				    int __accept2, int __accept3);
-__STRING_INLINE size_t
-__strspn_c3 (const char *__s, int __accept1, int __accept2, int __accept3)
-{
-  size_t __result = 0;
-  /* Please note that __accept1 to __accept3 never can be '\0'.  */
-  while (__s[__result] == __accept1 || __s[__result] == __accept2
-	 || __s[__result] == __accept3)
-    ++__result;
-  return __result;
-}
 #endif
 
 
diff --git a/string/string-inlines.c b/string/string-inlines.c
index 83bdd6c..754b315 100644
--- a/string/string-inlines.c
+++ b/string/string-inlines.c
@@ -71,4 +71,40 @@ __old_strcspn_c3 (const char *__s, int __reject1, int __reject2,
   return __result;
 }
 compat_symbol (libc, __old_strcspn_c3, __strcspn_c3, GLIBC_2_1_1);
+
+size_t
+__old_strspn_c1 (const char *__s, int __accept)
+{
+  size_t __result = 0;
+  /* Please note that __accept never can be '\0'.  */
+  while (__s[__result] == __accept)
+    ++__result;
+  return __result;
+}
+compat_symbol (libc, __old_strspn_c1, __strspn_c1, GLIBC_2_1_1);
+
+size_t
+__old_strspn_c2 (const char *__s, int __accept1, int __accept2)
+{
+  size_t __result = 0;
+  /* Please note that __accept1 and __accept2 never can be '\0'.  */
+  while (__s[__result] == __accept1 || __s[__result] == __accept2)
+    ++__result;
+  return __result;
+}
+compat_symbol (libc, __old_strspn_c2, __strspn_c2, GLIBC_2_1_1);
+
+size_t
+__old_strspn_c3 (const char *__s, int __accept1, int __accept2,
+		 int __accept3)
+{
+  size_t __result = 0;
+  /* Please note that __accept1 to __accept3 never can be '\0'.  */
+  while (__s[__result] == __accept1 || __s[__result] == __accept2
+	 || __s[__result] == __accept3)
+    ++__result;
+  return __result;
+}
+compat_symbol (libc, __old_strspn_c3, __strspn_c3, GLIBC_2_1_1);
+
 #endif
diff --git a/string/strspn.c b/string/strspn.c
index f0635c1..30f7747 100644
--- a/string/strspn.c
+++ b/string/strspn.c
@@ -25,23 +25,49 @@
 /* Return the length of the maximum initial segment
    of S which contains only characters in ACCEPT.  */
 size_t
-STRSPN (const char *s, const char *accept)
+STRSPN (const char *str, const char *accept)
 {
-  const char *p;
-  const char *a;
-  size_t count = 0;
-
-  for (p = s; *p != '\0'; ++p)
+  if (accept[0] == '\0')
+    return 0;
+  if (__glibc_unlikely (accept[1] == '\0'))
     {
-      for (a = accept; *a != '\0'; ++a)
-	if (*p == *a)
-	  break;
-      if (*a == '\0')
-	return count;
-      else
-	++count;
+      const char *a = str;
+      for (; *str == *accept; str++);
+      return str - a;
     }
 
-  return count;
+  /* Use multiple small memsets to enable inlining on most targets.  */
+  unsigned char table[256];
+  unsigned char *p = memset (table, 0, 64);
+  memset (p + 64, 0, 64);
+  memset (p + 128, 0, 64);
+  memset (p + 192, 0, 64);
+
+  unsigned char *s = (unsigned char*) accept;
+  /* Different from strcspn it does not add the NULL on the table
+     so can avoid check if str[i] is NULL, since table['\0'] will
+     be 0 and thus stopping the loop check.  */
+  do
+    p[*s++] = 1;
+  while (*s);
+
+  s = (unsigned char*) str;
+  if (!p[s[0]]) return 0;
+  if (!p[s[1]]) return 1;
+  if (!p[s[2]]) return 2;
+  if (!p[s[3]]) return 3;
+
+  s = (unsigned char *) ((size_t)(s) & ~3);
+  unsigned int c0, c1, c2, c3;
+  do {
+      s += 4;
+      c0 = p[s[0]];
+      c1 = p[s[1]];
+      c2 = p[s[2]];
+      c3 = p[s[3]];
+  } while ((c0 & c1 & c2 & c3) != 0);
+
+  size_t count = s - (unsigned char *) str;
+  return (c0 & c1) == 0 ? count + c0 : count + c2 + 2;
 }
 libc_hidden_builtin_def (strspn)
-- 
1.9.1

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/4] Improve generic strcspn performance
  2016-03-31 14:01 [PATCH 0/4] Improve generic strspn/strcspn/strpbrk Adhemerval Zanella
                   ` (2 preceding siblings ...)
  2016-03-31 14:01 ` [PATCH 3/4] Improve generic strpbrk performance Adhemerval Zanella
@ 2016-03-31 14:01 ` Adhemerval Zanella
  2016-03-31 14:49   ` Wilco Dijkstra
  2016-03-31 18:29 ` [PATCH 0/4] Improve generic strspn/strcspn/strpbrk Adhemerval Zanella
  4 siblings, 1 reply; 12+ messages in thread
From: Adhemerval Zanella @ 2016-03-31 14:01 UTC (permalink / raw)
  To: libc-alpha; +Cc: Wilco Dijkstra

From: Wilco Dijkstra <wdijkstr@arm.com>

Improve strcspn performance using a much faster algorithm.  It is kept simple
so it works well on most targets.  It is generally at least 10 times faster
than the existing implementation on bench-strcspn on a few AArch64
implementations, and for some tests 100 times as fast (repeatedly calling
strchr on a small string is extremely slow...).

In fact the string/bits/string2.h inlines make no longer sense, as GCC
already uses strlen if reject is an empty string, strchrnul is 5 times as
fast as __strcspn_c1, while __strcspn_c2 and __strcspn_c3 are slower than
the strcspn main loop for large strings (though reject length 2-4 could be
special cased in the future to gain even more performance).

Tested on x86_64, i686, and aarch64.

	* string/Version (libc): Add GLIBC_2.24.
	* string/strcspn.c (strcspn): Rewrite function.
	* string/bits/string2.h (strcspn): Use __builtin_strcspn.
	(__strcspn_c1): Remove inline function.
	(__strcspn_c2): Likewise.
	(__strcspn_c3): Likewise.
	* string/string-inline.c
	[SHLIB_COMPAT(libc, GLIBC_2_1_1, GLIBC_2_24)] (__strcspn_c1): Add
	compatibility symbol.
	[SHLIB_COMPAT(libc, GLIBC_2_1_1, GLIBC_2_24)] (__strcspn_c2):
	Likewise.
	[SHLIB_COMPAT(libc, GLIBC_2_1_1, GLIBC_2_24)] (__strcspn_c3):
	Likewise.
---
 ChangeLog                     | 17 ++++++++++
 string/Versions               |  2 ++
 string/bits/string2.h         | 73 ++-----------------------------------------
 string/strcspn.c              | 44 +++++++++++++++++++++-----
 string/string-inlines.c       | 40 ++++++++++++++++++++++++
 sysdeps/i386/string-inlines.c | 19 +----------
 6 files changed, 99 insertions(+), 96 deletions(-)

diff --git a/string/Versions b/string/Versions
index 59bf35a..475c1fd 100644
--- a/string/Versions
+++ b/string/Versions
@@ -80,4 +80,6 @@ libc {
   GLIBC_2.6 {
     strerror_l;
   }
+  GLIBC_2.24 {
+  }
 }
diff --git a/string/bits/string2.h b/string/bits/string2.h
index 8200ef1..a8df0db 100644
--- a/string/bits/string2.h
+++ b/string/bits/string2.h
@@ -905,77 +905,10 @@ __stpcpy_small (char *__dest,
 
 /* Return the length of the initial segment of S which
    consists entirely of characters not in REJECT.  */
-#if !defined _HAVE_STRING_ARCH_strcspn || defined _FORCE_INLINES
-# ifndef _HAVE_STRING_ARCH_strcspn
-#  if __GNUC_PREREQ (3, 2)
-#   define strcspn(s, reject) \
-  __extension__								      \
-  ({ char __r0, __r1, __r2;						      \
-     (__builtin_constant_p (reject) && __string2_1bptr_p (reject)	      \
-      ? ((__builtin_constant_p (s) && __string2_1bptr_p (s))		      \
-	 ? __builtin_strcspn (s, reject)				      \
-	 : ((__r0 = ((const char *) (reject))[0], __r0 == '\0')		      \
-	    ? strlen (s)						      \
-	    : ((__r1 = ((const char *) (reject))[1], __r1 == '\0')	      \
-	       ? __strcspn_c1 (s, __r0)					      \
-	       : ((__r2 = ((const char *) (reject))[2], __r2 == '\0')	      \
-		  ? __strcspn_c2 (s, __r0, __r1)			      \
-		  : (((const char *) (reject))[3] == '\0'		      \
-		     ? __strcspn_c3 (s, __r0, __r1, __r2)		      \
-		     : __builtin_strcspn (s, reject))))))		      \
-      : __builtin_strcspn (s, reject)); })
-#  else
-#   define strcspn(s, reject) \
-  __extension__								      \
-  ({ char __r0, __r1, __r2;						      \
-     (__builtin_constant_p (reject) && __string2_1bptr_p (reject)	      \
-      ? ((__r0 = ((const char *) (reject))[0], __r0 == '\0')		      \
-	 ? strlen (s)							      \
-	 : ((__r1 = ((const char *) (reject))[1], __r1 == '\0')		      \
-	    ? __strcspn_c1 (s, __r0)					      \
-	    : ((__r2 = ((const char *) (reject))[2], __r2 == '\0')	      \
-	       ? __strcspn_c2 (s, __r0, __r1)				      \
-	       : (((const char *) (reject))[3] == '\0'			      \
-		  ? __strcspn_c3 (s, __r0, __r1, __r2)			      \
-		  : strcspn (s, reject)))))				      \
-      : strcspn (s, reject)); })
-#  endif
+#ifndef _HAVE_STRING_ARCH_strcspn
+# if __GNUC_PREREQ (3, 2)
+#  define strcspn(s, reject) __builtin_strcspn (s, reject)
 # endif
-
-__STRING_INLINE size_t __strcspn_c1 (const char *__s, int __reject);
-__STRING_INLINE size_t
-__strcspn_c1 (const char *__s, int __reject)
-{
-  size_t __result = 0;
-  while (__s[__result] != '\0' && __s[__result] != __reject)
-    ++__result;
-  return __result;
-}
-
-__STRING_INLINE size_t __strcspn_c2 (const char *__s, int __reject1,
-				     int __reject2);
-__STRING_INLINE size_t
-__strcspn_c2 (const char *__s, int __reject1, int __reject2)
-{
-  size_t __result = 0;
-  while (__s[__result] != '\0' && __s[__result] != __reject1
-	 && __s[__result] != __reject2)
-    ++__result;
-  return __result;
-}
-
-__STRING_INLINE size_t __strcspn_c3 (const char *__s, int __reject1,
-				     int __reject2, int __reject3);
-__STRING_INLINE size_t
-__strcspn_c3 (const char *__s, int __reject1, int __reject2,
-	      int __reject3)
-{
-  size_t __result = 0;
-  while (__s[__result] != '\0' && __s[__result] != __reject1
-	 && __s[__result] != __reject2 && __s[__result] != __reject3)
-    ++__result;
-  return __result;
-}
 #endif
 
 
diff --git a/string/strcspn.c b/string/strcspn.c
index 8888919..89ba4ca 100644
--- a/string/strcspn.c
+++ b/string/strcspn.c
@@ -26,16 +26,44 @@
 /* Return the length of the maximum initial segment of S
    which contains no characters from REJECT.  */
 size_t
-STRCSPN (const char *s, const char *reject)
+STRCSPN (const char *str, const char *reject)
 {
-  size_t count = 0;
+  if (reject[0] == '\0' || reject[1] == '\0')
+    return __strchrnul (str, reject [0]) - str;
 
-  while (*s != '\0')
-    if (strchr (reject, *s++) == NULL)
-      ++count;
-    else
-      return count;
+  /* Use multiple small memsets to enable inlining on most targets.  */
+  unsigned char table[256];
+  unsigned char *p = memset (table, 0, 64);
+  memset (p + 64, 0, 64);
+  memset (p + 128, 0, 64);
+  memset (p + 192, 0, 64);
 
-  return count;
+  unsigned char *s = (unsigned char*) reject;
+  unsigned char tmp;
+  do
+    p[tmp = *s++] = 1;
+  while (tmp);
+
+  s = (unsigned char*) str;
+  if (p[s[0]]) return 0;
+  if (p[s[1]]) return 1;
+  if (p[s[2]]) return 2;
+  if (p[s[3]]) return 3;
+
+  s = (unsigned char *) ((size_t)s & ~3);
+
+  unsigned int c0, c1, c2, c3;
+  do
+    {
+      s += 4;
+      c0 = p[s[0]];
+      c1 = p[s[1]];
+      c2 = p[s[2]];
+      c3 = p[s[3]];
+    }
+  while ((c0 | c1 | c2 | c3) == 0);
+
+  size_t count = s - (unsigned char *) str;
+  return (c0 | c1) != 0 ? count - c0 + 1 : count - c2 + 3;
 }
 libc_hidden_builtin_def (strcspn)
diff --git a/string/string-inlines.c b/string/string-inlines.c
index 16db3ea..83bdd6c 100644
--- a/string/string-inlines.c
+++ b/string/string-inlines.c
@@ -32,3 +32,43 @@
 #undef __NO_INLINE__
 #include <bits/string.h>
 #include <bits/string2.h>
+
+#include "shlib-compat.h"
+
+#if SHLIB_COMPAT (libc, GLIBC_2_1_1, GLIBC_2_24)
+/* The inline functions are not used from GLIBC 2.24 and forward, however
+   they are required to provide the symbols through string-inlines.c
+   (if inlining is not possible for compatibility reasons).  */
+size_t
+__old_strcspn_c1 (const char *__s, int __reject)
+{
+  size_t __result = 0;
+  while (__s[__result] != '\0' && __s[__result] != __reject)
+    ++__result;
+  return __result;
+}
+compat_symbol (libc, __old_strcspn_c1, __strcspn_c1, GLIBC_2_1_1);
+
+size_t
+__old_strcspn_c2 (const char *__s, int __reject1, int __reject2)
+{
+  size_t __result = 0;
+  while (__s[__result] != '\0' && __s[__result] != __reject1
+	 && __s[__result] != __reject2)
+    ++__result;
+  return __result;
+}
+compat_symbol (libc, __old_strcspn_c2, __strcspn_c2, GLIBC_2_1_1);
+
+size_t
+__old_strcspn_c3 (const char *__s, int __reject1, int __reject2,
+	      int __reject3)
+{
+  size_t __result = 0;
+  while (__s[__result] != '\0' && __s[__result] != __reject1
+	 && __s[__result] != __reject2 && __s[__result] != __reject3)
+    ++__result;
+  return __result;
+}
+compat_symbol (libc, __old_strcspn_c3, __strcspn_c3, GLIBC_2_1_1);
+#endif
diff --git a/sysdeps/i386/string-inlines.c b/sysdeps/i386/string-inlines.c
index c7de270..64d80e8 100644
--- a/sysdeps/i386/string-inlines.c
+++ b/sysdeps/i386/string-inlines.c
@@ -15,27 +15,10 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/*  <bits/string.h> and <bits/string2.h> declare some extern inline
-    functions.  These functions are declared additionally here if
-    inlining is not possible.  */
-
-#undef __USE_STRING_INLINES
-#define __USE_STRING_INLINES
-#define _FORCE_INLINES
-#define __STRING_INLINE /* empty */
-#define __NO_INLINE__
-
 /* This is to avoid PLT entries for the x86 version.  */
 #define __memcpy_g __memcpy_g_internal
 #define __strchr_g __strchr_g_internal
-
-#include <string.h>
-#undef index
-#undef rindex
-
-#undef __NO_INLINE__
-#include <bits/string.h>
-#include <bits/string2.h>
+#include <string/string-inlines.c>
 
 void *
 (__memcpy_c) (void *d, const void *s, size_t n)
-- 
1.9.1

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 3/4] Improve generic strpbrk performance
  2016-03-31 14:01 [PATCH 0/4] Improve generic strspn/strcspn/strpbrk Adhemerval Zanella
  2016-03-31 14:01 ` [PATCH 4/4] Remove powerpc64 strspn, strcspn, and strpbrk implementation Adhemerval Zanella
  2016-03-31 14:01 ` [PATCH 2/4] Improve generic strspn performance Adhemerval Zanella
@ 2016-03-31 14:01 ` Adhemerval Zanella
  2016-03-31 14:01 ` [PATCH 1/4] Improve generic strcspn performance Adhemerval Zanella
  2016-03-31 18:29 ` [PATCH 0/4] Improve generic strspn/strcspn/strpbrk Adhemerval Zanella
  4 siblings, 0 replies; 12+ messages in thread
From: Adhemerval Zanella @ 2016-03-31 14:01 UTC (permalink / raw)
  To: libc-alpha

With now a faster strcspn implementation, it is also faster to just use
it with some return tests than reimplementing strpbrk itself.
As for strcspn optimization, it is generally at least 10 times faster
than the existing implementation on bench-strspn on a few AArch64
implementations.

Also the string/bits/string2.h inlines make no longer sense, as current
implementation will already implement most of the optimizations.

Tested on x86_64, i386, and aarch64.

	* string/strpbrk.c (strpbrk): Rewrite function.
	* string/bits/string2.h (strpbrk): Use __builtin_strpbrk.
	(__strpbrk_c2): Likewise.
	(__strpbrk_c3): Likewise.
	* string/string-inlines.c
	[SHLIB_COMPAT(libc, GLIBC_2_1_1, GLIBC_2_24)] (__strpbrk_c2):
	Likewise.
	[SHLIB_COMPAT(libc, GLIBC_2_1_1, GLIBC_2_24)] (__strpbrk_c3):
	Likewise.
---
 ChangeLog               | 10 ++++++++
 string/bits/string2.h   | 61 +++----------------------------------------------
 string/string-inlines.c | 21 +++++++++++++++++
 string/strpbrk.c        | 12 ++--------
 4 files changed, 36 insertions(+), 68 deletions(-)

diff --git a/string/bits/string2.h b/string/bits/string2.h
index 75a66a1..5a138dc 100644
--- a/string/bits/string2.h
+++ b/string/bits/string2.h
@@ -922,65 +922,10 @@ __stpcpy_small (char *__dest,
 
 
 /* Find the first occurrence in S of any character in ACCEPT.  */
-#if !defined _HAVE_STRING_ARCH_strpbrk || defined _FORCE_INLINES
-# ifndef _HAVE_STRING_ARCH_strpbrk
-#  if __GNUC_PREREQ (3, 2)
-#   define strpbrk(s, accept) \
-  __extension__								      \
-  ({ char __a0, __a1, __a2;						      \
-     (__builtin_constant_p (accept) && __string2_1bptr_p (accept)	      \
-      ? ((__builtin_constant_p (s) && __string2_1bptr_p (s))		      \
-	 ? __builtin_strpbrk (s, accept)				      \
-	 : ((__a0 = ((const char  *) (accept))[0], __a0 == '\0')	      \
-	    ? ((void) (s), (char *) NULL)				      \
-	    : ((__a1 = ((const char *) (accept))[1], __a1 == '\0')	      \
-	       ? __builtin_strchr (s, __a0)				      \
-	       : ((__a2 = ((const char *) (accept))[2], __a2 == '\0')	      \
-		  ? __strpbrk_c2 (s, __a0, __a1)			      \
-		  : (((const char *) (accept))[3] == '\0'		      \
-		     ? __strpbrk_c3 (s, __a0, __a1, __a2)		      \
-		     : __builtin_strpbrk (s, accept))))))		      \
-      : __builtin_strpbrk (s, accept)); })
-#  else
-#   define strpbrk(s, accept) \
-  __extension__								      \
-  ({ char __a0, __a1, __a2;						      \
-     (__builtin_constant_p (accept) && __string2_1bptr_p (accept)	      \
-      ? ((__a0 = ((const char  *) (accept))[0], __a0 == '\0')		      \
-	 ? ((void) (s), (char *) NULL)					      \
-	 : ((__a1 = ((const char *) (accept))[1], __a1 == '\0')		      \
-	    ? strchr (s, __a0)						      \
-	    : ((__a2 = ((const char *) (accept))[2], __a2 == '\0')	      \
-	       ? __strpbrk_c2 (s, __a0, __a1)				      \
-	       : (((const char *) (accept))[3] == '\0'			      \
-		  ? __strpbrk_c3 (s, __a0, __a1, __a2)			      \
-		  : strpbrk (s, accept)))))				      \
-      : strpbrk (s, accept)); })
-#  endif
+#ifndef _HAVE_STRING_ARCH_strpbrk
+# if __GNUC_PREREQ (3, 2)
+#  define strpbrk(s, accept) __builtin_strpbrk (s, accept)
 # endif
-
-__STRING_INLINE char *__strpbrk_c2 (const char *__s, int __accept1,
-				    int __accept2);
-__STRING_INLINE char *
-__strpbrk_c2 (const char *__s, int __accept1, int __accept2)
-{
-  /* Please note that __accept1 and __accept2 never can be '\0'.  */
-  while (*__s != '\0' && *__s != __accept1 && *__s != __accept2)
-    ++__s;
-  return *__s == '\0' ? NULL : (char *) (size_t) __s;
-}
-
-__STRING_INLINE char *__strpbrk_c3 (const char *__s, int __accept1,
-				    int __accept2, int __accept3);
-__STRING_INLINE char *
-__strpbrk_c3 (const char *__s, int __accept1, int __accept2, int __accept3)
-{
-  /* Please note that __accept1 to __accept3 never can be '\0'.  */
-  while (*__s != '\0' && *__s != __accept1 && *__s != __accept2
-	 && *__s != __accept3)
-    ++__s;
-  return *__s == '\0' ? NULL : (char *) (size_t) __s;
-}
 #endif
 
 
diff --git a/string/string-inlines.c b/string/string-inlines.c
index 754b315..0648376 100644
--- a/string/string-inlines.c
+++ b/string/string-inlines.c
@@ -107,4 +107,25 @@ __old_strspn_c3 (const char *__s, int __accept1, int __accept2,
 }
 compat_symbol (libc, __old_strspn_c3, __strspn_c3, GLIBC_2_1_1);
 
+char *
+__strpbrk_c2 (const char *__s, int __accept1, int __accept2)
+{
+  /* Please note that __accept1 and __accept2 never can be '\0'.  */
+  while (*__s != '\0' && *__s != __accept1 && *__s != __accept2)
+    ++__s;
+  return *__s == '\0' ? NULL : (char *) (size_t) __s;
+}
+compat_symbol (libc, __old_strpbrk_c2, __strpbrk_c2, GLIBC_2_1_1);
+
+char *
+__strpbrk_c3 (const char *__s, int __accept1, int __accept2, int __accept3)
+{
+  /* Please note that __accept1 to __accept3 never can be '\0'.  */
+  while (*__s != '\0' && *__s != __accept1 && *__s != __accept2
+	 && *__s != __accept3)
+    ++__s;
+  return *__s == '\0' ? NULL : (char *) (size_t) __s;
+}
+compat_symbol (libc, __old_strpbrk_c3, __strpbrk_c3, GLIBC_2_1_1);
+
 #endif
diff --git a/string/strpbrk.c b/string/strpbrk.c
index fddd473..1ede719 100644
--- a/string/strpbrk.c
+++ b/string/strpbrk.c
@@ -27,15 +27,7 @@
 char *
 STRPBRK (const char *s, const char *accept)
 {
-  while (*s != '\0')
-    {
-      const char *a = accept;
-      while (*a != '\0')
-	if (*a++ == *s)
-	  return (char *) s;
-      ++s;
-    }
-
-  return NULL;
+  s += strcspn (s, accept);
+  return *s ? (char *)s : NULL;
 }
 libc_hidden_builtin_def (strpbrk)
-- 
1.9.1

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/4] Improve generic strcspn performance
  2016-03-31 14:01 ` [PATCH 1/4] Improve generic strcspn performance Adhemerval Zanella
@ 2016-03-31 14:49   ` Wilco Dijkstra
  2016-03-31 18:28     ` Adhemerval Zanella
  0 siblings, 1 reply; 12+ messages in thread
From: Wilco Dijkstra @ 2016-03-31 14:49 UTC (permalink / raw)
  To: Adhemerval Zanella, libc-alpha; +Cc: nd

Adhemerval Zanella wrote:

 size_t
-STRCSPN (const char *s, const char *reject)
+STRCSPN (const char *str, const char *reject)
 {
-  size_t count = 0;
+  if (reject[0] == '\0' || reject[1] == '\0')
+    return __strchrnul (str, reject [0]) - str;

I agree using __strchrnul is fine if reject[0] == '\0' as that should be very rare.
However it needs __glibc_unlikely again to avoid slowing down the common case.
It's odd that GCC doesn't predict that the first character(s) of a string are non-zero...

Other than that, it looks good.

Wilco

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/4] Improve generic strcspn performance
  2016-03-31 14:49   ` Wilco Dijkstra
@ 2016-03-31 18:28     ` Adhemerval Zanella
  0 siblings, 0 replies; 12+ messages in thread
From: Adhemerval Zanella @ 2016-03-31 18:28 UTC (permalink / raw)
  To: Wilco Dijkstra, libc-alpha; +Cc: nd



On 31-03-2016 11:49, Wilco Dijkstra wrote:
> Adhemerval Zanella wrote:
> 
>  size_t
> -STRCSPN (const char *s, const char *reject)
> +STRCSPN (const char *str, const char *reject)
>  {
> -  size_t count = 0;
> +  if (reject[0] == '\0' || reject[1] == '\0')
> +    return __strchrnul (str, reject [0]) - str;
> 
> I agree using __strchrnul is fine if reject[0] == '\0' as that should be very rare.
> However it needs __glibc_unlikely again to avoid slowing down the common case.
> It's odd that GCC doesn't predict that the first character(s) of a string are non-zero...
> 
> Other than that, it looks good.
> 
> Wilco
> 

Right, I will change that.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4] Improve generic strspn/strcspn/strpbrk
  2016-03-31 14:01 [PATCH 0/4] Improve generic strspn/strcspn/strpbrk Adhemerval Zanella
                   ` (3 preceding siblings ...)
  2016-03-31 14:01 ` [PATCH 1/4] Improve generic strcspn performance Adhemerval Zanella
@ 2016-03-31 18:29 ` Adhemerval Zanella
  4 siblings, 0 replies; 12+ messages in thread
From: Adhemerval Zanella @ 2016-03-31 18:29 UTC (permalink / raw)
  To: libc-alpha

Besides latest Wilco's reviews regarding the __glibc_expect for
strcspn and Richard's note about using uintptr_t instead of
size_t for pointer manipulation, is there any more review
about this set? If not I will commit it this in upcoming hours.

On 31-03-2016 11:00, Adhemerval Zanella wrote:
> Changes from previous version:
> 
>  * Add some optimization on strspn implementation from Wilco
>    comments.
> 
>  * Move the inline __str{spn,cspn,brk}_* symbol from the string2.h
>    header to string-inlike and added compatibility version since
>    they will not be generated anymore.
> 
> ---
> 
> 
> This is a followup on previous Wilco patch [1] to optimize strcspn
> that also optimizes the generic strspn and strpbrk.  I used the
> same strategy Wilco has used on strcspn for strspn and rewrote
> strpbrk to just use strcspn instead.
> 
> I also tried to play with compiler options to check if it could
> omit the memset call generation when initializing the lookup
> table, but without success.  This is a similar strategy I used
> on powerpc64 str{c}spn optimization.
> 
> Wilco initial approach was to remove the __strcspn_c{1,2,3}
> inline function in string2.h header, however they are part of
> ABI (to support compilers that do not inline the calls) and it
> is not safe to remove then.  I have added it back, although the
> strcspn new macro does not uses them and I also used the same
> strategy for both strspn and strpbrk.
> 
> Performance-wise the algorithm is similar with current optimized
> assembly one already in GLIBC (x86 and powerpc).  In fact, for
> powerpc64 the algorithm performance is similar to assembly
> routines which lead me to remove them.  i686 default one
> is slight faster, while the SSE4.1 variant shows much better
> performance (through the use of SIMD instructions).
> 
> [1] https://sourceware.org/ml/libc-alpha/2016-01/msg00173.html
> 
> Adhemerval Zanella (3):
>   Improve generic strspn performance
>   Improve generic strpbrk performance
>   Remove powerpc64 strspn, strcspn, and strpbrk implementation
> 
> Wilco Dijkstra (1):
>   Improve generic strcspn performance
> 
>  ChangeLog                           |  46 ++++++++
>  string/Versions                     |   2 +
>  string/bits/string2.h               | 208 ++----------------------------------
>  string/strcspn.c                    |  44 ++++++--
>  string/string-inlines.c             |  97 +++++++++++++++++
>  string/strpbrk.c                    |  12 +--
>  string/strspn.c                     |  54 +++++++---
>  sysdeps/i386/string-inlines.c       |  19 +---
>  sysdeps/powerpc/powerpc64/strcspn.S | 127 ----------------------
>  sysdeps/powerpc/powerpc64/strpbrk.S | 135 -----------------------
>  sysdeps/powerpc/powerpc64/strspn.S  | 144 -------------------------
>  11 files changed, 233 insertions(+), 655 deletions(-)
>  delete mode 100644 sysdeps/powerpc/powerpc64/strcspn.S
>  delete mode 100644 sysdeps/powerpc/powerpc64/strpbrk.S
>  delete mode 100644 sysdeps/powerpc/powerpc64/strspn.S
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/4] Improve generic strspn performance
  2016-03-28 15:20 ` [PATCH 2/4] Improve generic strspn performance Adhemerval Zanella
@ 2016-03-29 20:32   ` Tulio Magno Quites Machado Filho
  0 siblings, 0 replies; 12+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2016-03-29 20:32 UTC (permalink / raw)
  To: Adhemerval Zanella, libc-alpha

Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:

> As for strcspn, this patch improves strspn performance using a much
> faster algorithm.  It first constructs a 256-entry table based on
> the accept string and then uses it as a lookup table for the
> input string.  As for strcspn optimization, it is generally at least
> 10 times faster than the existing implementation on bench-strspn
> on a few AArch64 implementations.
>
> Also the string/bits/string2.h inlines make no longer sense, as current
> implementation will already implement most of the optimizations.
>
> Tested on x86_64, i686, and aarch64.

And on powerpc64 and powerpc64le.

Git reported some trailing whitespaces in this patch.

> diff --git a/string/strspn.c b/string/strspn.c
> index f0635c1..0547f41 100644
> --- a/string/strspn.c
> +++ b/string/strspn.c
> @@ -25,23 +25,49 @@
>  /* Return the length of the maximum initial segment
>     of S which contains only characters in ACCEPT.  */
>  size_t
> -STRSPN (const char *s, const char *accept)
> +STRSPN (const char *str, const char *accept)
>  {
> -  const char *p;
> -  const char *a;
> -  size_t count = 0;
> -
> -  for (p = s; *p != '\0'; ++p)
> -    {
> -      for (a = accept; *a != '\0'; ++a)
> -	if (*p == *a)
> -	  break;
> -      if (*a == '\0')
> -	return count;
> -      else
> -	++count;
> +  if (accept[0] == '\0')
> +    return 0;
> +  if (accept[1] == '\0')
> +    { 

Here.

> +      const char *a = str;
> +      for (; *str == *accept; str++);
> +      return str - a;
>      }
>
> -  return count;
> +  /* Use multiple small memsets to enable inlining on most targets.  */
> +  unsigned char table[256];
> +  unsigned char *p = memset (table, 0, 64);
> +  memset (p + 64, 0, 64);
> +  memset (p + 128, 0, 64);
> +  memset (p + 192, 0, 64);
> +
> +  unsigned char *s = (unsigned char*) accept;
> +  /* Different from strcspn it does not add the NULL on the table
> +     so can avoid check if str[i] is NULL, since table['\0'] will
> +     be 0 and thus stopping the loop check.  */
> +  do
> +    p[*s++] = 1;
> +  while (*s);
> +
> +  s = (unsigned char*) str;
> +  if (!p[s[0]]) return 0;
> +  if (!p[s[1]]) return 1;
> +  if (!p[s[2]]) return 2;
> +  if (!p[s[3]]) return 3;
> +          

Here.

> +  s = (unsigned char *) ((size_t)(s) & ~3);
> +  unsigned int c0, c1, c2, c3; 

And here.

-- 
Tulio Magno

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/4] Improve generic strspn performance
  2016-03-29 13:02   ` [PATCH 2/4] Improve generic strspn performance Wilco Dijkstra
@ 2016-03-29 14:08     ` Adhemerval Zanella
  0 siblings, 0 replies; 12+ messages in thread
From: Adhemerval Zanella @ 2016-03-29 14:08 UTC (permalink / raw)
  To: Wilco Dijkstra, libc-alpha; +Cc: nd



On 29-03-2016 10:02, Wilco Dijkstra wrote:
> Adhemerval Zanella wrote:
> 
>> +  if (accept[0] == '\0')
>> +    return 0;
>> +  if (accept[1] == '\0')
>> +    { 
> 
> GCC doesn't get the static branch prediction correct for the 2nd if,
> so it would be useful to add __glibc_unlikely given that single-character
> accepts are rare.
> 
>> +  s = (unsigned char *) ((size_t)(s) & ~3);
>> +  unsigned int c0, c1, c2, c3; 
>> +  do {
>> +      s += 4;
>> +      c0 = p[s[0]];
>> +      c1 = p[s[1]];
>> +      c2 = p[s[2]];
>> +      c3 = p[s[3]];
>> +  } while ((c0 && c1 && c2 && c3) == 1);
> 
> That should use '&' rather than '&&' and '!= 0' similar to how I did it in strcspn.
> This will use 3 AND(S) instructions and a single branch.
> 
>> +
>> +  size_t count = s - (unsigned char *) str;
>> +  return (c0 && c1) == 0 ? count - !c0 + 1 : count - !c2 + 3;
> 
> Again, c0 & c1 is better and allows CSE with the while expression above.
> Also -!c0 +1 is equivalent to c0, -!c2 + 3 is equivalent to c2 + 2 - this is simpler
> and faster.
> 
> Otherwise it looks good, and thanks for doing this one too!
> 
> Cheers,
> Wilco
> 

Thanks for the review.  I think this version fixes the points you noted:

diff --git a/string/strspn.c b/string/strspn.c
index f0635c1..15d7fa7 100644
--- a/string/strspn.c
+++ b/string/strspn.c
@@ -25,23 +25,49 @@
 /* Return the length of the maximum initial segment
    of S which contains only characters in ACCEPT.  */
 size_t
-STRSPN (const char *s, const char *accept)
+STRSPN (const char *str, const char *accept)
 {
-  const char *p;
-  const char *a;
-  size_t count = 0;
-
-  for (p = s; *p != '\0'; ++p)
-    {
-      for (a = accept; *a != '\0'; ++a)
-	if (*p == *a)
-	  break;
-      if (*a == '\0')
-	return count;
-      else
-	++count;
+  if (accept[0] == '\0')
+    return 0;
+  if (__glibc_unlikely (accept[1] == '\0'))
+    { 
+      const char *a = str;
+      for (; *str == *accept; str++);
+      return str - a;
     }
 
-  return count;
+  /* Use multiple small memsets to enable inlining on most targets.  */
+  unsigned char table[256];
+  unsigned char *p = memset (table, 0, 64);
+  memset (p + 64, 0, 64);
+  memset (p + 128, 0, 64);
+  memset (p + 192, 0, 64);
+
+  unsigned char *s = (unsigned char*) accept;
+  /* Different from strcspn it does not add the NULL on the table
+     so can avoid check if str[i] is NULL, since table['\0'] will
+     be 0 and thus stopping the loop check.  */
+  do
+    p[*s++] = 1;
+  while (*s);
+
+  s = (unsigned char*) str;
+  if (!p[s[0]]) return 0;
+  if (!p[s[1]]) return 1;
+  if (!p[s[2]]) return 2;
+  if (!p[s[3]]) return 3;
+          
+  s = (unsigned char *) ((size_t)(s) & ~3);
+  unsigned int c0, c1, c2, c3; 
+  do {
+      s += 4;
+      c0 = p[s[0]];
+      c1 = p[s[1]];
+      c2 = p[s[2]];
+      c3 = p[s[3]];
+  } while ((c0 & c1 & c2 & c3) != 0);
+
+  size_t count = s - (unsigned char *) str;
+  return (c0 & c1) == 0 ? count + c0 : count + c2 + 2;
 }
 libc_hidden_builtin_def (strspn)

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/4] Improve generic strspn performance
  2016-03-28 15:20 ` [PATCH 1/4] Improve generic strcspn performance Adhemerval Zanella
@ 2016-03-29 13:02   ` Wilco Dijkstra
  2016-03-29 14:08     ` Adhemerval Zanella
  0 siblings, 1 reply; 12+ messages in thread
From: Wilco Dijkstra @ 2016-03-29 13:02 UTC (permalink / raw)
  To: Adhemerval Zanella, libc-alpha; +Cc: nd

Adhemerval Zanella wrote:

> +  if (accept[0] == '\0')
> +    return 0;
> +  if (accept[1] == '\0')
> +    { 

GCC doesn't get the static branch prediction correct for the 2nd if,
so it would be useful to add __glibc_unlikely given that single-character
accepts are rare.

> +  s = (unsigned char *) ((size_t)(s) & ~3);
> +  unsigned int c0, c1, c2, c3; 
> +  do {
> +      s += 4;
> +      c0 = p[s[0]];
> +      c1 = p[s[1]];
> +      c2 = p[s[2]];
> +      c3 = p[s[3]];
> +  } while ((c0 && c1 && c2 && c3) == 1);

That should use '&' rather than '&&' and '!= 0' similar to how I did it in strcspn.
This will use 3 AND(S) instructions and a single branch.

> +
> +  size_t count = s - (unsigned char *) str;
> +  return (c0 && c1) == 0 ? count - !c0 + 1 : count - !c2 + 3;

Again, c0 & c1 is better and allows CSE with the while expression above.
Also -!c0 +1 is equivalent to c0, -!c2 + 3 is equivalent to c2 + 2 - this is simpler
and faster.

Otherwise it looks good, and thanks for doing this one too!

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 2/4] Improve generic strspn performance
  2016-03-28 15:20 Adhemerval Zanella
@ 2016-03-28 15:20 ` Adhemerval Zanella
  2016-03-29 20:32   ` Tulio Magno Quites Machado Filho
  2016-03-28 15:20 ` [PATCH 1/4] Improve generic strcspn performance Adhemerval Zanella
  1 sibling, 1 reply; 12+ messages in thread
From: Adhemerval Zanella @ 2016-03-28 15:20 UTC (permalink / raw)
  To: libc-alpha

As for strcspn, this patch improves strspn performance using a much
faster algorithm.  It first constructs a 256-entry table based on
the accept string and then uses it as a lookup table for the
input string.  As for strcspn optimization, it is generally at least
10 times faster than the existing implementation on bench-strspn
on a few AArch64 implementations.

Also the string/bits/string2.h inlines make no longer sense, as current
implementation will already implement most of the optimizations.

Tested on x86_64, i686, and aarch64.

	* string/strspn.c (strspn): Rewrite function.
	* string/bits/string2.h (strspn): Use __builtin_strcspn.
---
 ChangeLog             |  5 +++++
 string/bits/string2.h | 41 ++++++-------------------------------
 string/strspn.c       | 56 +++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 52 insertions(+), 50 deletions(-)

diff --git a/string/bits/string2.h b/string/bits/string2.h
index 1b87686..a1684eb 100644
--- a/string/bits/string2.h
+++ b/string/bits/string2.h
@@ -952,43 +952,14 @@ __strcspn_c3 (const char *__s, int __reject1, int __reject2,
 
 /* Return the length of the initial segment of S which
    consists entirely of characters in ACCEPT.  */
-#if !defined _HAVE_STRING_ARCH_strspn || defined _FORCE_INLINES
-# ifndef _HAVE_STRING_ARCH_strspn
-#  if __GNUC_PREREQ (3, 2)
-#   define strspn(s, accept) \
-  __extension__								      \
-  ({ char __a0, __a1, __a2;						      \
-     (__builtin_constant_p (accept) && __string2_1bptr_p (accept)	      \
-      ? ((__builtin_constant_p (s) && __string2_1bptr_p (s))		      \
-	 ? __builtin_strspn (s, accept)					      \
-	 : ((__a0 = ((const char *) (accept))[0], __a0 == '\0')		      \
-	    ? ((void) (s), (size_t) 0)					      \
-	    : ((__a1 = ((const char *) (accept))[1], __a1 == '\0')	      \
-	       ? __strspn_c1 (s, __a0)					      \
-	       : ((__a2 = ((const char *) (accept))[2], __a2 == '\0')	      \
-		  ? __strspn_c2 (s, __a0, __a1)				      \
-		  : (((const char *) (accept))[3] == '\0'		      \
-		     ? __strspn_c3 (s, __a0, __a1, __a2)		      \
-		     : __builtin_strspn (s, accept))))))		      \
-      : __builtin_strspn (s, accept)); })
-#  else
-#   define strspn(s, accept) \
-  __extension__								      \
-  ({ char __a0, __a1, __a2;						      \
-     (__builtin_constant_p (accept) && __string2_1bptr_p (accept)	      \
-      ? ((__a0 = ((const char *) (accept))[0], __a0 == '\0')		      \
-	 ? ((void) (s), (size_t) 0)					      \
-	 : ((__a1 = ((const char *) (accept))[1], __a1 == '\0')		      \
-	    ? __strspn_c1 (s, __a0)					      \
-	    : ((__a2 = ((const char *) (accept))[2], __a2 == '\0')	      \
-	       ? __strspn_c2 (s, __a0, __a1)				      \
-	       : (((const char *) (accept))[3] == '\0'			      \
-		  ? __strspn_c3 (s, __a0, __a1, __a2)			      \
-		  : strspn (s, accept)))))				      \
-      : strspn (s, accept)); })
-#  endif
+#ifndef _HAVE_STRING_ARCH_strspn
+# if __GNUC_PREREQ (3, 2)
+#  define strspn(s, accept) __builtin_strspn (s, accept)
 # endif
 
+/* The inline functions are not used from GLIBC 2.24 and forward, however
+   they are required to provide the symbols through string-inlines.c
+   (if inlining is not possible for compatibility reasons).  */
 __STRING_INLINE size_t __strspn_c1 (const char *__s, int __accept);
 __STRING_INLINE size_t
 __strspn_c1 (const char *__s, int __accept)
diff --git a/string/strspn.c b/string/strspn.c
index f0635c1..0547f41 100644
--- a/string/strspn.c
+++ b/string/strspn.c
@@ -25,23 +25,49 @@
 /* Return the length of the maximum initial segment
    of S which contains only characters in ACCEPT.  */
 size_t
-STRSPN (const char *s, const char *accept)
+STRSPN (const char *str, const char *accept)
 {
-  const char *p;
-  const char *a;
-  size_t count = 0;
-
-  for (p = s; *p != '\0'; ++p)
-    {
-      for (a = accept; *a != '\0'; ++a)
-	if (*p == *a)
-	  break;
-      if (*a == '\0')
-	return count;
-      else
-	++count;
+  if (accept[0] == '\0')
+    return 0;
+  if (accept[1] == '\0')
+    { 
+      const char *a = str;
+      for (; *str == *accept; str++);
+      return str - a;
     }
 
-  return count;
+  /* Use multiple small memsets to enable inlining on most targets.  */
+  unsigned char table[256];
+  unsigned char *p = memset (table, 0, 64);
+  memset (p + 64, 0, 64);
+  memset (p + 128, 0, 64);
+  memset (p + 192, 0, 64);
+
+  unsigned char *s = (unsigned char*) accept;
+  /* Different from strcspn it does not add the NULL on the table
+     so can avoid check if str[i] is NULL, since table['\0'] will
+     be 0 and thus stopping the loop check.  */
+  do
+    p[*s++] = 1;
+  while (*s);
+
+  s = (unsigned char*) str;
+  if (!p[s[0]]) return 0;
+  if (!p[s[1]]) return 1;
+  if (!p[s[2]]) return 2;
+  if (!p[s[3]]) return 3;
+          
+  s = (unsigned char *) ((size_t)(s) & ~3);
+  unsigned int c0, c1, c2, c3; 
+  do {
+      s += 4;
+      c0 = p[s[0]];
+      c1 = p[s[1]];
+      c2 = p[s[2]];
+      c3 = p[s[3]];
+  } while ((c0 && c1 && c2 && c3) == 1);
+
+  size_t count = s - (unsigned char *) str;
+  return (c0 && c1) == 0 ? count - !c0 + 1 : count - !c2 + 3;
 }
 libc_hidden_builtin_def (strspn)
-- 
1.9.1

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2016-03-31 18:29 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-31 14:01 [PATCH 0/4] Improve generic strspn/strcspn/strpbrk Adhemerval Zanella
2016-03-31 14:01 ` [PATCH 4/4] Remove powerpc64 strspn, strcspn, and strpbrk implementation Adhemerval Zanella
2016-03-31 14:01 ` [PATCH 2/4] Improve generic strspn performance Adhemerval Zanella
2016-03-31 14:01 ` [PATCH 3/4] Improve generic strpbrk performance Adhemerval Zanella
2016-03-31 14:01 ` [PATCH 1/4] Improve generic strcspn performance Adhemerval Zanella
2016-03-31 14:49   ` Wilco Dijkstra
2016-03-31 18:28     ` Adhemerval Zanella
2016-03-31 18:29 ` [PATCH 0/4] Improve generic strspn/strcspn/strpbrk Adhemerval Zanella
  -- strict thread matches above, loose matches on Subject: below --
2016-03-28 15:20 Adhemerval Zanella
2016-03-28 15:20 ` [PATCH 2/4] Improve generic strspn performance Adhemerval Zanella
2016-03-29 20:32   ` Tulio Magno Quites Machado Filho
2016-03-28 15:20 ` [PATCH 1/4] Improve generic strcspn performance Adhemerval Zanella
2016-03-29 13:02   ` [PATCH 2/4] Improve generic strspn performance Wilco Dijkstra
2016-03-29 14:08     ` Adhemerval Zanella

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).