[PATCH v2 03/16] Improve generic strlen

public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed

* [PATCH v2 03/16] Improve generic strlen
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
                   ` (10 preceding siblings ...)
  2016-12-21 23:06 ` [PATCH v2 12/16] hppa: Add string-fzb.h and string-fzi.h Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 05/16] Improve generic memchr Richard Henderson
                   ` (3 subsequent siblings)
  15 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

Extract has_zero and index_first_zero tests into headers that
can be tailored for the architecture.

	[BZ #5806]
    	* sysdeps/generic/string-fza.h: New file.
    	* sysdeps/generic/string-fzb.h: New file.
    	* sysdeps/generic/string-fzi.h: New file.
    	* sysdeps/generic/string-extbyte.h: New file.
    	* string/strlen.c: Use them.
---
 string/strlen.c                  |  89 ++++++------------------
 sysdeps/generic/string-extbyte.h |  35 ++++++++++
 sysdeps/generic/string-fza.h     | 117 +++++++++++++++++++++++++++++++
 sysdeps/generic/string-fzb.h     |  49 +++++++++++++
 sysdeps/generic/string-fzi.h     | 146 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 369 insertions(+), 67 deletions(-)
 create mode 100644 sysdeps/generic/string-extbyte.h
 create mode 100644 sysdeps/generic/string-fza.h
 create mode 100644 sysdeps/generic/string-fzb.h
 create mode 100644 sysdeps/generic/string-fzi.h

diff --git a/string/strlen.c b/string/strlen.c
index 4943ce2..4aa95d5 100644
--- a/string/strlen.c
+++ b/string/strlen.c
@@ -20,90 +20,45 @@
 
 #include <string.h>
 #include <stdlib.h>
+#include <stdint.h>
+#include <string-fzb.h>
+#include <string-fzi.h>
 
 #undef strlen
 
-#ifndef STRLEN
-# define STRLEN strlen
+#ifdef STRLEN
+# define strlen STRLEN
 #endif
 
 /* Return the length of the null-terminated string STR.  Scan for
    the null terminator quickly by testing four bytes at a time.  */
 size_t
-STRLEN (const char *str)
+strlen (const char *str)
 {
-  const char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, himagic, lomagic;
+  const char *char_ptr = str;
+  const op_t *word_ptr;
+  op_t word;
+  uintptr_t i, align;
 
   /* Handle the first few characters by reading one character at a time.
      Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = str; ((unsigned long int) char_ptr
-			& (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
+  align = -(uintptr_t)char_ptr % sizeof(word);
+  for (i = 0; i < align; ++i, ++char_ptr)
     if (*char_ptr == '\0')
       return char_ptr - str;
 
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  himagic = 0x80808080L;
-  lomagic = 0x01010101L;
-  if (sizeof (longword) > 4)
+  word_ptr = (const op_t *) char_ptr;
+  do
     {
-      /* 64-bit version of the magic.  */
-      /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-      himagic = ((himagic << 16) << 16) | himagic;
-      lomagic = ((lomagic << 16) << 16) | lomagic;
+      word = *word_ptr++;
     }
-  if (sizeof (longword) > 8)
-    abort ();
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  for (;;)
-    {
-      longword = *longword_ptr++;
+  while (!has_zero (word));
 
-      if (((longword - lomagic) & ~longword & himagic) != 0)
-	{
-	  /* Which of the bytes was the zero?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const char *cp = (const char *) (longword_ptr - 1);
-
-	  if (cp[0] == 0)
-	    return cp - str;
-	  if (cp[1] == 0)
-	    return cp - str + 1;
-	  if (cp[2] == 0)
-	    return cp - str + 2;
-	  if (cp[3] == 0)
-	    return cp - str + 3;
-	  if (sizeof (longword) > 4)
-	    {
-	      if (cp[4] == 0)
-		return cp - str + 4;
-	      if (cp[5] == 0)
-		return cp - str + 5;
-	      if (cp[6] == 0)
-		return cp - str + 6;
-	      if (cp[7] == 0)
-		return cp - str + 7;
-	    }
-	}
-    }
+  char_ptr = (const char *) (word_ptr - 1);
+  char_ptr += index_first_zero (word);
+  return char_ptr - str;
 }
+
+#ifndef STRLEN
 libc_hidden_builtin_def (strlen)
+#endif
diff --git a/sysdeps/generic/string-extbyte.h b/sysdeps/generic/string-extbyte.h
new file mode 100644
index 0000000..1ccd5b3
--- /dev/null
+++ b/sysdeps/generic/string-extbyte.h
@@ -0,0 +1,35 @@
+/* string-extbyte.h -- function memory order byte extract.  Generic C version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef STRING_EXTBYTE_H
+#define STRING_EXTBYTE_H 1
+
+#include <limits.h>
+#include <endian.h>
+#include <string-optype.h>
+
+static inline unsigned char
+extractbyte (op_t x, unsigned idx)
+{
+  if (__BYTE_ORDER == __LITTLE_ENDIAN)
+    return x >> (idx * CHAR_BIT);
+  else
+    return x >> (sizeof (x) - 1 - idx) * CHAR_BIT;
+}
+
+#endif /* STRING_EXTBYTE_H */
diff --git a/sysdeps/generic/string-fza.h b/sysdeps/generic/string-fza.h
new file mode 100644
index 0000000..638df2e
--- /dev/null
+++ b/sysdeps/generic/string-fza.h
@@ -0,0 +1,117 @@
+/* string-fza.h -- zero byte detection; basics.  Generic C version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef STRING_FZA_H
+#define STRING_FZA_H 1
+
+#include <limits.h>
+#include <string-optype.h>
+
+/* This function returns non-zero if any byte in X is zero.
+   More specifically, at least one bit set within the least significant
+   byte that was zero; other bytes within the word are indeterminate.  */
+
+static inline op_t
+find_zero_low (op_t x)
+{
+  /* This expression comes from
+       https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+     Subtracting 1 sets 0x80 in a byte that was 0; anding ~x clears
+     0x80 in a byte that was >= 128; anding 0x80 isolates that test bit.  */
+  op_t lsb = (op_t)-1 / 0xff;
+  op_t msb = lsb << (CHAR_BIT - 1);
+  return (x - lsb) & ~x & msb;
+}
+
+/* This function returns at least one bit set within every byte of X that
+   is zero.  The result is exact in that, unlike find_zero_low, all bytes
+   are determinate.  This is usually used for finding the index of the
+   most significant byte that was zero.  */
+
+static inline op_t
+find_zero_all (op_t x)
+{
+  /* For each byte, find not-zero by
+     (0) And 0x7f so that we cannot carry between bytes,
+     (1) Add 0x7f so that non-zero carries into 0x80,
+     (2) Or in the original byte (which might have had 0x80 set).
+     Then invert and mask such that 0x80 is set iff that byte was zero.  */
+  op_t m = ((op_t)-1 / 0xff) * 0x7f;
+  return ~(((x & m) + m) | x | m);
+}
+
+/* With similar caveats, identify bytes that are equal between X1 and X2.  */
+
+static inline op_t
+find_eq_low (op_t x1, op_t x2)
+{
+  return find_zero_low (x1 ^ x2);
+}
+
+static inline op_t
+find_eq_all (op_t x1, op_t x2)
+{
+  return find_zero_all (x1 ^ x2);
+}
+
+/* With similar caveats, identify zero bytes in X1 and bytes that are
+   equal between in X1 and X2.  */
+
+static inline op_t
+find_zero_eq_low (op_t x1, op_t x2)
+{
+  op_t lsb = (op_t)-1 / 0xff;
+  op_t msb = lsb << (CHAR_BIT - 1);
+  op_t eq = x1 ^ x2;
+  return (((x1 - lsb) & ~x1) | ((eq - lsb) & ~eq)) & msb;
+}
+
+static inline op_t
+find_zero_eq_all (op_t x1, op_t x2)
+{
+  op_t m = ((op_t)-1 / 0xff) * 0x7f;
+  op_t eq = x1 ^ x2;
+  op_t c1 = ((x1 & m) + m) | x1;
+  op_t c2 = ((eq & m) + m) | eq;
+  return ~((c1 & c2) | m);
+}
+
+/* With similar caveats, identify zero bytes in X1 and bytes that are
+   not equal between in X1 and X2.  */
+
+static inline op_t
+find_zero_ne_low (op_t x1, op_t x2)
+{
+  op_t m = ((op_t)-1 / 0xff) * 0x7f;
+  op_t eq = x1 ^ x2;
+  op_t nz1 = (x1 + m) | x1;	/* msb set if byte not zero */
+  op_t ne2 = (eq + m) | eq;	/* msb set if byte not equal */
+  return (ne2 | ~nz1) & ~m;	/* msb set if x1 zero or x2 not equal */
+}
+
+static inline op_t
+find_zero_ne_all (op_t x1, op_t x2)
+{
+  op_t m = ((op_t)-1 / 0xff) * 0x7f;
+  op_t eq = x1 ^ x2;
+  op_t nz1 = ((x1 & m) + m) | x1;
+  op_t ne2 = ((eq & m) + m) | eq;
+  return (ne2 | ~nz1) & ~m;
+}
+
+#endif /* STRING_FZA_H */
diff --git a/sysdeps/generic/string-fzb.h b/sysdeps/generic/string-fzb.h
new file mode 100644
index 0000000..e0fc26f
--- /dev/null
+++ b/sysdeps/generic/string-fzb.h
@@ -0,0 +1,49 @@
+/* string-fzb.h -- zero byte detection, boolean.  Generic C version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef STRING_FZB_H
+#define STRING_FZB_H 1
+
+#include <endian.h>
+#include <string-fza.h>
+
+/* Determine if any byte within X is zero.  This is a pure boolean test.  */
+
+static inline _Bool
+has_zero (op_t x)
+{
+  return find_zero_low (x) != 0;
+}
+
+/* Likewise, but for byte equality between X1 and X2.  */
+
+static inline _Bool
+has_eq (op_t x1, op_t x2)
+{
+  return find_eq_low (x1, x2) != 0;
+}
+
+/* Likewise, but for zeros in X1 and equal bytes between X1 and X2.  */
+
+static inline _Bool
+has_zero_eq (op_t x1, op_t x2)
+{
+  return find_zero_eq_low (x1, x2);
+}
+
+#endif /* STRING_FZB_H */
diff --git a/sysdeps/generic/string-fzi.h b/sysdeps/generic/string-fzi.h
new file mode 100644
index 0000000..ea2408f
--- /dev/null
+++ b/sysdeps/generic/string-fzi.h
@@ -0,0 +1,146 @@
+/* string-fzi.h -- zero byte detection; indexes.  Generic C version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef STRING_FZI_H
+#define STRING_FZI_H 1
+
+#include <limits.h>
+#include <endian.h>
+#include <string-fza.h>
+
+/* A subroutine for the index_zero functions.  Given a test word C, return
+   the (memory order) index of the first byte (in memory order) that is
+   non-zero.  */
+
+static inline unsigned int
+index_first_ (op_t c)
+{
+  _Static_assert (sizeof (op_t) == sizeof (long)
+		  || sizeof (op_t) == sizeof (long long),
+		  "Unhandled word size");
+
+  unsigned r;
+  if (__BYTE_ORDER == __LITTLE_ENDIAN)
+    {
+      if (sizeof (op_t) == sizeof (long))
+	r = __builtin_ctzl (c);
+      else
+	r = __builtin_ctzll (c);
+    }
+  else
+    {
+      if (sizeof (op_t) == sizeof (long))
+	r = __builtin_clzl (c);
+      else
+	r = __builtin_clzll (c);
+    }
+  return r / CHAR_BIT;
+}
+
+/* Similarly, but return the (memory order) index of the last byte
+   that is non-zero.  */
+
+static inline unsigned int
+index_last_ (op_t c)
+{
+  _Static_assert (sizeof (op_t) == sizeof (long)
+		  || sizeof (op_t) == sizeof (long long),
+		  "Unhandled word size");
+
+  unsigned r;
+  if (__BYTE_ORDER == __LITTLE_ENDIAN)
+    {
+      if (sizeof (op_t) == sizeof (long))
+	r = __builtin_clzl (c);
+      else
+	r = __builtin_clzll (c);
+    }
+  else
+    {
+      if (sizeof (op_t) == sizeof (long))
+	r = __builtin_ctzl (c);
+      else
+	r = __builtin_ctzll (c);
+    }
+  return sizeof (op_t) - 1 - (r / CHAR_BIT);
+}
+
+/* Given a word X that is known to contain a zero byte, return the
+   index of the first such within the word in memory order.  */
+
+static inline unsigned int
+index_first_zero (op_t x)
+{
+  if (__BYTE_ORDER == __LITTLE_ENDIAN)
+    x = find_zero_low (x);
+  else
+    x = find_zero_all (x);
+  return index_first_ (x);
+}
+
+/* Similarly, but perform the search for byte equality between X1 and X2.  */
+
+static inline unsigned int
+index_first_eq (op_t x1, op_t x2)
+{
+  if (__BYTE_ORDER == __LITTLE_ENDIAN)
+    x1 = find_eq_low (x1, x2);
+  else
+    x1 = find_eq_all (x1, x2);
+  return index_first_ (x1);
+}
+
+/* Similarly, but perform the search for zero within X1 or
+   equality between X1 and X2.  */
+
+static inline unsigned int
+index_first_zero_eq (op_t x1, op_t x2)
+{
+  if (__BYTE_ORDER == __LITTLE_ENDIAN)
+    x1 = find_zero_eq_low (x1, x2);
+  else
+    x1 = find_zero_eq_all (x1, x2);
+  return index_first_ (x1);
+}
+
+/* Similarly, but perform the search for zero within X1 or
+   inequality between X1 and X2.  */
+
+static inline unsigned int
+index_first_zero_ne (op_t x1, op_t x2)
+{
+  if (__BYTE_ORDER == __LITTLE_ENDIAN)
+    x1 = find_zero_ne_low (x1, x2);
+  else
+    x1 = find_zero_ne_all (x1, x2);
+  return index_first_ (x1);
+}
+
+/* Similarly, but search for the last zero within X.  */
+
+static inline unsigned int
+index_last_zero (op_t x)
+{
+  if (__BYTE_ORDER == __LITTLE_ENDIAN)
+    x = find_zero_all (x);
+  else
+    x = find_zero_low (x);
+  return index_last_ (x);
+}
+
+#endif /* STRING_FZI_H */
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 09/16] Improve generic strnlen
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
                   ` (14 preceding siblings ...)
  2016-12-21 23:06 ` [PATCH v2 01/16] Parameterize op_t " Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  15 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

	[BZ #5806]
	* string/strnlen.c: Rewrite in terms of __memchr.
---
 string/strnlen.c | 133 +++----------------------------------------------------
 1 file changed, 5 insertions(+), 128 deletions(-)

diff --git a/string/strnlen.c b/string/strnlen.c
index b2b0664..d01f9c4 100644
--- a/string/strnlen.c
+++ b/string/strnlen.c
@@ -21,7 +21,6 @@
    not, see <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
-#include <stdlib.h>
 
 /* Find the length of S, but scan at most MAXLEN characters.  If no
    '\0' terminator is found in that many characters, return MAXLEN.  */
@@ -31,136 +30,14 @@
 #endif
 
 size_t
-__strnlen (const char *str, size_t maxlen)
+__strnlen (const char *s, size_t maxlen)
 {
-  const char *char_ptr, *end_ptr = str + maxlen;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, himagic, lomagic;
-
-  if (maxlen == 0)
-    return 0;
-
-  if (__glibc_unlikely (end_ptr < str))
-    end_ptr = (const char *) ~0UL;
-
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = str; ((unsigned long int) char_ptr
-			& (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
-    if (*char_ptr == '\0')
-      {
-	if (char_ptr > end_ptr)
-	  char_ptr = end_ptr;
-	return char_ptr - str;
-      }
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  himagic = 0x80808080L;
-  lomagic = 0x01010101L;
-  if (sizeof (longword) > 4)
-    {
-      /* 64-bit version of the magic.  */
-      /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-      himagic = ((himagic << 16) << 16) | himagic;
-      lomagic = ((lomagic << 16) << 16) | lomagic;
-    }
-  if (sizeof (longword) > 8)
-    abort ();
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  while (longword_ptr < (unsigned long int *) end_ptr)
-    {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.  */
-
-      longword = *longword_ptr++;
-
-      if ((longword - lomagic) & himagic)
-	{
-	  /* Which of the bytes was the zero?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const char *cp = (const char *) (longword_ptr - 1);
-
-	  char_ptr = cp;
-	  if (cp[0] == 0)
-	    break;
-	  char_ptr = cp + 1;
-	  if (cp[1] == 0)
-	    break;
-	  char_ptr = cp + 2;
-	  if (cp[2] == 0)
-	    break;
-	  char_ptr = cp + 3;
-	  if (cp[3] == 0)
-	    break;
-	  if (sizeof (longword) > 4)
-	    {
-	      char_ptr = cp + 4;
-	      if (cp[4] == 0)
-		break;
-	      char_ptr = cp + 5;
-	      if (cp[5] == 0)
-		break;
-	      char_ptr = cp + 6;
-	      if (cp[6] == 0)
-		break;
-	      char_ptr = cp + 7;
-	      if (cp[7] == 0)
-		break;
-	    }
-	}
-      char_ptr = end_ptr;
-    }
-
-  if (char_ptr > end_ptr)
-    char_ptr = end_ptr;
-  return char_ptr - str;
+  const char *found = memchr (s, '\0', maxlen);
+  return found ? found - s : maxlen;
 }
+
 #ifndef STRNLEN
-libc_hidden_def (__strnlen)
 weak_alias (__strnlen, strnlen)
+libc_hidden_def (__strnlen)
 #endif
 libc_hidden_def (strnlen)
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 07/16] Improve generic strrchr
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
                   ` (7 preceding siblings ...)
  2016-12-21 23:06 ` [PATCH v2 10/16] Improve generic strcmp Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 11/16] hppa: Add memcopy.h Richard Henderson
                   ` (6 subsequent siblings)
  15 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

	* string/strrchr.c: Use string-fzb.h and string-fzi.h.
---
 string/strrchr.c | 76 +++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 61 insertions(+), 15 deletions(-)

diff --git a/string/strrchr.c b/string/strrchr.c
index a07457e..09c1043 100644
--- a/string/strrchr.c
+++ b/string/strrchr.c
@@ -16,38 +16,84 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
+#include <stdint.h>
+#include <string-fzb.h>
+#include <string-fzi.h>
+#include <string-extbyte.h>
 
 #undef strrchr
+#undef rindex
 
-#ifndef STRRCHR
-# define STRRCHR strrchr
+#ifdef STRRCHR
+#define strrchr STRRCHR
 #endif
 
 /* Find the last occurrence of C in S.  */
 char *
-STRRCHR (const char *s, int c)
+strrchr (const char *s, int int_c)
 {
-  const char *found, *p;
+  const unsigned char *found_c = NULL, *ptr_c;
+  const op_t *found_w = NULL, *ptr_w;
+  op_t word, repeated_c;
+  uintptr_t i, align;
+  unsigned char c;
 
-  c = (unsigned char) c;
+  c = (unsigned char) int_c;
+  ptr_c = (const unsigned char *) s;
 
-  /* Since strchr is fast, we use it rather than the obvious loop.  */
+  /* Handle the first few characters by reading one character at a time.
+     Do this until CHAR_PTR is aligned on a word boundary.  */
+  align = -(uintptr_t)ptr_c % sizeof(word);
+  for (i = 0; i < align; ++i, ++ptr_c)
+    {
+      unsigned char this_c = *ptr_c;
+      if (this_c == c)
+        found_c = ptr_c;
+      if (this_c == '\0')
+        return (char *) found_c;
+    }
 
-  if (c == '\0')
-    return strchr (s, '\0');
+  /* Set up a word, each of whose bytes is C.  */
+  repeated_c = ((op_t)-1 / 0xff) * c;
 
-  found = NULL;
-  while ((p = strchr (s, c)) != NULL)
+  /* Search words for C.  At this point, merely record the last word
+     that contained the character.  Stop when we find EOS.  */
+  ptr_w = (const op_t *) ptr_c;
+  while (1)
     {
-      found = p;
-      s = p + 1;
+      word = *ptr_w;
+      if (has_zero (word))
+	break;
+      if (has_eq (word, repeated_c))
+	found_w = ptr_w;
+      ptr_w++;
     }
 
-  return (char *) found;
+  /* Check to see if we've got C in the last word.  */
+  i = index_first_zero_eq (word, repeated_c);
+  if (extractbyte (word, i) == c)
+    found_w = ptr_w;
+
+  /* If we found a word containing C, go back and search it byte by byte.
+     This is probably cheaper than indexing for the zero within WORD,
+     using that to mask out following bytes that might be C, and then
+     indexing to find the last C.  */
+  if (found_w)
+    {
+      ptr_c = (const unsigned char *) found_w;
+      for (i = 0; i < sizeof (word); ++i, ++ptr_c)
+	{
+	  unsigned char this_c = *ptr_c;
+	  if (this_c == c)
+	    found_c = ptr_c;
+	  if (this_c == '\0')
+	    break;
+	}
+    }
+  return (char *) found_c;
 }
 
-#ifdef weak_alias
-#undef rindex
+#ifndef STRRCHR
 weak_alias (strrchr, rindex)
 #endif
 libc_hidden_builtin_def (strrchr)
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 05/16] Improve generic memchr
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
                   ` (11 preceding siblings ...)
  2016-12-21 23:06 ` [PATCH v2 03/16] Improve generic strlen Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-23 18:33   ` Tulio Magno Quites Machado Filho
  2016-12-21 23:06 ` [PATCH v2 02/16] Parameterize OP_T_THRES from memcopy.h Richard Henderson
                   ` (2 subsequent siblings)
  15 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

	[BZ #5806]
	* string/memchr.c: Use string-fzb.h, string-fzi.h, string-opthr.h.
---
 string/memchr.c | 146 ++++++++++++++++----------------------------------------
 1 file changed, 41 insertions(+), 105 deletions(-)

diff --git a/string/memchr.c b/string/memchr.c
index ca9fd99..f724911 100644
--- a/string/memchr.c
+++ b/string/memchr.c
@@ -25,138 +25,74 @@
 #endif
 
 #include <string.h>
-
 #include <stddef.h>
-
-#include <limits.h>
+#include <stdint.h>
+#include <string-fzb.h>
+#include <string-fzi.h>
+#include <string-opthr.h>
 
 #undef __memchr
-#ifdef _LIBC
-# undef memchr
-#endif
+#undef memchr
 
-#ifndef weak_alias
-# define __memchr memchr
-#endif
-
-#ifndef MEMCHR
-# define MEMCHR __memchr
+#ifdef MEMCHR
+#define __memchr MEMCHR
 #endif
 
 /* Search no more than N bytes of S for C.  */
 void *
-MEMCHR (void const *s, int c_in, size_t n)
+__memchr (void const *s, int c_in, size_t n)
 {
-  /* On 32-bit hardware, choosing longword to be a 32-bit unsigned
-     long instead of a 64-bit uintmax_t tends to give better
-     performance.  On 64-bit hardware, unsigned long is generally 64
-     bits already.  Change this typedef to experiment with
-     performance.  */
-  typedef unsigned long int longword;
-
   const unsigned char *char_ptr;
-  const longword *longword_ptr;
-  longword repeated_one;
-  longword repeated_c;
+  const op_t *word_ptr;
+  op_t word, repeated_c;
   unsigned char c;
+  uintptr_t i, align;
 
   c = (unsigned char) c_in;
+  char_ptr = (const unsigned char *) s;
 
   /* Handle the first few bytes by reading one byte at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s;
-       n > 0 && (size_t) char_ptr % sizeof (longword) != 0;
-       --n, ++char_ptr)
+     Do this until CHAR_PTR is aligned on a word boundary, or
+     the entirety of small inputs.  */
+  align = -(uintptr_t)char_ptr % sizeof (word);
+  if (n < OP_T_THRES || align > n)
+    align = n;
+  for (i = 0; i < align; ++i, ++char_ptr)
     if (*char_ptr == c)
       return (void *) char_ptr;
 
-  longword_ptr = (const longword *) char_ptr;
+  /* Set up a word, each of whose bytes is C.  */
+  repeated_c = ((op_t)-1 / 0xff) * c;
 
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to any size longwords.  */
+  word_ptr = (const op_t *) char_ptr;
+  n -= align;
+  if (__glibc_unlikely (n == 0))
+    return NULL;
 
-  /* Compute auxiliary longword values:
-     repeated_one is a value which has a 1 in every byte.
-     repeated_c has c in every byte.  */
-  repeated_one = 0x01010101;
-  repeated_c = c | (c << 8);
-  repeated_c |= repeated_c << 16;
-  if (0xffffffffU < (longword) -1)
+  /* Loop while we have more than one word remaining.  */
+  while (n > sizeof (word))
     {
-      repeated_one |= repeated_one << 31 << 1;
-      repeated_c |= repeated_c << 31 << 1;
-      if (8 < sizeof (longword))
-	{
-	  size_t i;
-
-	  for (i = 64; i < sizeof (longword) * 8; i *= 2)
-	    {
-	      repeated_one |= repeated_one << i;
-	      repeated_c |= repeated_c << i;
-	    }
-	}
+      word = *word_ptr;
+      if (has_eq (word, repeated_c))
+	goto found;
+      word_ptr++;
+      n -= sizeof (word);
     }
 
-  /* Instead of the traditional loop which tests each byte, we will test a
-     longword at a time.  The tricky part is testing if *any of the four*
-     bytes in the longword in question are equal to c.  We first use an xor
-     with repeated_c.  This reduces the task to testing whether *any of the
-     four* bytes in longword1 is zero.
-
-     We compute tmp =
-       ((longword1 - repeated_one) & ~longword1) & (repeated_one << 7).
-     That is, we perform the following operations:
-       1. Subtract repeated_one.
-       2. & ~longword1.
-       3. & a mask consisting of 0x80 in every byte.
-     Consider what happens in each byte:
-       - If a byte of longword1 is zero, step 1 and 2 transform it into 0xff,
-	 and step 3 transforms it into 0x80.  A carry can also be propagated
-	 to more significant bytes.
-       - If a byte of longword1 is nonzero, let its lowest 1 bit be at
-	 position k (0 <= k <= 7); so the lowest k bits are 0.  After step 1,
-	 the byte ends in a single bit of value 0 and k bits of value 1.
-	 After step 2, the result is just k bits of value 1: 2^k - 1.  After
-	 step 3, the result is 0.  And no carry is produced.
-     So, if longword1 has only non-zero bytes, tmp is zero.
-     Whereas if longword1 has a zero byte, call j the position of the least
-     significant zero byte.  Then the result has a zero at positions 0, ...,
-     j-1 and a 0x80 at position j.  We cannot predict the result at the more
-     significant bytes (positions j+1..3), but it does not matter since we
-     already have a non-zero bit at position 8*j+7.
-
-     So, the test whether any byte in longword1 is zero is equivalent to
-     testing whether tmp is nonzero.  */
-
-  while (n >= sizeof (longword))
+  /* Since our pointer is aligned, we can always read the last word.  */
+  word = *word_ptr;
+  if (has_eq (word, repeated_c))
     {
-      longword longword1 = *longword_ptr ^ repeated_c;
-
-      if ((((longword1 - repeated_one) & ~longword1)
-	   & (repeated_one << 7)) != 0)
-	break;
-      longword_ptr++;
-      n -= sizeof (longword);
-    }
-
-  char_ptr = (const unsigned char *) longword_ptr;
-
-  /* At this point, we know that either n < sizeof (longword), or one of the
-     sizeof (longword) bytes starting at char_ptr is == c.  On little-endian
-     machines, we could determine the first such byte without any further
-     memory accesses, just by looking at the tmp result from the last loop
-     iteration.  But this does not work on big-endian machines.  Choose code
-     that works in both cases.  */
-
-  for (; n > 0; --n, ++char_ptr)
-    {
-      if (*char_ptr == c)
-	return (void *) char_ptr;
+ found:
+      i = index_first_eq (word, repeated_c);
+      if (i < n)
+	return (char *) word_ptr + i;
     }
 
   return NULL;
 }
-#ifdef weak_alias
+
+#ifndef MEMCHR
 weak_alias (__memchr, memchr)
-#endif
 libc_hidden_builtin_def (memchr)
+#endif
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 14/16] arm: Add string-fza.h
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 15/16] arm: Define __memchr Richard Henderson
                   ` (14 subsequent siblings)
  15 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

While arm has the more important string functions in assembly,
there are still a few generic routines used.

Use the UQSUB8 insn for testing of zeros.

	* sysdeps/arm/armv6t2/string-fza.h: New file.
---
 sysdeps/arm/armv6t2/string-fza.h | 69 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 sysdeps/arm/armv6t2/string-fza.h

diff --git a/sysdeps/arm/armv6t2/string-fza.h b/sysdeps/arm/armv6t2/string-fza.h
new file mode 100644
index 0000000..e46abb1
--- /dev/null
+++ b/sysdeps/arm/armv6t2/string-fza.h
@@ -0,0 +1,69 @@
+/* string-fza.h -- zero byte detection; basics.  ARM version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef STRING_FZA_H
+#define STRING_FZA_H 1
+
+#include <string-optype.h>
+
+/* This function returns at least one bit set within every byte
+   of X that is zero.  */
+
+static inline op_t
+find_zero_all (op_t x)
+{
+  /* Use unsigned saturated subtraction from 1 in each byte.
+     That leaves 1 for every byte that was zero.  */
+  op_t ret, ones = (op_t)-1 / 0xff;
+  asm ("uqsub8 %0,%1,%2" : "=r"(ret) : "r"(ones), "r"(x));
+  return ret;
+}
+
+/* Identify bytes that are equal between X1 and X2.  */
+
+static inline op_t
+find_eq_all (op_t x1, op_t x2)
+{
+  return find_zero_all (x1 ^ x2);
+}
+
+/* Identify zero bytes in X1 or equality between X1 and X2.  */
+
+static inline op_t
+find_zero_eq_all (op_t x1, op_t x2)
+{
+  return find_zero_all (x1) | find_zero_all (x1 ^ x2);
+}
+
+/* Identify zero bytes in X1 or inequality between X1 and X2.  */
+
+static inline op_t
+find_zero_ne_all (op_t x1, op_t x2)
+{
+  /* Make use of the fact that we'll already have ONES in a register.  */
+  op_t ones = (op_t)-1 / 0xff;
+  return find_zero_all (x1) | (find_zero_all (x1 ^ x2) ^ ones);
+}
+
+/* Define the "inexact" versions in terms of the exact versions.  */
+#define find_zero_low		find_zero_all
+#define find_eq_low		find_eq_all
+#define find_zero_eq_low	find_zero_eq_all
+#define find_zero_ne_low	find_zero_ne_all
+
+#endif /* STRING_FZA_H */
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 06/16] Improve generic strchrnul
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 14/16] arm: Add string-fza.h Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 15/16] arm: Define __memchr Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 16/16] powerpc: Add string-fza.h Richard Henderson
                   ` (12 subsequent siblings)
  15 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

	[BZ #5806]
	* string/strchrnul.c: Use string-fzb.h, string-fzi.h.
---
 string/strchrnul.c | 142 +++++++++--------------------------------------------
 1 file changed, 24 insertions(+), 118 deletions(-)

diff --git a/string/strchrnul.c b/string/strchrnul.c
index 629db46..7090c05 100644
--- a/string/strchrnul.c
+++ b/string/strchrnul.c
@@ -21,146 +21,52 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
-#include <memcopy.h>
 #include <stdlib.h>
+#include <stdint.h>
+#include <string-fzb.h>
+#include <string-fzi.h>
 
 #undef __strchrnul
 #undef strchrnul
 
-#ifndef STRCHRNUL
-# define STRCHRNUL __strchrnul
+#ifdef STRCHRNUL
+# define __strchrnul STRCHRNUL
 #endif
 
 /* Find the first occurrence of C in S or the final NUL byte.  */
 char *
-STRCHRNUL (const char *s, int c_in)
+__strchrnul (const char *s, int c_in)
 {
   const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
+  const op_t *word_ptr;
+  op_t word, repeated_c;
+  uintptr_t i, align, found;
   unsigned char c;
 
   c = (unsigned char) c_in;
+  char_ptr = (const unsigned char *) s;
 
   /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s;
-       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
+     Do this until CHAR_PTR is aligned on a word boundary.  */
+  align = -(uintptr_t)char_ptr % sizeof(word);
+  for (i = 0; i < align; ++i, ++char_ptr)
     if (*char_ptr == c || *char_ptr == '\0')
-      return (void *) char_ptr;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
+      return (char *) char_ptr;
 
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  magic_bits = -1;
-  magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1;
+  /* Set up a word, each of whose bytes is C.  */
+  repeated_c = ((op_t)-1 / 0xff) * c;
 
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-  if (sizeof (longword) > 4)
-    /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-    charmask |= (charmask << 16) << 16;
-  if (sizeof (longword) > 8)
-    abort ();
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  for (;;)
+  word_ptr = (op_t *) char_ptr;
+  do
     {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
-
-	 3) But wait!  Aren't we looking for C as well as zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *longword_ptr++;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0 ||
-
-	  /* That caught zeroes.  Now test for C.  */
-	  ((((longword ^ charmask) + magic_bits) ^ ~(longword ^ charmask))
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C or zero?
-	     If none of them were, it was a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
-
-	  if (*cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (*++cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (*++cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (*++cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (sizeof (longword) > 4)
-	    {
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	    }
-	}
+      word = *word_ptr++;
     }
+  while (!has_zero_eq (word, repeated_c));
 
-  /* This should never happen.  */
-  return NULL;
+  found = index_first_zero_eq (word, repeated_c);
+  return (char *) (word_ptr - 1) + found;
 }
 
+#ifndef STRCHRNUL
 weak_alias (__strchrnul, strchrnul)
+#endif
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 15/16] arm: Define __memchr
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 14/16] arm: Add string-fza.h Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-22  0:25   ` Joseph Myers
  2016-12-21 23:06 ` [PATCH v2 06/16] Improve generic strchrnul Richard Henderson
                   ` (13 subsequent siblings)
  15 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

All other memchr implementations define it.

	* sysdeps/arm/armv6t2/memchr.S (__memchr): Define alias.
---
 sysdeps/arm/armv6t2/memchr.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sysdeps/arm/armv6t2/memchr.S b/sysdeps/arm/armv6t2/memchr.S
index ded52dc..b4117fe 100644
--- a/sysdeps/arm/armv6t2/memchr.S
+++ b/sysdeps/arm/armv6t2/memchr.S
@@ -184,4 +184,6 @@ ENTRY(memchr)
 	DO_RET(lr)
 
 END(memchr)
+
+strong_alias (memchr, __memchr)
 libc_hidden_builtin_def (memchr)
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 10/16] Improve generic strcmp
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
                   ` (6 preceding siblings ...)
  2016-12-21 23:06 ` [PATCH v2 13/16] alpha: Add string-fzb.h and string-fzi.h Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 07/16] Improve generic strrchr Richard Henderson
                   ` (7 subsequent siblings)
  15 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

	* string/strcmp.c: Rewrite using memcopy.h, string-fzb.h,
	string-fzi.h.
---
 string/strcmp.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 91 insertions(+), 11 deletions(-)

diff --git a/string/strcmp.c b/string/strcmp.c
index 4b16f99..cb522ff 100644
--- a/string/strcmp.c
+++ b/string/strcmp.c
@@ -16,32 +16,112 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
+#include <stdint.h>
+#include <limits.h>
+#include <string-fzb.h>
+#include <string-fzi.h>
+#include <string-extbyte.h>
+#include <memcopy.h>
 
 #undef strcmp
 
-#ifndef STRCMP
-# define STRCMP strcmp
+#ifdef STRCMP
+# define strcmp STRCMP
 #endif
 
 /* Compare S1 and S2, returning less than, equal to or
    greater than zero if S1 is lexicographically less than,
    equal to or greater than S2.  */
 int
-STRCMP (const char *p1, const char *p2)
+strcmp (const char *p1, const char *p2)
 {
-  const unsigned char *s1 = (const unsigned char *) p1;
-  const unsigned char *s2 = (const unsigned char *) p2;
+  const op_t *x1, *x2;
+  op_t w1, w2;
   unsigned char c1, c2;
+  uintptr_t i, n, ofs;
+  int diff;
+
+  /* Handle the unaligned bytes of p1 first.  */
+  n = -(uintptr_t)p1 % sizeof(op_t);
+  for (i = 0; i < n; ++i)
+    {
+      c1 = *p1++;
+      c2 = *p2++;
+      diff = c1 - c2;
+      if (c1 == '\0' || diff)
+	return diff;
+    }
 
-  do
+  /* P1 is now aligned to unsigned long.  P2 may or may not be.  */
+  x1 = (const op_t *)p1;
+  w1 = *x1++;
+  ofs = (uintptr_t)p2 % sizeof(op_t);
+  if (ofs == 0)
     {
-      c1 = (unsigned char) *s1++;
-      c2 = (unsigned char) *s2++;
-      if (c1 == '\0')
-	return c1 - c2;
+      x2 = (const op_t *)p2;
+      w2 = *x2++;
+      /* Aligned loop.  If a difference is found, exit to compare the
+         bytes.  Else if a zero is found we have equal strings.  */
+      while (w1 == w2)
+	{
+	  if (has_zero (w1))
+	    return 0;
+          w1 = *x1++;
+          w2 = *x2++;
+	}
     }
-  while (c1 == c2);
+  else
+    {
+      op_t w2a, w2b;
+      uintptr_t sh_1, sh_2;
+
+      x2 = (const op_t *)(p2 - ofs);
+      w2a = *x2++;
+      sh_1 = ofs * CHAR_BIT;
+      sh_2 = sizeof(op_t) * CHAR_BIT - sh_1;
+
+      /* Align the first partial of P2, with 0xff for the rest of the
+         bytes so that we can also apply the has_zero test to see if we
+         have already reached EOS.  If we have, then we can simply fall
+         through to the final comparison.  */
+      w2 = MERGE (w2a, sh_1, (op_t)-1, sh_2);
+      if (!has_zero (w2))
+	{
+	  /* Unaligned loop.  The invariant is that W2B, which is "ahead"
+             of W1, does not contain end-of-string.  Therefore it is safe
+             (and necessary) to read another word from each while we do
+             not have a difference.  */
+	  while (1)
+	    {
+	      w2b = *x2++;
+	      w2 = MERGE (w2a, sh_1, w2b, sh_2);
+	      if (w1 != w2)
+		goto final_cmp;
+	      if (has_zero (w2b))
+		break;
+	      w1 = *x1++;
+	      w2a = w2b;
+	    }
 
+	  /* Zero found in the second partial of P2.  If we had EOS
+	     in the aligned word, we have equality.  */
+	  if (has_zero (w1))
+	    return 0;
+
+          /* Load the final word of P1 and align the final partial of P2.  */
+	  w1 = *x1++;
+          w2 = MERGE (w2b, sh_1, 0, sh_2);
+	}
+    }
+
+ final_cmp:
+  /* We have two aligned words of data.  */
+  i = index_first_zero_ne (w1, w2);
+  c1 = extractbyte (w1, i);
+  c2 = extractbyte (w2, i);
   return c1 - c2;
 }
+
+#ifndef STRCMP
 libc_hidden_builtin_def (strcmp)
+#endif
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 00/16] Improve generic string routines
@ 2016-12-21 23:06 Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 14/16] arm: Add string-fza.h Richard Henderson
                   ` (15 more replies)
  0 siblings, 16 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

The idea here was to improve hppa performance without going all
the way to hand-written assembly (at least for now) so that other
minor architectures could benefit.

Changes since v1:
  * Marked ChangeLog entries with [BZ #5806], as appropriate.

  * Reorganized the headers, so that armv6t2 and power6 need override
    as little as possible to use their (integer) zero detection insns.

  * Hopefully fixed all of the coding style issues.

  * Adjusted the memrchr algorithm as discussed.

  * Replaced the #ifdef STRRCHR etc that are used by the multiarch files.

  * Tested on i386, i686, x86_64 (verified this is unused), ppc64,
    ppc64le --with-cpu=power8 (to use power6 in multiarch), armv7,
    aarch64, alpha (qemu) and hppa (qemu).

As for the rest of #5806... I see two schools of thought: either let
sleeping dogs lie, or remove the files entirely.  The "new" zero test
is objectively better, requiring only one branch per test instead of two.

For i686, it's possible that the unrolling done is significant, but
it's also code that's essentially unused except for ld.so startup.
After that we'll use the sse2 version for all but the barest fraction
of hw still running.

For m68k, there's no unrolling done and the compiler generated version
looks better to me; however I can't benchmark it.


r~


Richard Henderson (16):
  Parameterize op_t from memcopy.h
  Parameterize OP_T_THRES from memcopy.h
  Improve generic strlen
  Improve generic strchr
  Improve generic memchr
  Improve generic strchrnul
  Improve generic strrchr
  Improve generic memrchr
  Improve generic strnlen
  Improve generic strcmp
  hppa: Add memcopy.h
  hppa: Add string-fzb.h and string-fzi.h
  alpha: Add string-fzb.h and string-fzi.h
  arm: Add string-fza.h
  arm: Define __memchr
  powerpc: Add string-fza.h

 string/memchr.c                               | 146 ++++++------------
 string/memrchr.c                              | 205 +++++++-------------------
 string/strchr.c                               | 164 ++++-----------------
 string/strchrnul.c                            | 142 +++---------------
 string/strcmp.c                               | 102 +++++++++++--
 string/strlen.c                               |  89 +++--------
 string/strnlen.c                              | 133 +----------------
 string/strrchr.c                              |  76 ++++++++--
 sysdeps/alpha/string-fza.h                    |   1 +
 sysdeps/alpha/string-fzb.h                    |  51 +++++++
 sysdeps/alpha/string-fzi.h                    | 106 +++++++++++++
 sysdeps/arm/armv6t2/memchr.S                  |   2 +
 sysdeps/arm/armv6t2/string-fza.h              |  69 +++++++++
 sysdeps/generic/memcopy.h                     |  11 +-
 sysdeps/generic/string-extbyte.h              |  35 +++++
 sysdeps/generic/string-fza.h                  | 117 +++++++++++++++
 sysdeps/generic/string-fzb.h                  |  49 ++++++
 sysdeps/generic/string-fzi.h                  | 146 ++++++++++++++++++
 sysdeps/generic/string-opthr.h                |  25 ++++
 sysdeps/generic/string-optype.h               |  31 ++++
 sysdeps/hppa/memcopy.h                        |  44 ++++++
 sysdeps/hppa/string-fza.h                     |   1 +
 sysdeps/hppa/string-fzb.h                     |  69 +++++++++
 sysdeps/hppa/string-fzi.h                     | 129 ++++++++++++++++
 sysdeps/i386/memcopy.h                        |   3 -
 sysdeps/i386/string-opthr.h                   |  25 ++++
 sysdeps/m68k/memcopy.h                        |   3 -
 sysdeps/powerpc/power6/string-fza.h           |  65 ++++++++
 sysdeps/powerpc/powerpc32/power4/memcopy.h    |   5 -
 sysdeps/powerpc/powerpc32/power6/string-fza.h |   1 +
 sysdeps/powerpc/powerpc64/power6/string-fza.h |   1 +
 31 files changed, 1299 insertions(+), 747 deletions(-)
 create mode 100644 sysdeps/alpha/string-fza.h
 create mode 100644 sysdeps/alpha/string-fzb.h
 create mode 100644 sysdeps/alpha/string-fzi.h
 create mode 100644 sysdeps/arm/armv6t2/string-fza.h
 create mode 100644 sysdeps/generic/string-extbyte.h
 create mode 100644 sysdeps/generic/string-fza.h
 create mode 100644 sysdeps/generic/string-fzb.h
 create mode 100644 sysdeps/generic/string-fzi.h
 create mode 100644 sysdeps/generic/string-opthr.h
 create mode 100644 sysdeps/generic/string-optype.h
 create mode 100644 sysdeps/hppa/memcopy.h
 create mode 100644 sysdeps/hppa/string-fza.h
 create mode 100644 sysdeps/hppa/string-fzb.h
 create mode 100644 sysdeps/hppa/string-fzi.h
 create mode 100644 sysdeps/i386/string-opthr.h
 create mode 100644 sysdeps/powerpc/power6/string-fza.h
 create mode 100644 sysdeps/powerpc/powerpc32/power6/string-fza.h
 create mode 100644 sysdeps/powerpc/powerpc64/power6/string-fza.h

-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 16/16] powerpc: Add string-fza.h
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
                   ` (2 preceding siblings ...)
  2016-12-21 23:06 ` [PATCH v2 06/16] Improve generic strchrnul Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-23 18:44   ` Tulio Magno Quites Machado Filho
  2016-12-21 23:06 ` [PATCH v2 08/16] Improve generic memrchr Richard Henderson
                   ` (11 subsequent siblings)
  15 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

While ppc has the more important string functions in assembly,
there are still a few generic routines used.

Use the Power 6 CMPB insn for testing of zeros.

	* sysdeps/powerpc/power6/string-fza.h: New file.
	* sysdeps/powerpc/powerpc32/power6/string-fza.h: New file.
	* sysdeps/powerpc/powerpc64/power6/string-fza.h: New file.
---
 sysdeps/powerpc/power6/string-fza.h           | 65 +++++++++++++++++++++++++++
 sysdeps/powerpc/powerpc32/power6/string-fza.h |  1 +
 sysdeps/powerpc/powerpc64/power6/string-fza.h |  1 +
 3 files changed, 67 insertions(+)
 create mode 100644 sysdeps/powerpc/power6/string-fza.h
 create mode 100644 sysdeps/powerpc/powerpc32/power6/string-fza.h
 create mode 100644 sysdeps/powerpc/powerpc64/power6/string-fza.h

diff --git a/sysdeps/powerpc/power6/string-fza.h b/sysdeps/powerpc/power6/string-fza.h
new file mode 100644
index 0000000..1515b00
--- /dev/null
+++ b/sysdeps/powerpc/power6/string-fza.h
@@ -0,0 +1,65 @@
+/* string-fza.h -- zero byte detection; basics.  Power6 version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef STRING_FZA_H
+#define STRING_FZA_H 1
+
+#include <string-optype.h>
+
+/* This function returns 0xff for each byte that is
+   equal between X1 and X2.  */
+
+static inline op_t
+find_eq_all (op_t x1, op_t x2)
+{
+  op_t ret;
+  asm ("cmpb %0,%1,%2" : "=r"(ret) : "r"(x1), "r"(x2));
+  return ret;
+}
+
+/* This function returns 0xff for each byte that is zero in X.  */
+
+static inline op_t
+find_zero_all (op_t x)
+{
+  return find_eq_all (x, 0);
+}
+
+/* Identify zero bytes in X1 or equality between X1 and X2.  */
+
+static inline op_t
+find_zero_eq_all (op_t x1, op_t x2)
+{
+  return find_zero_all (x1) | find_eq_all (x1, x2);
+}
+
+/* Identify zero bytes in X1 or inequality between X1 and X2.  */
+
+static inline op_t
+find_zero_ne_all (op_t x1, op_t x2)
+{
+  return find_zero_all (x1) | ~find_eq_all (x1, x2);
+}
+
+/* Define the "inexact" versions in terms of the exact versions.  */
+#define find_zero_low		find_zero_all
+#define find_eq_low		find_eq_all
+#define find_zero_eq_low	find_zero_eq_all
+#define find_zero_ne_low	find_zero_ne_all
+
+#endif /* STRING_FZA_H */
diff --git a/sysdeps/powerpc/powerpc32/power6/string-fza.h b/sysdeps/powerpc/powerpc32/power6/string-fza.h
new file mode 100644
index 0000000..bb00d7c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power6/string-fza.h
@@ -0,0 +1 @@
+#include <sysdeps/powerpc/power6/string-fza.h>
diff --git a/sysdeps/powerpc/powerpc64/power6/string-fza.h b/sysdeps/powerpc/powerpc64/power6/string-fza.h
new file mode 100644
index 0000000..bb00d7c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power6/string-fza.h
@@ -0,0 +1 @@
+#include <sysdeps/powerpc/power6/string-fza.h>
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 11/16] hppa: Add memcopy.h
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
                   ` (8 preceding siblings ...)
  2016-12-21 23:06 ` [PATCH v2 07/16] Improve generic strrchr Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 12/16] hppa: Add string-fzb.h and string-fzi.h Richard Henderson
                   ` (5 subsequent siblings)
  15 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

GCC's combine pass cannot merge (x >> c | y << (32 - c)) into a
double-word shift unless (1) the subtract is in the same basic block
and (2) the result of the subtract is used exactly once.  Neither
condition is true for any use of MERGE.

By forcing the use of a double-word shift, we not only reduce
contention on SAR, but also allow the setting of SAR to be hoisted
outside of a loop.

	* sysdeps/hppa/memcopy.h: New file.
---
 sysdeps/hppa/memcopy.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 sysdeps/hppa/memcopy.h

diff --git a/sysdeps/hppa/memcopy.h b/sysdeps/hppa/memcopy.h
new file mode 100644
index 0000000..4076b8b
--- /dev/null
+++ b/sysdeps/hppa/memcopy.h
@@ -0,0 +1,44 @@
+/* memcopy.h -- definitions for memory copy functions, PA-RISC version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/generic/memcopy.h>
+
+/* Use a single double-word shift instead of two shifts and an ior.
+   If the uses of MERGE were close to the computation of shl/shr,
+   the compiler might have been able to create this itself.
+   But instead that computation is well separated.
+
+   Using an inline function instead of a macro is the easiest way
+   to ensure that the types are correct.  */
+
+#undef MERGE
+
+extern void link_error(void);
+
+static inline op_t
+MERGE(op_t w0, int shl, op_t w1, int shr)
+{
+  op_t res;
+  if (OPSIZ == 4)
+    asm("shrpw %1,%2,%%sar,%0" : "=r"(res) : "r"(w0), "r"(w1), "q"(shr));
+  else if (OPSIZ == 8)
+    asm("shrpd %1,%2,%%sar,%0" : "=r"(res) : "r"(w0), "r"(w1), "q"(shr));
+  else
+    link_error(), res = 0;
+  return res;
+}
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 04/16] Improve generic strchr
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
                   ` (4 preceding siblings ...)
  2016-12-21 23:06 ` [PATCH v2 08/16] Improve generic memrchr Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 13/16] alpha: Add string-fzb.h and string-fzi.h Richard Henderson
                   ` (9 subsequent siblings)
  15 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

	[BZ #5806]
	* string/strchr.c: Use string-fzb.h, string-fzi.h, string-extbyte.h.
---
 string/strchr.c | 164 ++++++++++----------------------------------------------
 1 file changed, 29 insertions(+), 135 deletions(-)

diff --git a/string/strchr.c b/string/strchr.c
index 25c2fe4..3215126 100644
--- a/string/strchr.c
+++ b/string/strchr.c
@@ -22,164 +22,58 @@
 
 #include <string.h>
 #include <stdlib.h>
+#include <stdint.h>
+#include <string-fzb.h>
+#include <string-fzi.h>
+#include <string-extbyte.h>
 
 #undef strchr
+#undef index
 
-#ifndef STRCHR
-# define STRCHR strchr
+#ifdef STRCHR
+# define strchr STRCHR
 #endif
 
 /* Find the first occurrence of C in S.  */
 char *
-STRCHR (const char *s, int c_in)
+strchr (const char *s, int c_in)
 {
   const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
+  const op_t *word_ptr;
+  op_t word, repeated_c, found;
+  uintptr_t i, align;
   unsigned char c;
 
   c = (unsigned char) c_in;
+  char_ptr = (const unsigned char *) s;
 
   /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s;
-       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
+     Do this until CHAR_PTR is aligned on a word boundary.  */
+  align = -(uintptr_t)char_ptr % sizeof(word);
+  for (i = 0; i < align; ++i, ++char_ptr)
     if (*char_ptr == c)
-      return (void *) char_ptr;
+      return (char *) char_ptr;
     else if (*char_ptr == '\0')
       return NULL;
 
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  magic_bits = -1;
-  magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1;
-
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-  if (sizeof (longword) > 4)
-    /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-    charmask |= (charmask << 16) << 16;
-  if (sizeof (longword) > 8)
-    abort ();
+  /* Set up a word, each of whose bytes is C.  */
+  repeated_c = ((op_t)-1 / 0xff) * c;
 
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  for (;;)
+  word_ptr = (const op_t *) char_ptr;
+  do
     {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
-
-	 3) But wait!  Aren't we looking for C as well as zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *longword_ptr++;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0 ||
-
-	  /* That caught zeroes.  Now test for C.  */
-	  ((((longword ^ charmask) + magic_bits) ^ ~(longword ^ charmask))
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C or zero?
-	     If none of them were, it was a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
-
-	  if (*cp == c)
-	    return (char *) cp;
-	  else if (*cp == '\0')
-	    return NULL;
-	  if (*++cp == c)
-	    return (char *) cp;
-	  else if (*cp == '\0')
-	    return NULL;
-	  if (*++cp == c)
-	    return (char *) cp;
-	  else if (*cp == '\0')
-	    return NULL;
-	  if (*++cp == c)
-	    return (char *) cp;
-	  else if (*cp == '\0')
-	    return NULL;
-	  if (sizeof (longword) > 4)
-	    {
-	      if (*++cp == c)
-		return (char *) cp;
-	      else if (*cp == '\0')
-		return NULL;
-	      if (*++cp == c)
-		return (char *) cp;
-	      else if (*cp == '\0')
-		return NULL;
-	      if (*++cp == c)
-		return (char *) cp;
-	      else if (*cp == '\0')
-		return NULL;
-	      if (*++cp == c)
-		return (char *) cp;
-	      else if (*cp == '\0')
-		return NULL;
-	    }
-	}
+      word = *word_ptr++;
     }
+  while (!has_zero_eq (word, repeated_c));
 
-  return NULL;
+  found = index_first_zero_eq (word, repeated_c);
+  if (extractbyte (word, found) == c)
+    return (char *) (word_ptr - 1) + found;
+  else
+    return NULL;
 }
 
-#ifdef weak_alias
-# undef index
+#ifndef STRCHR
 weak_alias (strchr, index)
-#endif
 libc_hidden_builtin_def (strchr)
+#endif
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 02/16] Parameterize OP_T_THRES from memcopy.h
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
                   ` (12 preceding siblings ...)
  2016-12-21 23:06 ` [PATCH v2 05/16] Improve generic memchr Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 01/16] Parameterize op_t " Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 09/16] Improve generic strnlen Richard Henderson
  15 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

	* sysdeps/generic/memcopy.h (OP_T_THRES): Move...
	* sysdeps/generic/string-opthr.h: ... here; new file.
	* sysdeps/i386/memcopy.h (OP_T_THRES): Move...
	* sysdeps/i386/string-opthr.h: ... here; new file.
	* sysdeps/m68k/memcopy.h (OP_T_THRES): Remove.
	* sysdeps/powerpc/powerpc32/power4/memcopy.h (OP_T_THRES): Remove.
---
 sysdeps/generic/memcopy.h                  |  4 +---
 sysdeps/generic/string-opthr.h             | 25 +++++++++++++++++++++++++
 sysdeps/i386/memcopy.h                     |  3 ---
 sysdeps/i386/string-opthr.h                | 25 +++++++++++++++++++++++++
 sysdeps/m68k/memcopy.h                     |  3 ---
 sysdeps/powerpc/powerpc32/power4/memcopy.h |  5 -----
 6 files changed, 51 insertions(+), 14 deletions(-)
 create mode 100644 sysdeps/generic/string-opthr.h
 create mode 100644 sysdeps/i386/string-opthr.h

diff --git a/sysdeps/generic/memcopy.h b/sysdeps/generic/memcopy.h
index 38f27a8..a720e5e 100644
--- a/sysdeps/generic/memcopy.h
+++ b/sysdeps/generic/memcopy.h
@@ -58,6 +58,7 @@
 
 /* Type to use for aligned memory operations.  */
 #include <string-optype.h>
+#include <string-opthr.h>
 
 #define OPSIZ	(sizeof(op_t))
 
@@ -190,9 +191,6 @@ extern void _wordcopy_bwd_dest_aligned (long int, long int, size_t)
 
 #endif
 
-/* Threshold value for when to enter the unrolled loops.  */
-#define	OP_T_THRES	16
-
 /* Set to 1 if memcpy is safe to use for forward-copying memmove with
    overlapping addresses.  This is 0 by default because memcpy implementations
    are generally not safe for overlapping addresses.  */
diff --git a/sysdeps/generic/string-opthr.h b/sysdeps/generic/string-opthr.h
new file mode 100644
index 0000000..f4f4a33
--- /dev/null
+++ b/sysdeps/generic/string-opthr.h
@@ -0,0 +1,25 @@
+/* string-opthr.h -- Define a threshold for word access.  Generic version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef STRING_OPTHR_H
+#define STRING_OPTHR_H 1
+
+/* Threshold value for when to enter the unrolled loops.  */
+#define OP_T_THRES	16
+
+#endif /* string-opthr.h */
diff --git a/sysdeps/i386/memcopy.h b/sysdeps/i386/memcopy.h
index 2fe98fa..f59f8a8 100644
--- a/sysdeps/i386/memcopy.h
+++ b/sysdeps/i386/memcopy.h
@@ -19,9 +19,6 @@
 
 #include <sysdeps/generic/memcopy.h>
 
-#undef	OP_T_THRES
-#define	OP_T_THRES	8
-
 #undef	BYTE_COPY_FWD
 #define BYTE_COPY_FWD(dst_bp, src_bp, nbytes)				      \
   do {									      \
diff --git a/sysdeps/i386/string-opthr.h b/sysdeps/i386/string-opthr.h
new file mode 100644
index 0000000..f92c61b
--- /dev/null
+++ b/sysdeps/i386/string-opthr.h
@@ -0,0 +1,25 @@
+/* string-opthr.h -- Define a threshold for word access.  i386 version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef I386_STRING_OPTHR_H
+#define I386_STRING_OPTHR_H 1
+
+/* Threshold value for when to enter the unrolled loops.  */
+#define OP_T_THRES	8
+
+#endif /* I386_STRING_OPTHR_H */
diff --git a/sysdeps/m68k/memcopy.h b/sysdeps/m68k/memcopy.h
index 33a2064..751efff 100644
--- a/sysdeps/m68k/memcopy.h
+++ b/sysdeps/m68k/memcopy.h
@@ -21,9 +21,6 @@
 
 #if	defined(__mc68020__) || defined(mc68020)
 
-#undef	OP_T_THRES
-#define	OP_T_THRES	16
-
 /* WORD_COPY_FWD and WORD_COPY_BWD are not symmetric on the 68020,
    because of its weird instruction overlap characteristics.  */
 
diff --git a/sysdeps/powerpc/powerpc32/power4/memcopy.h b/sysdeps/powerpc/powerpc32/power4/memcopy.h
index a06a8c9..9cff24e 100644
--- a/sysdeps/powerpc/powerpc32/power4/memcopy.h
+++ b/sysdeps/powerpc/powerpc32/power4/memcopy.h
@@ -51,11 +51,6 @@
      [I fail to understand.  I feel stupid.  --roland]
 */
 
-
-/* Threshold value for when to enter the unrolled loops.  */
-#undef	OP_T_THRES
-#define OP_T_THRES 16
-
 /* Copy exactly NBYTES bytes from SRC_BP to DST_BP,
    without any assumptions about alignment of the pointers.  */
 #undef BYTE_COPY_FWD
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 13/16] alpha: Add string-fzb.h and string-fzi.h
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
                   ` (5 preceding siblings ...)
  2016-12-21 23:06 ` [PATCH v2 04/16] Improve generic strchr Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 10/16] Improve generic strcmp Richard Henderson
                   ` (8 subsequent siblings)
  15 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

While alpha has the more important string functions in assembly,
there are still a few for find the generic routines are used.

Use the CMPBGE insn, via the builtin, for testing of zeros.  Use a
simplified expansion of __builtin_ctz when the insn isn't available.

	* sysdeps/alpha/string-fza.h: New file.
	* sysdeps/alpha/string-fzb.h: New file.
	* sysdeps/alpha/string-fzi.h: New file.
---
 sysdeps/alpha/string-fza.h |   1 +
 sysdeps/alpha/string-fzb.h |  51 ++++++++++++++++++++++
 sysdeps/alpha/string-fzi.h | 106 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 158 insertions(+)
 create mode 100644 sysdeps/alpha/string-fza.h
 create mode 100644 sysdeps/alpha/string-fzb.h
 create mode 100644 sysdeps/alpha/string-fzi.h

diff --git a/sysdeps/alpha/string-fza.h b/sysdeps/alpha/string-fza.h
new file mode 100644
index 0000000..3eb0944
--- /dev/null
+++ b/sysdeps/alpha/string-fza.h
@@ -0,0 +1 @@
+#error "string-fza.h is unused on Alpha"
diff --git a/sysdeps/alpha/string-fzb.h b/sysdeps/alpha/string-fzb.h
new file mode 100644
index 0000000..13d6c9f
--- /dev/null
+++ b/sysdeps/alpha/string-fzb.h
@@ -0,0 +1,51 @@
+/* string-fzb.h -- zero byte detection; boolean.  Alpha version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef STRING_FZB_H
+#define STRING_FZB_H 1
+
+#include <string-optype.h>
+
+/* Note that since CMPBGE creates a bit mask rather than a byte mask,
+   we cannot simply provide a target-specific string-fza.h.  */
+
+/* Determine if any byte within X is zero.  This is a pure boolean test.  */
+
+static inline _Bool
+has_zero (op_t x)
+{
+  return __builtin_alpha_cmpbge (0, x) != 0;
+}
+
+/* Likewise, but for byte equality between X1 and X2.  */
+
+static inline _Bool
+has_eq (op_t x1, op_t x2)
+{
+  return has_zero (x1 ^ x2);
+}
+
+/* Likewise, but for zeros in X1 and equal bytes between X1 and X2.  */
+
+static inline _Bool
+has_zero_eq (op_t x1, op_t x2)
+{
+  return has_zero (x1) | has_eq (x1, x2);
+}
+
+#endif /* STRING_FZB_H */
diff --git a/sysdeps/alpha/string-fzi.h b/sysdeps/alpha/string-fzi.h
new file mode 100644
index 0000000..f351033
--- /dev/null
+++ b/sysdeps/alpha/string-fzi.h
@@ -0,0 +1,106 @@
+/* string-fzi.h -- zero byte detection; indices.  Alpha version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef STRING_FZI_H
+#define STRING_FZI_H
+
+#include <string-optype.h>
+
+/* Note that since CMPBGE creates a bit mask rather than a byte mask,
+   we cannot simply provide a target-specific string-fza.h.  */
+
+/* A subroutine for the index_zero functions.  Given a bitmask C,
+   return the index of the first bit set in memory order.  */
+
+static inline unsigned int
+index_first_ (unsigned long int c)
+{
+#ifdef __alpha_cix__
+  return __builtin_ctzl (c);
+#else
+  c = c & -c;
+  return (c & 0xf0 ? 4 : 0) + (c & 0xcc ? 2 : 0) + (c & 0xaa ? 1 : 0);
+#endif
+}
+
+/* Similarly, but return the (memory order) index of the last bit
+   that is non-zero.  Note that only the least 8 bits may be nonzero.  */
+
+static inline unsigned int
+index_last_ (unsigned long int x)
+{
+#ifdef __alpha_cix__
+  return __builtin_clzl (x) ^ 63;
+#else
+  unsigned r = 0;
+  if (x & 0xf0)
+    r += 4;
+  if (x & (0xc << r))
+    r += 2;
+  if (x & (0x2 << r))
+    r += 1;
+  return r;
+#endif
+}
+
+/* Given a word X that is known to contain a zero byte, return the
+   index of the first such within the word in memory order.  */
+
+static inline unsigned int
+index_first_zero (op_t x)
+{
+  return index_first_ (__builtin_alpha_cmpbge (0, x));
+}
+
+/* Similarly, but perform the test for byte equality between X1 and X2.  */
+
+static inline unsigned int
+index_first_eq (op_t x1, op_t x2)
+{
+  return index_first_zero (x1 ^ x2);
+}
+
+/* Similarly, but perform the search for zero within X1 or
+   equality between X1 and X2.  */
+
+static inline unsigned int
+index_first_zero_eq (op_t x1, op_t x2)
+{
+  return index_first_ (__builtin_alpha_cmpbge (0, x1)
+		       | __builtin_alpha_cmpbge (0, x1 ^ x2));
+}
+
+/* Similarly, but perform the search for zero within X1 or
+   inequality between X1 and X2.  */
+
+static inline unsigned int
+index_first_zero_ne (op_t x1, op_t x2)
+{
+  return index_first_ (__builtin_alpha_cmpbge (0, x1)
+		       | (__builtin_alpha_cmpbge (0, x1 ^ x2) ^ 0xFF));
+}
+
+/* Similarly, but search for the last zero within X.  */
+
+static inline unsigned int
+index_last_zero (op_t x)
+{
+  return index_last_ (__builtin_alpha_cmpbge (0, x));
+}
+
+#endif /* STRING_FZI_H */
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 01/16] Parameterize op_t from memcopy.h
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
                   ` (13 preceding siblings ...)
  2016-12-21 23:06 ` [PATCH v2 02/16] Parameterize OP_T_THRES from memcopy.h Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 09/16] Improve generic strnlen Richard Henderson
  15 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

	* sysdeps/generic/string-optype.h: New file.
	* sysdeps/generic/memcopy.h: Include it.
---
 sysdeps/generic/memcopy.h       |  7 +++----
 sysdeps/generic/string-optype.h | 31 +++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/generic/string-optype.h

diff --git a/sysdeps/generic/memcopy.h b/sysdeps/generic/memcopy.h
index fe9aa60..38f27a8 100644
--- a/sysdeps/generic/memcopy.h
+++ b/sysdeps/generic/memcopy.h
@@ -56,10 +56,9 @@
      [I fail to understand.  I feel stupid.  --roland]
 */
 
-/* Type to use for aligned memory operations.
-   This should normally be the biggest type supported by a single load
-   and store.  */
-#define	op_t	unsigned long int
+/* Type to use for aligned memory operations.  */
+#include <string-optype.h>
+
 #define OPSIZ	(sizeof(op_t))
 
 /* Type to use for unaligned operations.  */
diff --git a/sysdeps/generic/string-optype.h b/sysdeps/generic/string-optype.h
new file mode 100644
index 0000000..63e5b09
--- /dev/null
+++ b/sysdeps/generic/string-optype.h
@@ -0,0 +1,31 @@
+/* string-optype.h -- Define a type to use for word access.  Generic version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef STRING_OPTYPE_H
+#define STRING_OPTYPE_H 1
+
+/* Use the existing parameterization from gmp as a default.  */
+#include <gmp-mparam.h>
+
+#ifdef _LONG_LONG_LIMB
+typedef unsigned long long int  op_t;
+#else
+typedef unsigned long int       op_t;
+#endif
+
+#endif /* string-optype.h */
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 08/16] Improve generic memrchr
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
                   ` (3 preceding siblings ...)
  2016-12-21 23:06 ` [PATCH v2 16/16] powerpc: Add string-fza.h Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 04/16] Improve generic strchr Richard Henderson
                   ` (10 subsequent siblings)
  15 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

	[BZ #5806]
	* string/memrchr.c: Use string-fzb.h, string-fzi.h.
---
 string/memrchr.c | 205 +++++++++++++++----------------------------------------
 1 file changed, 55 insertions(+), 150 deletions(-)

diff --git a/string/memrchr.c b/string/memrchr.c
index b9b0c9e..ed9ceaf 100644
--- a/string/memrchr.c
+++ b/string/memrchr.c
@@ -21,180 +21,85 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <stdlib.h>
-
-#ifdef HAVE_CONFIG_H
-# include <config.h>
-#endif
-
-#undef __ptr_t
-#define __ptr_t void *
-
-#if defined _LIBC
-# include <string.h>
-# include <memcopy.h>
-#endif
-
-#if defined HAVE_LIMITS_H || defined _LIBC
-# include <limits.h>
-#endif
-
-#define LONG_MAX_32_BITS 2147483647
-
-#ifndef LONG_MAX
-# define LONG_MAX LONG_MAX_32_BITS
-#endif
-
-#include <sys/types.h>
+#include <string.h>
+#include <stdint.h>
+#include <limits.h>
+#include <string-fzb.h>
+#include <string-fzi.h>
+#include <string-opthr.h>
 
 #undef __memrchr
 #undef memrchr
 
-#ifndef weak_alias
-# define __memrchr memrchr
-#endif
-
 /* Search no more than N bytes of S for C.  */
-__ptr_t
-#ifndef MEMRCHR
-__memrchr
-#else
-MEMRCHR
+
+#ifdef MEMRCHR
+# define __memrchr MEMRCHR
 #endif
-     (const __ptr_t s, int c_in, size_t n)
+
+void *
+__memrchr (const void *s, int c_in, size_t n)
 {
-  const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
+  const unsigned char *char_ptr = (const unsigned char *) s + n;
+  const op_t *word_ptr;
+  op_t word, repeated_c;
+  uintptr_t i, align;
   unsigned char c;
 
   c = (unsigned char) c_in;
 
   /* Handle the last few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s + n;
-       n > 0 && ((unsigned long int) char_ptr
-		 & (sizeof (longword) - 1)) != 0;
-       --n)
+     Do this until CHAR_PTR is aligned on a word boundary, or
+     the entirety of small inputs.  */
+  align = (uintptr_t)char_ptr % sizeof (word);
+  if (n < OP_T_THRES || align > n)
+    align = n;
+  for (i = 0; i < align; ++i)
     if (*--char_ptr == c)
-      return (__ptr_t) char_ptr;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (const unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
+      return (void *) char_ptr;
 
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
+  /* Set up a word, each of whose bytes is C.  */
+  repeated_c = ((op_t)-1 / 0xff) * c;
 
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  magic_bits = -1;
-  magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1;
+  word_ptr = (const op_t *) char_ptr;
+  n -= align;
+  if (__glibc_unlikely (n == 0))
+    return NULL;
 
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-#if LONG_MAX > LONG_MAX_32_BITS
-  charmask |= charmask << 32;
-#endif
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  while (n >= sizeof (longword))
+  /* Loop while we have at least one word remaining.  */
+  while (n >= sizeof (word))
     {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
-
-	 3) But wait!  Aren't we looking for C, not zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *--longword_ptr ^ charmask;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0)
+      word = *--word_ptr;
+      if (has_eq (word, repeated_c))
 	{
-	  /* Which of the bytes was C?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) longword_ptr;
-
-#if LONG_MAX > 2147483647
-	  if (cp[7] == c)
-	    return (__ptr_t) &cp[7];
-	  if (cp[6] == c)
-	    return (__ptr_t) &cp[6];
-	  if (cp[5] == c)
-	    return (__ptr_t) &cp[5];
-	  if (cp[4] == c)
-	    return (__ptr_t) &cp[4];
-#endif
-	  if (cp[3] == c)
-	    return (__ptr_t) &cp[3];
-	  if (cp[2] == c)
-	    return (__ptr_t) &cp[2];
-	  if (cp[1] == c)
-	    return (__ptr_t) &cp[1];
-	  if (cp[0] == c)
-	    return (__ptr_t) cp;
+	  word ^= repeated_c;
+	  goto found;
 	}
-
-      n -= sizeof (longword);
+      n -= sizeof (word);
     }
 
-  char_ptr = (const unsigned char *) longword_ptr;
-
-  while (n-- > 0)
+  /* Handle any remaining portion of a word.  */
+  if (n > 0)
     {
-      if (*--char_ptr == c)
-	return (__ptr_t) char_ptr;
-    }
+      word = *--word_ptr ^ repeated_c;
 
-  return 0;
+      /* Mask out the garbage bytes at the front of WORD.  */
+      op_t m = -1;
+      if (__BYTE_ORDER == __LITTLE_ENDIAN)
+	m >>= n * CHAR_BIT;
+      else
+	m <<= n * CHAR_BIT;
+      word |= m;
+
+      if (has_zero (word))
+	{
+	found:
+	  return (char *) word_ptr + index_last_zero (word);
+	}
+    }
+  return NULL;
 }
+
 #ifndef MEMRCHR
-# ifdef weak_alias
 weak_alias (__memrchr, memrchr)
-# endif
 #endif
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 12/16] hppa: Add string-fzb.h and string-fzi.h
  2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
                   ` (9 preceding siblings ...)
  2016-12-21 23:06 ` [PATCH v2 11/16] hppa: Add memcopy.h Richard Henderson
@ 2016-12-21 23:06 ` Richard Henderson
  2016-12-21 23:06 ` [PATCH v2 03/16] Improve generic strlen Richard Henderson
                   ` (4 subsequent siblings)
  15 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-21 23:06 UTC (permalink / raw)
  To: libc-alpha

Use UXOR,SBZ to test for a zero byte within a word.  While we can
get semi-decent code out of asm-goto, we would do slightly better
with a compiler builtin.

For index_zero et al, sequential testing of bytes is less expensive than
any tricks that involve a count-leading-zeros insn that we don't have.

	* sysdeps/hppa/string-fza.h: New file.
	* sysdeps/hppa/string-fzb.h: New file.
	* sysdeps/hppa/string-fzi.h: New file.
---
 sysdeps/hppa/string-fza.h |   1 +
 sysdeps/hppa/string-fzb.h |  69 +++++++++++++++++++++++++
 sysdeps/hppa/string-fzi.h | 129 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 199 insertions(+)
 create mode 100644 sysdeps/hppa/string-fza.h
 create mode 100644 sysdeps/hppa/string-fzb.h
 create mode 100644 sysdeps/hppa/string-fzi.h

diff --git a/sysdeps/hppa/string-fza.h b/sysdeps/hppa/string-fza.h
new file mode 100644
index 0000000..92cfad7
--- /dev/null
+++ b/sysdeps/hppa/string-fza.h
@@ -0,0 +1 @@
+#error "string-fza.h is unused on HPPA"
diff --git a/sysdeps/hppa/string-fzb.h b/sysdeps/hppa/string-fzb.h
new file mode 100644
index 0000000..97f1b64
--- /dev/null
+++ b/sysdeps/hppa/string-fzb.h
@@ -0,0 +1,69 @@
+/* string-fzb.h -- zero byte detection, boolean.  HPPA version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef STRING_FZB_H
+#define STRING_FZB_H 1
+
+#include <string-optype.h>
+
+/* Determine if any byte within X is zero.  This is a pure boolean test.  */
+
+static inline _Bool
+has_zero (op_t x)
+{
+  _Static_assert (sizeof (op_t) == 4, "64-bit not supported");
+
+  /* It's more useful to expose a control transfer to the compiler
+     than to expose a proper boolean result.  */
+  asm goto ("uxor,sbz %%r0,%0,%%r0\n\t"
+	    "b,n %l1" : : "r"(x) : : nbz);
+  return 1;
+ nbz:
+  return 0;
+}
+
+/* Likewise, but for byte equality between X1 and X2.  */
+
+static inline _Bool
+has_eq (op_t x1, op_t x2)
+{
+  _Static_assert (sizeof (op_t) == 4, "64-bit not supported");
+
+  asm goto ("uxor,sbz %0,%1,%%r0\n\t"
+	    "b,n %l2" : : "r"(x1), "r"(x2) : : nbz);
+  return 1;
+ nbz:
+  return 0;
+}
+
+/* Likewise, but for zeros in X1 and equal bytes between X1 and X2.  */
+
+static inline _Bool
+has_zero_eq (op_t x1, op_t x2)
+{
+  _Static_assert (sizeof (op_t) == 4, "64-bit not supported");
+
+  asm goto ("uxor,sbz %%r0,%0,%%r0\n\t"
+	    "uxor,nbz %0,%1,%%r0\n\t"
+	    "b,n %l2" : : "r"(x1), "r"(x2) : : sbz);
+  return 0;
+ sbz:
+  return 1;
+}
+
+#endif /* STRING_HASZERO_H */
diff --git a/sysdeps/hppa/string-fzi.h b/sysdeps/hppa/string-fzi.h
new file mode 100644
index 0000000..ae310d9
--- /dev/null
+++ b/sysdeps/hppa/string-fzi.h
@@ -0,0 +1,129 @@
+/* string-fzi.h -- zero byte detection; indexes.  HPPA version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef STRING_FZI_H
+#define STRING_FZI_H 1
+
+#include <string-optype.h>
+
+/* Given a word X that is known to contain a zero byte, return the
+   index of the first such within the long in memory order.  */
+
+static inline unsigned int
+index_first_zero (op_t x)
+{
+  unsigned int ret;
+
+  _Static_assert (sizeof (op_t) == 4, "64-bit not supported");
+
+  /* Since we have no clz insn, direct tests of the bytes is faster
+     than loading up the constants to do the masking.  */
+  asm ("extrw,u,<> %1,23,8,%%r0\n\t"
+       "ldi 2,%0\n\t"
+       "extrw,u,<> %1,15,8,%%r0\n\t"
+       "ldi 1,%0\n\t"
+       "extrw,u,<> %1,7,8,%%r0\n\t"
+       "ldi 0,%0"
+       : "=r"(ret) : "r"(x), "0"(3));
+
+  return ret;
+}
+
+/* Similarly, but perform the search for byte equality between X1 and X2.  */
+
+static inline unsigned int
+index_first_eq (op_t x1, op_t x2)
+{
+  return index_first_zero (x1 ^ x2);
+}
+
+/* Similarly, but perform the search for zero within X1 or
+   equality between X1 and X2.  */
+
+static inline unsigned int
+index_first_zero_eq (op_t x1, op_t x2)
+{
+  unsigned int ret;
+
+  _Static_assert (sizeof (op_t) == 4, "64-bit not supported");
+
+  /* Since we have no clz insn, direct tests of the bytes is faster
+     than loading up the constants to do the masking.  */
+  asm ("extrw,u,= %1,23,8,%%r0\n\t"
+       "extrw,u,<> %2,23,8,%%r0\n\t"
+       "ldi 2,%0\n\t"
+       "extrw,u,= %1,15,8,%%r0\n\t"
+       "extrw,u,<> %2,15,8,%%r0\n\t"
+       "ldi 1,%0\n\t"
+       "extrw,u,= %1,7,8,%%r0\n\t"
+       "extrw,u,<> %2,7,8,%%r0\n\t"
+       "ldi 0,%0"
+       : "=r"(ret) : "r"(x1), "r"(x1 ^ x2), "0"(3));
+
+  return ret;
+}
+
+/* Similarly, but perform the search for zero within X1 or 
+   inequality between X1 and X2. */
+
+static inline unsigned int
+index_first_zero_ne (op_t x1, op_t x2)
+{
+  unsigned int ret;
+
+  _Static_assert (sizeof (op_t) == 4, "64-bit not supported");
+
+  /* Since we have no clz insn, direct tests of the bytes is faster
+     than loading up the constants to do the masking.  */
+  asm ("extrw,u,<> %2,23,8,%%r0\n\t"
+       "extrw,u,<> %1,23,8,%%r0\n\t"
+       "ldi 2,%0\n\t"
+       "extrw,u,<> %2,15,8,%%r0\n\t"
+       "extrw,u,<> %1,15,8,%%r0\n\t"
+       "ldi 1,%0\n\t"
+       "extrw,u,<> %2,7,8,%%r0\n\t"
+       "extrw,u,<> %1,7,8,%%r0\n\t"
+       "ldi 0,%0"
+       : "=r"(ret) : "r"(x1), "r"(x1 ^ x2), "0"(3));
+
+  return ret;
+}
+
+/* Similarly, but search for the last zero within X.  */
+
+static inline unsigned int
+index_last_zero (op_t x)
+{
+  unsigned int ret;
+
+  _Static_assert (sizeof (op_t) == 4, "64-bit not supported");
+
+  /* Since we have no ctz insn, direct tests of the bytes is faster
+     than loading up the constants to do the masking.  */
+  asm ("extrw,u,<> %1,15,8,%%r0\n\t"
+       "ldi 1,%0\n\t"
+       "extrw,u,<> %1,23,8,%%r0\n\t"
+       "ldi 2,%0\n\t"
+       "extrw,u,<> %1,31,8,%%r0\n\t"
+       "ldi 3,%0"
+       : "=r"(ret) : "r"(x), "0"(0));
+
+  return ret;
+}
+
+#endif /* STRING_FZI_H */
-- 
2.9.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 15/16] arm: Define __memchr
  2016-12-21 23:06 ` [PATCH v2 15/16] arm: Define __memchr Richard Henderson
@ 2016-12-22  0:25   ` Joseph Myers
  2016-12-23 18:55     ` Richard Henderson
  0 siblings, 1 reply; 21+ messages in thread
From: Joseph Myers @ 2016-12-22  0:25 UTC (permalink / raw)
  To: Richard Henderson; +Cc: libc-alpha

On Wed, 21 Dec 2016, Richard Henderson wrote:

> All other memchr implementations define it.

But since memchr is a C90 function (so there are no external-linkage 
namespace issues), and not used in any macros defined in installed headers 
(so no block-scope namespace issues), is there any actual use for 
__memchr, or should all the implementations just define memchr and not 
__memchr?

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 05/16] Improve generic memchr
  2016-12-21 23:06 ` [PATCH v2 05/16] Improve generic memchr Richard Henderson
@ 2016-12-23 18:33   ` Tulio Magno Quites Machado Filho
  0 siblings, 0 replies; 21+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2016-12-23 18:33 UTC (permalink / raw)
  To: Richard Henderson, libc-alpha

Richard Henderson <rth@twiddle.net> writes:

> -#ifdef weak_alias
> +
> +#ifndef MEMCHR
>  weak_alias (__memchr, memchr)
> -#endif
>  libc_hidden_builtin_def (memchr)
> +#endif

If you make this change, you also have to make the following change.
LGTM with it.

diff --git a/sysdeps/powerpc/powerpc32/power4/multiarch/memchr-ppc32.c b/sysdeps/powerpc/powerpc32/power4/multiarch/memchr-ppc32.c
index 9861337..6faf996 100644
--- a/sysdeps/powerpc/powerpc32/power4/multiarch/memchr-ppc32.c
+++ b/sysdeps/powerpc/powerpc32/power4/multiarch/memchr-ppc32.c
@@ -23,12 +23,10 @@
 #undef weak_alias
 #define weak_alias(a, b)
 
-#ifdef SHARED
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
-  __hidden_ver1(__memchr_ppc, __GI_memchr, __memchr_ppc);
-#endif
-
 extern __typeof (memchr) __memchr_ppc attribute_hidden;
 
 #include <string/memchr.c>
+
+#ifdef SHARED
+__hidden_ver1(__memchr_ppc, __GI_memchr, __memchr_ppc);
+#endif

-- 
Tulio Magno

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 16/16] powerpc: Add string-fza.h
  2016-12-21 23:06 ` [PATCH v2 16/16] powerpc: Add string-fza.h Richard Henderson
@ 2016-12-23 18:44   ` Tulio Magno Quites Machado Filho
  0 siblings, 0 replies; 21+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2016-12-23 18:44 UTC (permalink / raw)
  To: Richard Henderson, libc-alpha

Richard Henderson <rth@twiddle.net> writes:

> [ text/plain ]
> While ppc has the more important string functions in assembly,
> there are still a few generic routines used.
>
> Use the Power 6 CMPB insn for testing of zeros.
>
> 	* sysdeps/powerpc/power6/string-fza.h: New file.
> 	* sysdeps/powerpc/powerpc32/power6/string-fza.h: New file.
> 	* sysdeps/powerpc/powerpc64/power6/string-fza.h: New file.

This patch LGTM.

-- 
Tulio Magno

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 15/16] arm: Define __memchr
  2016-12-22  0:25   ` Joseph Myers
@ 2016-12-23 18:55     ` Richard Henderson
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-12-23 18:55 UTC (permalink / raw)
  To: Joseph Myers; +Cc: libc-alpha

On 12/21/2016 04:25 PM, Joseph Myers wrote:
> On Wed, 21 Dec 2016, Richard Henderson wrote:
> 
>> All other memchr implementations define it.
> 
> But since memchr is a C90 function (so there are no external-linkage 
> namespace issues), and not used in any macros defined in installed headers 
> (so no block-scope namespace issues), is there any actual use for 
> __memchr, or should all the implementations just define memchr and not 
> __memchr?
> 

It's an excellent question.  As best I can tell, __memchr is no longer used (or
was never used -- it's hard to tell) at all.

Probably it should be purged from all of the implementations.


r~

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2016-12-23 18:55 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-21 23:06 [PATCH v2 00/16] Improve generic string routines Richard Henderson
2016-12-21 23:06 ` [PATCH v2 14/16] arm: Add string-fza.h Richard Henderson
2016-12-21 23:06 ` [PATCH v2 15/16] arm: Define __memchr Richard Henderson
2016-12-22  0:25   ` Joseph Myers
2016-12-23 18:55     ` Richard Henderson
2016-12-21 23:06 ` [PATCH v2 06/16] Improve generic strchrnul Richard Henderson
2016-12-21 23:06 ` [PATCH v2 16/16] powerpc: Add string-fza.h Richard Henderson
2016-12-23 18:44   ` Tulio Magno Quites Machado Filho
2016-12-21 23:06 ` [PATCH v2 08/16] Improve generic memrchr Richard Henderson
2016-12-21 23:06 ` [PATCH v2 04/16] Improve generic strchr Richard Henderson
2016-12-21 23:06 ` [PATCH v2 13/16] alpha: Add string-fzb.h and string-fzi.h Richard Henderson
2016-12-21 23:06 ` [PATCH v2 10/16] Improve generic strcmp Richard Henderson
2016-12-21 23:06 ` [PATCH v2 07/16] Improve generic strrchr Richard Henderson
2016-12-21 23:06 ` [PATCH v2 11/16] hppa: Add memcopy.h Richard Henderson
2016-12-21 23:06 ` [PATCH v2 12/16] hppa: Add string-fzb.h and string-fzi.h Richard Henderson
2016-12-21 23:06 ` [PATCH v2 03/16] Improve generic strlen Richard Henderson
2016-12-21 23:06 ` [PATCH v2 05/16] Improve generic memchr Richard Henderson
2016-12-23 18:33   ` Tulio Magno Quites Machado Filho
2016-12-21 23:06 ` [PATCH v2 02/16] Parameterize OP_T_THRES from memcopy.h Richard Henderson
2016-12-21 23:06 ` [PATCH v2 01/16] Parameterize op_t " Richard Henderson
2016-12-21 23:06 ` [PATCH v2 09/16] Improve generic strnlen Richard Henderson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).