public inbox for glibc-cvs@sourceware.org
help / color / mirror / Atom feed
From: Noah Goldstein <nwg@sourceware.org>
To: glibc-cvs@sourceware.org
Subject: [glibc] x86: Optimize strspn in strspn-c.c
Date: Fri, 25 Mar 2022 18:18:16 +0000 (GMT)	[thread overview]
Message-ID: <20220325181816.8CB9A389851C@sourceware.org> (raw)

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=412d10343168b05b8cf6c3683457cf9711d28046

commit 412d10343168b05b8cf6c3683457cf9711d28046
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Wed Mar 23 16:57:24 2022 -0500

    x86: Optimize strspn in strspn-c.c
    
    Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
    _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
    sign extensions.
    
    geometric_mean(N=20) of all benchmarks that dont fallback on
    sse2; New / Original: .901
    
    All string/memory tests pass.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Diff:
---
 sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 47 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index 8fb3aba64d..6124033ceb 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -62,81 +62,73 @@ __strspn_sse42 (const char *s, const char *a)
     return 0;
 
   const char *aligned;
-  __m128i mask;
-  int offset = (int) ((size_t) a & 15);
+  __m128i mask, maskz, zero;
+  unsigned int maskz_bits;
+  unsigned int offset = (int) ((size_t) a & 15);
+  zero = _mm_set1_epi8 (0);
   if (offset != 0)
     {
       /* Load masks.  */
       aligned = (const char *) ((size_t) a & -16L);
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
-      mask = __m128i_shift_right (mask0, offset);
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
 
       /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16 - offset)
-	{
-	  /* There is no NULL terminator.  */
-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
-	  length += index;
-
-	  /* Don't use SSE4.2 if the length of A > 16.  */
-	  if (length > 16)
-	    return __strspn_sse2 (s, a);
-
-	  if (index != 0)
-	    {
-	      /* Combine mask0 and mask1.  We could play games with
-		 palignr, but frankly this data should be in L1 now
-		 so do the merge via an unaligned load.  */
-	      mask = _mm_loadu_si128 ((__m128i *) a);
-	    }
-	}
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+        {
+          mask = __m128i_shift_right (mask0, offset);
+          offset = (unsigned int) ((size_t) s & 15);
+          if (offset)
+            goto start_unaligned;
+
+          aligned = s;
+          goto start_loop;
+        }
     }
-  else
-    {
-      /* A is aligned.  */
-      mask = _mm_load_si128 ((__m128i *) a);
 
-      /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16)
-	{
-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
-	     of A > 16.  */
-	  if (a[16] != 0)
-	    return __strspn_sse2 (s, a);
-	}
+  /* A is aligned.  */
+  mask = _mm_loadu_si128 ((__m128i *) a);
+
+  /* Find where the NULL terminator is.  */
+  maskz = _mm_cmpeq_epi8 (mask, zero);
+  maskz_bits = _mm_movemask_epi8 (maskz);
+  if (maskz_bits == 0)
+    {
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
+         of A > 16.  */
+      if (a[16] != 0)
+        return __strspn_sse2 (s, a);
     }
+  aligned = s;
+  offset = (unsigned int) ((size_t) s & 15);
 
-  offset = (int) ((size_t) s & 15);
   if (offset != 0)
     {
+    start_unaligned:
       /* Check partial string.  */
       aligned = (const char *) ((size_t) s & -16L);
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
+      __m128i adj_value = __m128i_shift_right (value, offset);
 
-      value = __m128i_shift_right (value, offset);
-
-      int length = _mm_cmpistri (mask, value, 0x12);
+      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
       /* No need to check CFlag since it is always 1.  */
       if (length < 16 - offset)
 	return length;
       /* Find where the NULL terminator is.  */
-      int index = _mm_cmpistri (value, value, 0x3a);
-      if (index < 16 - offset)
+      maskz = _mm_cmpeq_epi8 (value, zero);
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
 	return length;
       aligned += 16;
     }
-  else
-    aligned = s;
 
+start_loop:
   while (1)
     {
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
-      int index = _mm_cmpistri (mask, value, 0x12);
-      int cflag = _mm_cmpistrc (mask, value, 0x12);
+      unsigned int index = _mm_cmpistri (mask, value, 0x12);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
       if (cflag)
 	return (size_t) (aligned + index - s);
       aligned += 16;


                 reply	other threads:[~2022-03-25 18:18 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220325181816.8CB9A389851C@sourceware.org \
    --to=nwg@sourceware.org \
    --cc=glibc-cvs@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).