From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7852) id CB69F385742A; Tue, 17 May 2022 03:20:15 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org CB69F385742A Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Sunil Pandey To: glibc-cvs@sourceware.org Subject: [glibc/release/2.34/master] x86: Optimize strspn in strspn-c.c X-Act-Checkin: glibc X-Git-Author: Noah Goldstein X-Git-Refname: refs/heads/release/2.34/master X-Git-Oldrev: 0ae1006967eef11909fbed0f6ecef2f260b133d3 X-Git-Newrev: 0a2da0111037b1cc214f8f40ca5bdebf36f35cbd Message-Id: <20220517032015.CB69F385742A@sourceware.org> Date: Tue, 17 May 2022 03:20:15 +0000 (GMT) X-BeenThere: glibc-cvs@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Glibc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 17 May 2022 03:20:15 -0000 https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=0a2da0111037b1cc214f8f40ca5bdebf36f35cbd commit 0a2da0111037b1cc214f8f40ca5bdebf36f35cbd Author: Noah Goldstein Date: Wed Mar 23 16:57:24 2022 -0500 x86: Optimize strspn in strspn-c.c Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of _mm_cmpistri. Also change offset to unsigned to avoid unnecessary sign extensions. geometric_mean(N=20) of all benchmarks that dont fallback on sse2; New / Original: .901 All string/memory tests pass. Reviewed-by: H.J. Lu (cherry picked from commit 412d10343168b05b8cf6c3683457cf9711d28046) Diff: --- sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 47 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c index a17196296b..3bcc479f1b 100644 --- a/sysdeps/x86_64/multiarch/strspn-c.c +++ b/sysdeps/x86_64/multiarch/strspn-c.c @@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a) return 0; const char *aligned; - __m128i mask; - int offset = (int) ((size_t) a & 15); + __m128i mask, maskz, zero; + unsigned int maskz_bits; + unsigned int offset = (int) ((size_t) a & 15); + zero = _mm_set1_epi8 (0); if (offset != 0) { /* Load masks. */ aligned = (const char *) ((size_t) a & -16L); __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - - mask = __m128i_shift_right (mask0, offset); + maskz = _mm_cmpeq_epi8 (mask0, zero); /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16 - offset) - { - /* There is no NULL terminator. */ - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); - int index = _mm_cmpistri (mask1, mask1, 0x3a); - length += index; - - /* Don't use SSE4.2 if the length of A > 16. */ - if (length > 16) - return __strspn_sse2 (s, a); - - if (index != 0) - { - /* Combine mask0 and mask1. We could play games with - palignr, but frankly this data should be in L1 now - so do the merge via an unaligned load. */ - mask = _mm_loadu_si128 ((__m128i *) a); - } - } + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; + if (maskz_bits != 0) + { + mask = __m128i_shift_right (mask0, offset); + offset = (unsigned int) ((size_t) s & 15); + if (offset) + goto start_unaligned; + + aligned = s; + goto start_loop; + } } - else - { - /* A is aligned. */ - mask = _mm_load_si128 ((__m128i *) a); - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16) - { - /* There is no NULL terminator. Don't use SSE4.2 if the length - of A > 16. */ - if (a[16] != 0) - return __strspn_sse2 (s, a); - } + /* A is aligned. */ + mask = _mm_loadu_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + maskz = _mm_cmpeq_epi8 (mask, zero); + maskz_bits = _mm_movemask_epi8 (maskz); + if (maskz_bits == 0) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return __strspn_sse2 (s, a); } + aligned = s; + offset = (unsigned int) ((size_t) s & 15); - offset = (int) ((size_t) s & 15); if (offset != 0) { + start_unaligned: /* Check partial string. */ aligned = (const char *) ((size_t) s & -16L); __m128i value = _mm_load_si128 ((__m128i *) aligned); + __m128i adj_value = __m128i_shift_right (value, offset); - value = __m128i_shift_right (value, offset); - - int length = _mm_cmpistri (mask, value, 0x12); + unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); /* No need to check CFlag since it is always 1. */ if (length < 16 - offset) return length; /* Find where the NULL terminator is. */ - int index = _mm_cmpistri (value, value, 0x3a); - if (index < 16 - offset) + maskz = _mm_cmpeq_epi8 (value, zero); + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; + if (maskz_bits != 0) return length; aligned += 16; } - else - aligned = s; +start_loop: while (1) { __m128i value = _mm_load_si128 ((__m128i *) aligned); - int index = _mm_cmpistri (mask, value, 0x12); - int cflag = _mm_cmpistrc (mask, value, 0x12); + unsigned int index = _mm_cmpistri (mask, value, 0x12); + unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); if (cflag) return (size_t) (aligned + index - s); aligned += 16;