From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7844) id 71854389851C; Fri, 25 Mar 2022 18:18:11 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 71854389851C Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Noah Goldstein To: glibc-cvs@sourceware.org Subject: [glibc] x86: Optimize strcspn and strpbrk in strcspn-c.c X-Act-Checkin: glibc X-Git-Author: Noah Goldstein X-Git-Refname: refs/heads/master X-Git-Oldrev: dc18cd6c818944fafbeae9ba1b50bd8d0d070a7c X-Git-Newrev: 30d627d477d7255345a4b713cf352ac32d644d61 Message-Id: <20220325181811.71854389851C@sourceware.org> Date: Fri, 25 Mar 2022 18:18:11 +0000 (GMT) X-BeenThere: glibc-cvs@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Glibc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 25 Mar 2022 18:18:11 -0000 https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=30d627d477d7255345a4b713cf352ac32d644d61 commit 30d627d477d7255345a4b713cf352ac32d644d61 Author: Noah Goldstein Date: Wed Mar 23 16:57:22 2022 -0500 x86: Optimize strcspn and strpbrk in strcspn-c.c Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of _mm_cmpistri. Also change offset to unsigned to avoid unnecessary sign extensions. geometric_mean(N=20) of all benchmarks that dont fallback on sse2/strlen; New / Original: .928 All string/memory tests pass. Reviewed-by: H.J. Lu Diff: --- sysdeps/x86_64/multiarch/strcspn-c.c | 83 ++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 46 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c index 013aebf797..c312fab8b1 100644 --- a/sysdeps/x86_64/multiarch/strcspn-c.c +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a) RETURN (NULL, strlen (s)); const char *aligned; - __m128i mask; - int offset = (int) ((size_t) a & 15); + __m128i mask, maskz, zero; + unsigned int maskz_bits; + unsigned int offset = (unsigned int) ((size_t) a & 15); + zero = _mm_set1_epi8 (0); if (offset != 0) { /* Load masks. */ aligned = (const char *) ((size_t) a & -16L); __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - - mask = __m128i_shift_right (mask0, offset); + maskz = _mm_cmpeq_epi8 (mask0, zero); /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16 - offset) - { - /* There is no NULL terminator. */ - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); - int index = _mm_cmpistri (mask1, mask1, 0x3a); - length += index; - - /* Don't use SSE4.2 if the length of A > 16. */ - if (length > 16) - return STRCSPN_SSE2 (s, a); - - if (index != 0) - { - /* Combine mask0 and mask1. We could play games with - palignr, but frankly this data should be in L1 now - so do the merge via an unaligned load. */ - mask = _mm_loadu_si128 ((__m128i *) a); - } - } + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; + if (maskz_bits != 0) + { + mask = __m128i_shift_right (mask0, offset); + offset = (unsigned int) ((size_t) s & 15); + if (offset) + goto start_unaligned; + + aligned = s; + goto start_loop; + } } - else - { - /* A is aligned. */ - mask = _mm_load_si128 ((__m128i *) a); - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16) - { - /* There is no NULL terminator. Don't use SSE4.2 if the length - of A > 16. */ - if (a[16] != 0) - return STRCSPN_SSE2 (s, a); - } + /* A is aligned. */ + mask = _mm_loadu_si128 ((__m128i *) a); + /* Find where the NULL terminator is. */ + maskz = _mm_cmpeq_epi8 (mask, zero); + maskz_bits = _mm_movemask_epi8 (maskz); + if (maskz_bits == 0) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return STRCSPN_SSE2 (s, a); } - offset = (int) ((size_t) s & 15); + aligned = s; + offset = (unsigned int) ((size_t) s & 15); if (offset != 0) { + start_unaligned: /* Check partial string. */ aligned = (const char *) ((size_t) s & -16L); __m128i value = _mm_load_si128 ((__m128i *) aligned); value = __m128i_shift_right (value, offset); - int length = _mm_cmpistri (mask, value, 0x2); + unsigned int length = _mm_cmpistri (mask, value, 0x2); /* No need to check ZFlag since ZFlag is always 1. */ - int cflag = _mm_cmpistrc (mask, value, 0x2); + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); if (cflag) RETURN ((char *) (s + length), length); /* Find where the NULL terminator is. */ - int index = _mm_cmpistri (value, value, 0x3a); + unsigned int index = _mm_cmpistri (value, value, 0x3a); if (index < 16 - offset) RETURN (NULL, index); aligned += 16; } - else - aligned = s; +start_loop: while (1) { __m128i value = _mm_load_si128 ((__m128i *) aligned); - int index = _mm_cmpistri (mask, value, 0x2); - int cflag = _mm_cmpistrc (mask, value, 0x2); - int zflag = _mm_cmpistrz (mask, value, 0x2); + unsigned int index = _mm_cmpistri (mask, value, 0x2); + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); + unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); if (cflag) RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); if (zflag)