From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pf1-x42b.google.com (mail-pf1-x42b.google.com [IPv6:2607:f8b0:4864:20::42b]) by sourceware.org (Postfix) with ESMTPS id 23FF43858C54 for ; Thu, 14 Apr 2022 18:06:13 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 23FF43858C54 Received: by mail-pf1-x42b.google.com with SMTP id n22so5190357pfa.0 for ; Thu, 14 Apr 2022 11:06:13 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=gimDa+NawyTEp7GyWnSRlIt4hKuyCvng0O+fyAs/6oM=; b=229X4nEjQKwscTwz1cUUdGF6TZG8nuePsLBK/GWGV7tqdkbs2l5HIAcRE4Iqf8Hwjt BnhoUI9eSSIquNJU3cTK/8kCjNXc29AX+KhkKEzODFVBy6YSJ9uJsNRR+aH0QUZWp7XK A7Coafahz020GR66cWJOiFv0GoWIXbcsX9Mk7J8YWJlTFmlPKpXWDtCG4uGuy0JWmpx9 gX2CD/6gcGj/8U3mOs9V9MJhqAYj7dWykrRe4pi0Yug4dvOCmdzRhRTXC7nuOqCxlP6t v0H0+fRDHZnTNCuJltjBsuUwgVpwXygkFIlXdDNUn7kuzrARRzEiRcht/tnaF5XLdgxJ OtxA== X-Gm-Message-State: AOAM532pSEcLH8WwAWOvik+wVx52hbyQfEGtX/mIzkqDKH2ikdLh0Qj0 DSIuHtE82hHkF3yz68lM2GZ/UkSDazM6yf91CXleik+7p5c= X-Google-Smtp-Source: ABdhPJzvL5w/b9QZkrAHTJSpR7uQadrb1Ne1xx5dfHVvIgd0WLvGwCnyDXzsjhqPkchp9EvxGiCdXHC4Jm+Gs5atQM8= X-Received: by 2002:a63:f457:0:b0:39c:ec64:cd76 with SMTP id p23-20020a63f457000000b0039cec64cd76mr3279013pgk.381.1649959571725; Thu, 14 Apr 2022 11:06:11 -0700 (PDT) MIME-Version: 1.0 References: <20220325183625.1170867-2-goldstein.w.n@gmail.com> <20220414164739.3146735-1-goldstein.w.n@gmail.com> <20220414164739.3146735-2-goldstein.w.n@gmail.com> In-Reply-To: <20220414164739.3146735-2-goldstein.w.n@gmail.com> From: "H.J. 
Lu" Date: Thu, 14 Apr 2022 11:05:35 -0700 Message-ID: Subject: Re: [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 To: Noah Goldstein Cc: GNU C Library , "Carlos O'Donell" Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-3025.4 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_NUMSUBJECT, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 14 Apr 2022 18:06:16 -0000 On Thu, Apr 14, 2022 at 9:47 AM Noah Goldstein wrote: > > With SSE2, SSE4.2, AVX2, and EVEX versions available, very few targets prefer > SSSE3. As a result it is no longer worth it to keep the SSSE3 > versions given the code size cost. 
> --- > sysdeps/x86_64/multiarch/Makefile | 4 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 -- > sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 - > sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6 - > sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5 - > sysdeps/x86_64/multiarch/strcmp.c | 4 - > sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6 - > sysdeps/x86_64/multiarch/strncmp-ssse3.S | 28 ---- > sysdeps/x86_64/multiarch/strncmp.c | 4 - > sysdeps/x86_64/strcmp.S | 155 ++++-------------- > 10 files changed, 30 insertions(+), 202 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 51222dfab1..ed2def288d 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -58,7 +58,6 @@ sysdep_routines += \ > strcasecmp_l-evex \ > strcasecmp_l-sse2 \ > strcasecmp_l-sse4_2 \ > - strcasecmp_l-ssse3 \ > strcat-avx2 \ > strcat-avx2-rtm \ > strcat-evex \ > @@ -80,7 +79,6 @@ sysdep_routines += \ > strcmp-sse2 \ > strcmp-sse2-unaligned \ > strcmp-sse4_2 \ > - strcmp-ssse3 \ > strcpy-avx2 \ > strcpy-avx2-rtm \ > strcpy-evex \ > @@ -98,7 +96,6 @@ sysdep_routines += \ > strncase_l-evex \ > strncase_l-sse2 \ > strncase_l-sse4_2 \ > - strncase_l-ssse3 \ > strncat-avx2 \ > strncat-avx2-rtm \ > strncat-c \ > @@ -110,7 +107,6 @@ sysdep_routines += \ > strncmp-evex \ > strncmp-sse2 \ > strncmp-sse4_2 \ > - strncmp-ssse3 \ > strncpy-avx2 \ > strncpy-avx2-rtm \ > strncpy-c \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index f389928a4e..7e2be3554b 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -448,9 +448,6 @@ 
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strcasecmp, > CPU_FEATURE_USABLE (SSE4_2), > __strcasecmp_sse42) > - IFUNC_IMPL_ADD (array, i, strcasecmp, > - CPU_FEATURE_USABLE (SSSE3), > - __strcasecmp_ssse3) > IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) > > /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ > @@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strcasecmp_l, > CPU_FEATURE_USABLE (SSE4_2), > __strcasecmp_l_sse42) > - IFUNC_IMPL_ADD (array, i, strcasecmp_l, > - CPU_FEATURE_USABLE (SSSE3), > - __strcasecmp_l_ssse3) > IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, > __strcasecmp_l_sse2)) > > @@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __strcmp_evex) > IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2), > __strcmp_sse42) > - IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3), > - __strcmp_ssse3) > IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) > > @@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strncasecmp, > CPU_FEATURE_USABLE (SSE4_2), > __strncasecmp_sse42) > - IFUNC_IMPL_ADD (array, i, strncasecmp, > - CPU_FEATURE_USABLE (SSSE3), > - __strncasecmp_ssse3) > IFUNC_IMPL_ADD (array, i, strncasecmp, 1, > __strncasecmp_sse2)) > > @@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strncasecmp_l, > CPU_FEATURE_USABLE (SSE4_2), > __strncasecmp_l_sse42) > - IFUNC_IMPL_ADD (array, i, strncasecmp_l, > - CPU_FEATURE_USABLE (SSSE3), > - __strncasecmp_l_ssse3) > IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, > __strncasecmp_l_sse2)) > > @@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __strncmp_evex) > 
IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2), > __strncmp_sse42) > - IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3), > - __strncmp_ssse3) > IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) > > #ifdef SHARED > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h > index 766539c241..296d32071b 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h > +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h > @@ -20,7 +20,6 @@ > #include > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > @@ -49,8 +48,5 @@ IFUNC_SELECTOR (void) > && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) > return OPTIMIZE (sse42); > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > - return OPTIMIZE (ssse3); > - > return OPTIMIZE (sse2); > } > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S > deleted file mode 100644 > index fb2f9ae14a..0000000000 > --- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S > +++ /dev/null > @@ -1,6 +0,0 @@ > -#define USE_SSSE3 1 > -#define USE_AS_STRCASECMP_L > -#define NO_NOLOCALE_ALIAS > -#define STRCMP __strcasecmp_l_ssse3 > -#define __strcasecmp __strcasecmp_ssse3 > -#include "../strcmp.S" > diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S > deleted file mode 100644 > index 1b7fa33c91..0000000000 > --- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S > +++ /dev/null > @@ -1,5 +0,0 @@ > -#if IS_IN (libc) > -# define USE_SSSE3 1 > -# define STRCMP __strcmp_ssse3 > -# include "../strcmp.S" > -#endif > diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c > 
index 68cb73baad..a248c2a6e6 100644 > --- a/sysdeps/x86_64/multiarch/strcmp.c > +++ b/sysdeps/x86_64/multiarch/strcmp.c > @@ -28,7 +28,6 @@ > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; > @@ -56,9 +55,6 @@ IFUNC_SELECTOR (void) > if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) > return OPTIMIZE (sse2_unaligned); > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > - return OPTIMIZE (ssse3); > - > return OPTIMIZE (sse2); > } > > diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S > deleted file mode 100644 > index 6728678688..0000000000 > --- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S > +++ /dev/null > @@ -1,6 +0,0 @@ > -#define USE_SSSE3 1 > -#define USE_AS_STRNCASECMP_L > -#define NO_NOLOCALE_ALIAS > -#define STRCMP __strncasecmp_l_ssse3 > -#define __strncasecmp __strncasecmp_ssse3 > -#include "../strcmp.S" > diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S > deleted file mode 100644 > index ec37308347..0000000000 > --- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S > +++ /dev/null > @@ -1,28 +0,0 @@ > -/* strcmp optimized with SSSE3. > - Copyright (C) 2017-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. 
> - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. */ > - > -#include <sysdep.h> > - > -#define STRCMP __strncmp_ssse3 > - > -#undef libc_hidden_builtin_def > -#define libc_hidden_builtin_def(strcmp) > - > -#define USE_SSSE3 1 > -#define USE_AS_STRNCMP > -#include <sysdeps/x86_64/strcmp.S> > diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c > index fca74199d8..70ae6547c9 100644 > --- a/sysdeps/x86_64/multiarch/strncmp.c > +++ b/sysdeps/x86_64/multiarch/strncmp.c > @@ -27,7 +27,6 @@ > # include <init-arch.h> > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > @@ -57,9 +56,6 @@ IFUNC_SELECTOR (void) > && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) > return OPTIMIZE (sse42); > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > - return OPTIMIZE (ssse3); > - > return OPTIMIZE (sse2); > } > > diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S > index 99d8b36f1d..c38dc627f9 100644 > --- a/sysdeps/x86_64/strcmp.S > +++ b/sysdeps/x86_64/strcmp.S > @@ -59,12 +59,7 @@ > # endif > #endif > > -#ifndef USE_SSSE3 > .text > -#else > - .section .text.ssse3,"ax",@progbits > -#endif > - > #ifdef USE_AS_STRCASECMP_L > # ifndef ENTRY2 > # define ENTRY2(name) ENTRY (name) > @@ -343,13 +338,10 @@ LABEL(gobble_ashr_1): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 /* store for next cycle */ > > -#ifndef USE_SSSE3 > psrldq $1, %xmm3 > pslldq 
$15, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -373,13 +365,10 @@ LABEL(gobble_ashr_1): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 /* store for next cycle */ > > -#ifndef USE_SSSE3 > psrldq $1, %xmm3 > pslldq $15, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -473,13 +462,10 @@ LABEL(gobble_ashr_2): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $2, %xmm3 > pslldq $14, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -504,13 +490,10 @@ LABEL(gobble_ashr_2): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $2, %xmm3 > pslldq $14, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -598,13 +581,10 @@ LABEL(gobble_ashr_3): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $3, %xmm3 > pslldq $13, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -629,13 +609,10 @@ LABEL(gobble_ashr_3): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $3, %xmm3 > pslldq $13, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -723,13 +700,10 @@ 
LABEL(gobble_ashr_4): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $4, %xmm3 > pslldq $12, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -754,13 +728,10 @@ LABEL(gobble_ashr_4): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $4, %xmm3 > pslldq $12, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -848,13 +819,10 @@ LABEL(gobble_ashr_5): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $5, %xmm3 > pslldq $11, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -879,13 +847,10 @@ LABEL(gobble_ashr_5): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $5, %xmm3 > pslldq $11, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -973,13 +938,10 @@ LABEL(gobble_ashr_6): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $6, %xmm3 > pslldq $10, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $6, %xmm3 > pslldq $10, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value 
*/ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $7, %xmm3 > pslldq $9, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $7, %xmm3 > pslldq $9, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $8, %xmm3 > pslldq $8, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $8, %xmm3 > pslldq $8, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1348,13 +1295,10 @@ LABEL(gobble_ashr_9): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $9, %xmm3 > pslldq $7, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $9, %xmm3 > pslldq $7, %xmm2 > por %xmm3, %xmm2 /* merge 
into one 16byte value */ > -#else > - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $10, %xmm3 > pslldq $6, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $10, %xmm3 > pslldq $6, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $11, %xmm3 > pslldq $5, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $11, %xmm3 > pslldq $5, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $12, %xmm3 > pslldq $4, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12): > movdqa (%rdi, %rcx), %xmm2 
> movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $12, %xmm3 > pslldq $4, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $13, %xmm3 > pslldq $3, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $13, %xmm3 > pslldq $3, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1973,13 +1890,10 @@ LABEL(gobble_ashr_14): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $14, %xmm3 > pslldq $2, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $14, %xmm3 > pslldq $2, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $15, %xmm3 > pslldq $1, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER 
(%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $15, %xmm3 > pslldq $1, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu Thanks. -- H.J.