From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pl1-x62e.google.com (mail-pl1-x62e.google.com [IPv6:2607:f8b0:4864:20::62e]) by sourceware.org (Postfix) with ESMTPS id 68F7C3851C04 for ; Mon, 15 Mar 2021 14:25:28 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.3.2 sourceware.org 68F7C3851C04 Received: by mail-pl1-x62e.google.com with SMTP id o10so3190325plg.11 for ; Mon, 15 Mar 2021 07:25:28 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=0ETxT62n7aJSYRjutM7Ak20kzITHtezueR74aL1onrM=; b=dkaNfSh9tTw3gjzxWa4msGOp8riZIvej/ZzoqDcSnDcEIiTUAONdlGdx6duOlc1/ET qnFIR00MAaBKkBXoGSdLa06OP+Sv3MR9evMyAlig6XJL87fWWfpPq0eXKP7x+lAVV6yW dDOSB/RyiQObnap22cQCpFsJ3+AHoWCoNufOl0JIZYQqtfu6BN8N0W7lb1mBydDmA9oH L+k26orcM28RJ8zGWx7f+G3elK4LU17jSIXQXvHKw7/yjBtMkz0FPGkchwsuR6WsIA1J 6i2xM1n9td5dbmfafGGgJz6wXIWLoARLC7yQ9BTr636voQWM9Y/dFpX/n5R5//SyA7h8 1dBQ== X-Gm-Message-State: AOAM531E6l8a05bOC3o7G/UETGNQ/YwlUJeCarnVs/RhRGouF1YW0mgW c7bw5fhgPntzs5b6sMD1bmb63a/ASkY= X-Google-Smtp-Source: ABdhPJyUlBQk5scJd5Dgv0xXDAL+zBhUca56inu967KaACaq/A1CJwW6GAsGlcmILC616LNlzX6pvw== X-Received: by 2002:a17:902:ecc4:b029:e6:1a9f:3397 with SMTP id a4-20020a170902ecc4b02900e61a9f3397mr11797472plh.9.1615818326970; Mon, 15 Mar 2021 07:25:26 -0700 (PDT) Received: from gnu-cfl-2.localdomain ([172.56.38.48]) by smtp.gmail.com with ESMTPSA id z2sm13536707pfa.121.2021.03.15.07.25.24 for (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 15 Mar 2021 07:25:25 -0700 (PDT) Received: from gnu-cfl-2.?040none?041 (localhost [IPv6:::1]) by gnu-cfl-2.localdomain (Postfix) with ESMTP id 0FE9C1A0A33 for ; Mon, 15 Mar 2021 07:25:21 -0700 (PDT) From: "H.J. 
Lu" To: libc-alpha@sourceware.org Subject: [PATCH v2 06/10] x86-64: Add memcmp family functions with 256-bit EVEX Date: Mon, 15 Mar 2021 07:25:16 -0700 Message-Id: <20210315142520.1661407-7-hjl.tools@gmail.com> X-Mailer: git-send-email 2.30.2 In-Reply-To: <20210315142520.1661407-1-hjl.tools@gmail.com> References: <20210315142520.1661407-1-hjl.tools@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Spam-Status: No, score=-3034.7 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_SHORT, RCVD_IN_BARRACUDACENTRAL, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.2 X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 15 Mar 2021 14:25:30 -0000 Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM abort with usable AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function exit. 
--- sysdeps/x86_64/multiarch/Makefile | 4 +- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 13 +- sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 440 ++++++++++++++++++ sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S | 4 + 5 files changed, 467 insertions(+), 4 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 1cc0a10e12..9d79b138e9 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ memset-avx2-unaligned-erms \ memset-avx512-unaligned-erms \ memchr-evex \ + memcmp-evex-movbe \ memmove-evex-unaligned-erms \ memrchr-evex \ memset-evex-unaligned-erms \ @@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ wcsncmp-evex \ wcsnlen-evex \ wcsrchr-evex \ - wmemchr-evex + wmemchr-evex \ + wmemcmp-evex-movbe endif ifeq ($(subdir),debug) diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index aac8e601df..96344a71e4 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (MOVBE)), __memcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), __memcmp_sse4_1) IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), @@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (MOVBE)), __wmemcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, 
wmemcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), __wmemcmp_sse4_1) IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h index d5df541ec4..5ac41a19b8 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && CPU_FEATURE_USABLE_P (cpu_features, MOVBE) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2_movbe); + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex_movbe); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2_movbe); + } if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) return OPTIMIZE (sse4_1); diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S new file mode 100644 index 0000000000..9c093972e1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S @@ -0,0 +1,440 @@ +/* memcmp/wmemcmp optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +/* memcmp/wmemcmp is implemented as: + 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap + to avoid branches. + 2. Use overlapping compare to avoid branch. + 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 + bytes for wmemcmp. + 4. If size is 8 * VEC_SIZE or less, unroll the loop. + 5. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. + 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. + 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. + 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ + +# include + +# ifndef MEMCMP +# define MEMCMP __memcmp_evex_movbe +# endif + +# define VMOVU vmovdqu64 + +# ifdef USE_AS_WMEMCMP +# define VPCMPEQ vpcmpeqd +# else +# define VPCMPEQ vpcmpeqb +# endif + +# define XMM1 xmm17 +# define XMM2 xmm18 +# define YMM1 ymm17 +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 + +# define VEC_SIZE 32 +# ifdef USE_AS_WMEMCMP +# define VEC_MASK 0xff +# define XMM_MASK 0xf +# else +# define VEC_MASK 0xffffffff +# define XMM_MASK 0xffff +# endif + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elements. 
+*/ + + .section .text.evex,"ax",@progbits +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %RDX_LP +# elif defined __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k1 + kmovd %k1, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_vec) + + /* More than 2 * VEC. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + + /* From 4 * VEC to 8 * VEC, inclusively. */ + VMOVU (%rsi), %YMM1 + VPCMPEQ (%rdi), %YMM1, %k1 + + VMOVU VEC_SIZE(%rsi), %YMM2 + VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 + + kandd %k1, %k2, %k5 + kandd %k3, %k4, %k6 + kandd %k5, %k6, %k6 + + kmovd %k6, %eax + cmpl $VEC_MASK, %eax + jne L(4x_vec_end) + + leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi + VMOVU (%rsi), %YMM1 + VPCMPEQ (%rdi), %YMM1, %k1 + + VMOVU VEC_SIZE(%rsi), %YMM2 + VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 + kandd %k1, %k2, %k5 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 + kandd %k3, %k5, %k5 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 + kandd %k4, %k5, %k5 + + kmovd %k5, %eax + cmpl $VEC_MASK, %eax + jne L(4x_vec_end) + xorl %eax, %eax + ret + + .p2align 4 +L(last_2x_vec): + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + +L(last_vec): + /* Use overlapping loads to avoid branches. 
*/ + leaq -VEC_SIZE(%rdi, %rdx), %rdi + leaq -VEC_SIZE(%rsi, %rdx), %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(first_vec): + /* A byte or int32 is different within 16 or 32 bytes. */ + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (%rdi, %rcx, 4), %edx + cmpl (%rsi, %rcx, 4), %edx +L(wmemcmp_return): + setl %al + negl %eax + orl $1, %eax +# else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret + +# ifdef USE_AS_WMEMCMP + .p2align 4 +L(4): + xorl %eax, %eax + movl (%rdi), %edx + cmpl (%rsi), %edx + jne L(wmemcmp_return) + ret +# else + .p2align 4 +L(between_4_7): + /* Load as big endian with overlapping movbe to avoid branches. */ + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + je L(exit) + sbbl %eax, %eax + orl $1, %eax + ret + + .p2align 4 +L(exit): + ret + + .p2align 4 +L(between_2_3): + /* Load as big endian to avoid branches. */ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx + shll $8, %eax + shll $8, %ecx + bswap %eax + bswap %ecx + movb -1(%rdi, %rdx), %al + movb -1(%rsi, %rdx), %cl + /* Subtraction is okay because the upper 8 bits are zero. */ + subl %ecx, %eax + ret + + .p2align 4 +L(1): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + subl %ecx, %eax + ret +# endif + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(less_vec): +# ifdef USE_AS_WMEMCMP + /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ + cmpb $4, %dl + je L(4) + jb L(zero) +# else + cmpb $1, %dl + je L(1) + jb L(zero) + cmpb $4, %dl + jb L(between_2_3) + cmpb $8, %dl + jb L(between_4_7) +# endif + cmpb $16, %dl + jae L(between_16_31) + /* It is between 8 and 15 bytes. 
*/ + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 + VPCMPEQ %XMM1, %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + /* Use overlapping loads to avoid branches. */ + leaq -8(%rdi, %rdx), %rdi + leaq -8(%rsi, %rdx), %rsi + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 + VPCMPEQ %XMM1, %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + VMOVU (%rsi), %XMM2 + VPCMPEQ (%rdi), %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. */ + leaq -16(%rdi, %rdx), %rdi + leaq -16(%rsi, %rdx), %rsi + VMOVU (%rsi), %XMM2 + VPCMPEQ (%rdi), %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(more_8x_vec): + /* More than 8 * VEC. Check the first VEC. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Align the first memory area for aligned loads in the loop. + Compute how much the first memory area is misaligned. */ + movq %rdi, %rcx + andl $(VEC_SIZE - 1), %ecx + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %rcx + /* Adjust the second memory area. */ + subq %rcx, %rsi + /* Adjust the first memory area which should be aligned now. */ + subq %rcx, %rdi + /* Adjust length. */ + addq %rcx, %rdx + +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. 
*/ + VMOVU (%rsi), %YMM1 + VPCMPEQ (%rdi), %YMM1, %k1 + + VMOVU VEC_SIZE(%rsi), %YMM2 + VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 + kandd %k2, %k1, %k5 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 + kandd %k3, %k5, %k5 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 + kandd %k4, %k5, %k5 + + kmovd %k5, %eax + cmpl $VEC_MASK, %eax + jne L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rsi + + subq $(VEC_SIZE * 4), %rdx + cmpq $(VEC_SIZE * 4), %rdx + jae L(loop_4x_vec) + + /* Less than 4 * VEC. */ + cmpq $VEC_SIZE, %rdx + jbe L(last_vec) + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_2x_vec) + +L(last_4x_vec): + /* From 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. 
*/ + leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(4x_vec_end): + kmovd %k1, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x1) + kmovd %k3, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x2) + kmovd %k4, %eax + subl $VEC_MASK, %eax + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx + cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl VEC_SIZE(%rdi, %rcx, 4), %edx + cmpl VEC_SIZE(%rsi, %rcx, 4), %edx + jmp L(wmemcmp_return) +# else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx + cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret +END (MEMCMP) +#endif diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S new file mode 100644 index 0000000000..4726d74aa1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_evex_movbe +#define USE_AS_WMEMCMP 1 + +#include "memcmp-evex-movbe.S" -- 2.30.2