From: "H.J. Lu" <hjl.tools@gmail.com>
To: libc-alpha@sourceware.org
Subject: [PATCH v2 05/10] x86-64: Add memset family functions with 256-bit EVEX
Date: Mon, 15 Mar 2021 07:25:15 -0700
Message-Id: <20210315142520.1661407-6-hjl.tools@gmail.com>
In-Reply-To: <20210315142520.1661407-1-hjl.tools@gmail.com>
References: <20210315142520.1661407-1-hjl.tools@gmail.com>

Add memset family functions optimized with 256-bit EVEX instructions
using the YMM16-YMM31 registers, and update ifunc-memset.h and
ifunc-wmemset.h to select them when AVX512VL and AVX512BW are usable.
Since these registers are only reachable through EVEX encodings,
VZEROUPPER isn't needed at function exit, which avoids the RTM abort
it would otherwise trigger.
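For reference, a minimal sketch (an illustration only, not code from
this patch) of the encoding difference the selector relies on:

	/* AVX2 variant: ymm0 aliases the legacy SSE state, so its
	   upper bits must be cleared before returning, and VZEROUPPER
	   will abort an in-flight RTM transaction on affected
	   processors.  */
	vpbroadcastb	%xmm0, %ymm0	/* AVX2 */
	vmovdqu		%ymm0, (%rdi)
	vzeroupper
	ret

	/* EVEX variant: ymm16 is reachable only through EVEX
	   encodings and doesn't alias the legacy SSE state, so the
	   function can return without VZEROUPPER.  */
	vpbroadcastb	%xmm16, %ymm16	/* AVX512VL + AVX512BW */
	vmovdqu64	%ymm16, (%rdi)
	ret

This is why the new memset-evex-unaligned-erms.S below only needs to
redefine XMM0/YMM0/VEC0 to the high registers and make VZEROUPPER
empty to reuse memset-vec-unaligned-erms.S unchanged.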
---
 sysdeps/x86_64/multiarch/Makefile             |  1 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 22 +++++++++++++++++
 sysdeps/x86_64/multiarch/ifunc-memset.h       | 24 +++++++++++++++----
 sysdeps/x86_64/multiarch/ifunc-wmemset.h      | 13 ++++++----
 .../multiarch/memset-evex-unaligned-erms.S    | 24 +++++++++++++++++++
 .../multiarch/memset-vec-unaligned-erms.S     | 20 +++++++++++-----
 6 files changed, 90 insertions(+), 14 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4563fc56f5..1cc0a10e12 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   memchr-evex \
 		   memmove-evex-unaligned-erms \
 		   memrchr-evex \
+		   memset-evex-unaligned-erms \
 		   rawmemchr-evex \
 		   stpcpy-evex \
 		   stpncpy-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index e1c39d58d6..aac8e601df 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __memset_chk_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memset_chk_avx512_unaligned_erms)
@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __memset_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memset_avx512_unaligned_erms)
@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, wmemset,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wmemset_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __wmemset_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, wmemset,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __wmemset_avx512_unaligned))
@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wmemset_chk_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __wmemset_chk_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __wmemset_chk_avx512_unaligned))
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 0ac6b11882..0246818263 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
 
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
     {
-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	return OPTIMIZE (avx2_unaligned_erms);
-      else
-	return OPTIMIZE (avx2_unaligned);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (evex_unaligned_erms);
+
+	  return OPTIMIZE (evex_unaligned);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx2_unaligned_erms);
+
+	  return OPTIMIZE (avx2_unaligned);
+	}
     }
 
   if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index c1b0c2254b..7e947c56b4 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -20,6 +20,7 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
 
 static inline void *
@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx512_unaligned);
-      else
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+	return OPTIMIZE (evex_unaligned);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx2_unaligned);
     }
 
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
new file mode 100644
index 0000000000..ae0a4d6e46
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -0,0 +1,24 @@
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define XMM0		xmm16
+# define YMM0		ymm16
+# define VEC0		ymm16
+# define VEC(i)		VEC##i
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+# define VZEROUPPER
+
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movq r, %rax; \
+  vpbroadcastb d, %VEC0
+
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movq r, %rax; \
+  vpbroadcastd d, %VEC0
+
+# define SECTION(p)		p##.evex
+# define MEMSET_SYMBOL(p,s)	p##_evex_##s
+# define WMEMSET_SYMBOL(p,s)	p##_evex_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index faa4085615..358ee4be12 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,6 +34,14 @@
 # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
 #endif
 
+#ifndef XMM0
+# define XMM0				xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0				ymm0
+#endif
+
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
@@ -67,7 +75,7 @@ ENTRY (__bzero)
 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
 	mov	%RSI_LP, %RDX_LP /* Set n.  */
-	pxor	%xmm0, %xmm0
+	pxor	%XMM0, %XMM0
 	jmp	L(entry_from_bzero)
 END (__bzero)
 weak_alias (__bzero, bzero)
 
@@ -223,7 +231,7 @@ L(less_vec):
 	cmpb	$16, %dl
 	jae	L(between_16_31)
 # endif
-	MOVQ	%xmm0, %rcx
+	MOVQ	%XMM0, %rcx
 	cmpb	$8, %dl
 	jae	L(between_8_15)
 	cmpb	$4, %dl
@@ -238,16 +246,16 @@ L(less_vec):
 # if VEC_SIZE > 32
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	vmovdqu	%ymm0, -32(%rdi,%rdx)
-	vmovdqu	%ymm0, (%rdi)
+	VMOVU	%YMM0, -32(%rdi,%rdx)
+	VMOVU	%YMM0, (%rdi)
 	VZEROUPPER
 	ret
 # endif
 # if VEC_SIZE > 16
 	/* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
-	vmovdqu	%xmm0, -16(%rdi,%rdx)
-	vmovdqu	%xmm0, (%rdi)
+	VMOVU	%XMM0, -16(%rdi,%rdx)
+	VMOVU	%XMM0, (%rdi)
 	VZEROUPPER
 	ret
 # endif
-- 
2.30.2