public inbox for glibc-cvs@sourceware.org help / color / mirror / Atom feed
From: Raoni Fassina Firmino <raoni@sourceware.org> To: glibc-cvs@sourceware.org Subject: [glibc/ibm/2.30/master] x86-64: Add memmove family functions with 256-bit EVEX Date: Fri, 25 Feb 2022 13:47:33 +0000 (GMT) [thread overview] Message-ID: <20220225134733.1631F3858413@sourceware.org> (raw) https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=8ebaa0fb1e385e0e0c9efbd40f096b51ff05b8c0 commit 8ebaa0fb1e385e0e0c9efbd40f096b51ff05b8c0 Author: H.J. Lu <hjl.tools@gmail.com> Date: Fri Mar 5 06:46:08 2021 -0800 x86-64: Add memmove family functions with 256-bit EVEX Update ifunc-memmove.h to select the function optimized with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM abort with usable AVX512VL since VZEROUPPER isn't needed at function exit. (cherry picked from commit 63ad43566f7a25d140dc723598aeb441ad657eed) Diff: --- sysdeps/x86_64/multiarch/Makefile | 1 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 36 ++++++++++++++++++++++ sysdeps/x86_64/multiarch/ifunc-memmove.h | 21 +++++++++++-- .../x86_64/multiarch/memmove-evex-unaligned-erms.S | 26 ++++++++++++++++ .../x86_64/multiarch/memmove-vec-unaligned-erms.S | 24 ++++++++++----- 5 files changed, 97 insertions(+), 11 deletions(-) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index d653a1d923..2a5a3dd71b 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -45,6 +45,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ memset-avx2-unaligned-erms \ memset-avx512-unaligned-erms \ memchr-evex \ + memmove-evex-unaligned-erms \ memrchr-evex \ rawmemchr-evex \ stpcpy-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 39b1d299be..ec78775470 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_ARCH_FEATURE (AVX_Usable), __memmove_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memmove_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memmove_chk_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_CPU_FEATURE (SSSE3), __memmove_chk_ssse3_back) @@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX_Usable), __memmove_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memmove_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memmove_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX512F_Usable), __memmove_avx512_no_vzeroupper) @@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memcpy_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memcpy_chk_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_CPU_FEATURE (SSSE3), __memcpy_chk_ssse3_back) @@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memcpy_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), __memcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), @@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __mempcpy_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __mempcpy_chk_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_CPU_FEATURE (SSSE3), __mempcpy_chk_ssse3_back) @@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __mempcpy_evex_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __mempcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), __mempcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index 9d63ee5de1..5487fcae57 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) @@ -59,10 +63,21 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) { - if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) - return OPTIMIZE (avx_unaligned_erms); + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (evex_unaligned_erms); + + return OPTIMIZE (evex_unaligned); + } + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx_unaligned_erms); - return OPTIMIZE (avx_unaligned); + return OPTIMIZE (avx_unaligned); + } } if (!CPU_FEATURES_CPU_P (cpu_features, SSSE3) diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S new file mode 100644 index 0000000000..b879007e89 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S @@ -0,0 +1,26 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define XMM0 xmm16 +# define XMM1 xmm17 +# define YMM0 ymm16 +# define YMM1 ymm17 +# define VEC0 ymm16 +# define VEC1 ymm17 +# define VEC2 ymm18 +# define VEC3 ymm19 +# define VEC4 ymm20 +# define VEC5 ymm21 +# define VEC6 ymm22 +# define VEC7 ymm23 +# define VEC8 ymm24 +# define VEC(i) VEC##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 +# define VZEROUPPER + +# define SECTION(p) p##.evex +# define MEMMOVE_SYMBOL(p,s) p##_evex_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index 95e9bb0e5d..18d0fd3d14 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -48,6 +48,14 @@ # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) #endif +#ifndef XMM0 +# define XMM0 xmm0 +#endif + +#ifndef YMM0 +# define YMM0 ymm0 +#endif + #ifndef VZEROUPPER # if VEC_SIZE > 16 # define VZEROUPPER vzeroupper @@ -312,20 +320,20 @@ L(less_vec): #if VEC_SIZE > 32 L(between_32_63): /* From 32 to 63. No branch when size == 32. */ - vmovdqu (%rsi), %ymm0 - vmovdqu -32(%rsi,%rdx), %ymm1 - vmovdqu %ymm0, (%rdi) - vmovdqu %ymm1, -32(%rdi,%rdx) + VMOVU (%rsi), %YMM0 + VMOVU -32(%rsi,%rdx), %YMM1 + VMOVU %YMM0, (%rdi) + VMOVU %YMM1, -32(%rdi,%rdx) VZEROUPPER ret #endif #if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. */ L(between_16_31): - vmovdqu (%rsi), %xmm0 - vmovdqu -16(%rsi,%rdx), %xmm1 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, -16(%rdi,%rdx) + VMOVU (%rsi), %XMM0 + VMOVU -16(%rsi,%rdx), %XMM1 + VMOVU %XMM0, (%rdi) + VMOVU %XMM1, -16(%rdi,%rdx) ret #endif L(between_8_15):
reply other threads:[~2022-02-25 13:47 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20220225134733.1631F3858413@sourceware.org \ --to=raoni@sourceware.org \ --cc=glibc-cvs@sourceware.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: linkBe sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).