* Subject: [PATCH] Add x86-64 memset with unaligned store and rep stosb
From: H.J. Lu @ 2016-03-29 19:23 UTC
To: GNU C Library
[-- Attachment #1: Type: text/plain, Size: 294 bytes --]
The goal of this patch is to replace the SSE2 and AVX2 memset.S
with faster and smaller alternatives and to add support for the
64-byte vector register size.  bench-memset data on various Intel
and AMD processors is at
https://sourceware.org/bugzilla/show_bug.cgi?id=19881
Any comments or feedback?
--
H.J.
[-- Attachment #2: 0001-Add-x86-64-memset-with-unaligned-store-and-rep-stosb.patch --]
[-- Type: text/x-patch, Size: 14390 bytes --]
From d0d3495951be16568656971dd2c825da68c2660c Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 25 Mar 2016 08:20:17 -0700
Subject: [PATCH] Add x86-64 memset with unaligned store and rep stosb
Implement x86-64 memset with unaligned store and rep stosb.  Support
16-byte, 32-byte and 64-byte vector register sizes.  A single file
provides two implementations of memset, one with rep stosb and the
other without.  They share the same code when the size is between 2
times the vector register size and REP_STOSB_THRESHOLD, which is 1KB
for the 16-byte vector register size and scales up with larger vector
register sizes.
Key features:
1. Use overlapping stores to avoid branches; a short C sketch of this
technique follows the list.
2. For sizes <= 4 times the vector register size, fully unroll the loop.
3. For sizes > 4 times the vector register size, store 4 times the
vector register size at a time.
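As an illustration of feature 1, here is a minimal C sketch of the
overlapping-store idea using SSE2 intrinsics.  It is only a sketch under
the assumption 16 <= n <= 32; the helper name is made up and this is not
the code in this patch, which achieves the same effect with VMOVU stores
at (%rdi) and -VEC_SIZE(%rdi,%rdx):

#include <emmintrin.h>
#include <stddef.h>

/* Fill dst[0..n-1] with the byte c for 16 <= n <= 32.  The second store
   begins at dst + n - 16, so the two stores overlap whenever n < 32 and
   no branch on the exact size is needed.  */
static void
memset_16_to_32 (void *dst, int c, size_t n)
{
  __m128i v = _mm_set1_epi8 ((char) c);
  _mm_storeu_si128 ((__m128i *) dst, v);
  _mm_storeu_si128 ((__m128i *) ((char *) dst + n - 16), v);
}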
[BZ #19881]
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
memset-avx512-unaligned-erms.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
__memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
__memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
__memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
__memset_sse2_unaligned_erms, __memset_erms,
__memset_avx2_unaligned, __memset_avx2_unaligned_erms,
__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New
file.
* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:
Likewise.
Memset
---
sysdeps/x86_64/multiarch/Makefile | 5 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 33 +++
.../x86_64/multiarch/memset-avx2-unaligned-erms.S | 14 ++
.../multiarch/memset-avx512-unaligned-erms.S | 17 ++
.../x86_64/multiarch/memset-sse2-unaligned-erms.S | 16 ++
.../x86_64/multiarch/memset-vec-unaligned-erms.S | 246 +++++++++++++++++++++
6 files changed, 330 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
create mode 100644 sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
create mode 100644 sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
create mode 100644 sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index ef4dbc0..8878efb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -23,7 +23,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memset-avx512-no-vzeroupper \
memmove-sse2-unaligned-erms \
memmove-avx-unaligned-erms \
- memmove-avx512-unaligned-erms
+ memmove-avx512-unaligned-erms \
+ memset-sse2-unaligned-erms \
+ memset-avx2-unaligned-erms \
+ memset-avx512-unaligned-erms
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 9204da4..1e880f6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -118,12 +118,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, __memset_chk,
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_sse2)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_chk_avx2)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_chk_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_chk_avx2_unaligned_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memset_chk_avx512_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memset_chk_avx512_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
__memset_chk_avx512_no_vzeroupper)
#endif
)
@@ -131,12 +147,29 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memset.S. */
IFUNC_IMPL (i, name, memset,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
+ IFUNC_IMPL_ADD (array, i, memset, 1,
+ __memset_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset, 1,
+ __memset_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_avx2)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_avx2_unaligned_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memset_avx512_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memset_avx512_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
__memset_avx512_no_vzeroupper)
#endif
)
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
new file mode 100644
index 0000000..e0dc565
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -0,0 +1,14 @@
+#define VEC_SIZE 32
+#define VEC(i) ymm##i
+#define VMOVU vmovdqu
+#define VMOVA vmovdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ vmovd d, %xmm0; \
+ movq r, %rax; \
+ vpbroadcastb %xmm0, %ymm0
+
+#define SECTION(p) p##.avx
+#define MEMSET_SYMBOL(p,s) p##_avx2_##s
+
+#include "memset-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
new file mode 100644
index 0000000..72f4095
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -0,0 +1,17 @@
+#ifdef HAVE_AVX512_ASM_SUPPORT
+# define VEC_SIZE 64
+# define VEC(i) zmm##i
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
+# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ vmovd d, %xmm0; \
+ movq r, %rax; \
+ vpbroadcastb %xmm0, %xmm0; \
+ vpbroadcastq %xmm0, %zmm0
+
+# define SECTION(p) p##.avx512
+# define MEMSET_SYMBOL(p,s) p##_avx512_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
new file mode 100644
index 0000000..437a858
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -0,0 +1,16 @@
+#define VEC_SIZE 16
+#define VEC(i) xmm##i
+#define VMOVU movdqu
+#define VMOVA movdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movd d, %xmm0; \
+ movq r, %rax; \
+ punpcklbw %xmm0, %xmm0; \
+ punpcklwd %xmm0, %xmm0; \
+ pshufd $0, %xmm0, %xmm0
+
+#define SECTION(p) p
+#define MEMSET_SYMBOL(p,s) p##_sse2_##s
+
+#include "memset-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
new file mode 100644
index 0000000..dd04789
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -0,0 +1,246 @@
+/* memset/bzero with unaligned store and rep stosb
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* memset is implemented as:
+ 1. Use overlapping store to avoid branch.
+ 2. Force 32-bit displacement for branches to avoid long nop between
+ instructions.
+ 3. If size is less than VEC, use integer register stores.
+ 4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
+ 5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
+ 6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+ 4 VEC stores and store 4 * VEC at a time until done.
+ */
+#include <sysdep.h>
+
+#ifndef VZEROUPPER
+# if VEC_SIZE > 16
+# define VZEROUPPER vzeroupper
+# else
+# define VZEROUPPER
+# endif
+#endif
+
+#ifndef VZEROUPPER_SHORT_RETURN
+# if VEC_SIZE > 16
+# define VZEROUPPER_SHORT_RETURN vzeroupper
+# else
+# define VZEROUPPER_SHORT_RETURN rep
+# endif
+#endif
+
+#ifndef MOVQ
+# if VEC_SIZE > 16
+# define MOVQ vmovq
+# else
+# define MOVQ movq
+# endif
+#endif
+
+/* Threshold to use Enhanced REP STOSB. */
+#ifndef REP_STOSB_THRESHOLD
+# define REP_STOSB_THRESHOLD (1024 * (VEC_SIZE / 16))
+#endif
+
+#ifndef SECTION
+# error SECTION is not defined!
+#endif
+
+#if !defined USE_MULTIARCH && IS_IN (libc)
+ .section SECTION(.text),"ax",@progbits
+ENTRY (__bzero)
+ movq %rdi, %rax /* Set return value. */
+ movq %rsi, %rdx /* Set n. */
+ pxor %xmm0, %xmm0
+ jmp L(entry_from_bzero)
+END (__bzero)
+weak_alias (__bzero, bzero)
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+#endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+L(memset_entry):
+ VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+L(entry_from_bzero):
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(0), (%rdi)
+ VZEROUPPER
+ ret
+END (MEMSET_SYMBOL (__memset, unaligned))
+
+#if VEC_SIZE == 16
+/* Only used to measure performance of REP STOSB. */
+ENTRY (__memset_erms)
+#else
+/* Provide a symbol to debugger. */
+ENTRY (MEMSET_SYMBOL (__memset, erms))
+#endif
+L(stosb):
+ movq %rdx, %rcx
+ movzbl %sil, %eax
+ movq %rdi, %rdx
+ rep stosb
+ movq %rdx, %rax
+ ret
+#if VEC_SIZE == 16
+END (__memset_erms)
+#else
+END (MEMSET_SYMBOL (__memset, erms))
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+#endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(stosb_more_2x_vec)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(0), (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(stosb_more_2x_vec):
+ cmpq $REP_STOSB_THRESHOLD, %rdx
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ ja.d32 L(stosb)
+ .p2align 4
+L(more_2x_vec):
+ cmpq $(VEC_SIZE * 4), %rdx
+ ja L(loop_start)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(0), VEC_SIZE(%rdi)
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+L(return):
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(loop_start):
+ leaq (VEC_SIZE * 4)(%rdi), %rcx
+ VMOVU %VEC(0), (%rdi)
+ andq $-(VEC_SIZE * 4), %rcx
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(0), VEC_SIZE(%rdi)
+ VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+ VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
+ VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
+ addq %rdi, %rdx
+ andq $-(VEC_SIZE * 4), %rdx
+ cmpq %rdx, %rcx
+# if VEC_SIZE == 32 || VEC_SIZE == 64
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ je.d32 L(return)
+# else
+ je L(return)
+# endif
+ .p2align 4
+L(loop):
+ VMOVA %VEC(0), (%rcx)
+ VMOVA %VEC(0), VEC_SIZE(%rcx)
+ VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
+ VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
+ addq $(VEC_SIZE * 4), %rcx
+ cmpq %rcx, %rdx
+ jne L(loop)
+ VZEROUPPER_SHORT_RETURN
+ ret
+L(less_vec):
+ /* Less than 1 VEC. */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 32
+ cmpb $32, %dl
+ jae L(between_32_63)
+# endif
+# if VEC_SIZE > 16
+ cmpb $16, %dl
+ jae L(between_16_31)
+# endif
+ MOVQ %xmm0, %rcx
+ cmpb $8, %dl
+ jae L(between_8_15)
+ cmpb $4, %dl
+ jae L(between_4_7)
+ cmpb $1, %dl
+ ja L(between_2_3)
+ jb 1f
+ movb %cl, (%rdi)
+1:
+ VZEROUPPER
+ ret
+# if VEC_SIZE > 32
+ /* From 32 to 63. No branch when size == 32. */
+L(between_32_63):
+ vmovdqu %ymm0, -32(%rdi,%rdx)
+ vmovdqu %ymm0, (%rdi)
+ VZEROUPPER
+ ret
+# endif
+# if VEC_SIZE > 16
+ /* From 16 to 31. No branch when size == 16. */
+L(between_16_31):
+ vmovdqu %xmm0, -16(%rdi,%rdx)
+ vmovdqu %xmm0, (%rdi)
+ VZEROUPPER
+ ret
+# endif
+ /* From 8 to 15. No branch when size == 8. */
+L(between_8_15):
+ movq %rcx, -8(%rdi,%rdx)
+ movq %rcx, (%rdi)
+ VZEROUPPER
+ ret
+L(between_4_7):
+ /* From 4 to 7. No branch when size == 4. */
+ movl %ecx, -4(%rdi,%rdx)
+ movl %ecx, (%rdi)
+ VZEROUPPER
+ ret
+L(between_2_3):
+ /* From 2 to 3. No branch when size == 2. */
+ movw %cx, -2(%rdi,%rdx)
+ movw %cx, (%rdi)
+ VZEROUPPER
+ ret
+END (MEMSET_SYMBOL (__memset, unaligned_erms))
--
2.5.5
* Re: Subject: [PATCH] Add x86-64 memset with unaligned store and rep stosb
From: Carlos O'Donell @ 2016-03-30 23:18 UTC
To: H.J. Lu, GNU C Library
On 03/29/2016 03:23 PM, H.J. Lu wrote:
> The goal of this patch is to replace SSE2 and AVX2 memset.S
> with faster and smaller alternatives, also support 64-byte vector
> register size. bench-memset data on various Intel and AMD
> processors is at
>
> https://sourceware.org/bugzilla/show_bug.cgi?id=19881
>
> Any comments, feedbacks?
Caveats about Penryn being slower apply here, and I expect your answer
is the same: the selection of the ifunc will not change, and so Penryn
will not use the newer versions.
This looks good to me.
Again, same question about thresholding below.
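The ifunc selection referred to above is resolved once, at load time,
from CPU feature bits, which is why a CPU that lacks the relevant
features keeps whatever baseline implementation the resolver falls back
to.  A hypothetical GCC-style resolver, shown only as an illustration
and not the actual glibc selector (the function names and the byte-loop
stand-ins are made up; requires GCC on an ELF target with ifunc
support):

#include <stddef.h>

/* Trivial stand-in implementations; real ones would differ.  */
static void *
memset_baseline (void *dst, int c, size_t n)
{
  unsigned char *p = dst;
  while (n--)
    *p++ = (unsigned char) c;
  return dst;
}

static void *
memset_avx2_unaligned (void *dst, int c, size_t n)
{
  return memset_baseline (dst, c, n);  /* placeholder for an AVX2 variant */
}

/* Resolver: runs once at load time and picks an implementation.  */
static void *(*resolve_memset (void)) (void *, int, size_t)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2"))
    return memset_avx2_unaligned;
  return memset_baseline;  /* CPUs without AVX2 stay here.  */
}

void *my_memset (void *dst, int c, size_t n)
  __attribute__ ((ifunc ("resolve_memset")));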
> -- H.J.
>
>
> 0001-Add-x86-64-memset-with-unaligned-store-and-rep-stosb.patch
>
>
> From d0d3495951be16568656971dd2c825da68c2660c Mon Sep 17 00:00:00 2001
> From: "H.J. Lu" <hjl.tools@gmail.com>
> Date: Fri, 25 Mar 2016 08:20:17 -0700
> Subject: [PATCH] Add x86-64 memset with unaligned store and rep stosb
>
> Implement x86-64 memset with unaligned store and rep movsb. Support
> 16-byte, 32-byte and 64-byte vector register sizes. A single file
> provides 2 implementations of memset, one with rep stosb and the other
> without rep stosb. They share the same codes when size is between 2
> times of vector register size and REP_STOSB_THRESHOLD which is 1KB for
> 16-byte vector register size and scaled up by larger vector register
> size.
>
> Key features:
>
> 1. Use overlapping store to avoid branch.
> 2. For size <= 4 times of vector register size, fully unroll the loop.
> 3. For size > 4 times of vector register size, store 4 times of vector
> register size at a time.
>
> [BZ #19881]
> * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
> memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
> memset-avx512-unaligned-erms.
> * sysdeps/x86_64/multiarch/ifunc-impl-list.c
> (__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
> __memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
> __memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
> __memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
> __memset_sse2_unaligned_erms, __memset_erms,
> __memset_avx2_unaligned, __memset_avx2_unaligned_erms,
> __memset_avx512_unaligned_erms and __memset_avx512_unaligned.
> * sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New
> file.
> * sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
> Likewise.
> * sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S:
> Likewise.
> * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:
> Likewise.
>
> Memset
> ---
> sysdeps/x86_64/multiarch/Makefile | 5 +-
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 33 +++
> .../x86_64/multiarch/memset-avx2-unaligned-erms.S | 14 ++
> .../multiarch/memset-avx512-unaligned-erms.S | 17 ++
> .../x86_64/multiarch/memset-sse2-unaligned-erms.S | 16 ++
> .../x86_64/multiarch/memset-vec-unaligned-erms.S | 246 +++++++++++++++++++++
> 6 files changed, 330 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> create mode 100644 sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> create mode 100644 sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> create mode 100644 sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index ef4dbc0..8878efb 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -23,7 +23,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
> memset-avx512-no-vzeroupper \
> memmove-sse2-unaligned-erms \
> memmove-avx-unaligned-erms \
> - memmove-avx512-unaligned-erms
> + memmove-avx512-unaligned-erms \
> + memset-sse2-unaligned-erms \
> + memset-avx2-unaligned-erms \
> + memset-avx512-unaligned-erms
OK.
> CFLAGS-varshift.c += -msse4
> CFLAGS-strcspn-c.c += -msse4
> CFLAGS-strpbrk-c.c += -msse4
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 9204da4..1e880f6 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -118,12 +118,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL (i, name, __memset_chk,
> IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
> __memset_chk_sse2)
> + IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
> + __memset_chk_sse2_unaligned)
> + IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
> + __memset_chk_sse2_unaligned_erms)
> IFUNC_IMPL_ADD (array, i, __memset_chk,
> HAS_ARCH_FEATURE (AVX2_Usable),
> __memset_chk_avx2)
> + IFUNC_IMPL_ADD (array, i, __memset_chk,
> + HAS_ARCH_FEATURE (AVX2_Usable),
> + __memset_chk_avx2_unaligned)
> + IFUNC_IMPL_ADD (array, i, __memset_chk,
> + HAS_ARCH_FEATURE (AVX2_Usable),
> + __memset_chk_avx2_unaligned_erms)
> #ifdef HAVE_AVX512_ASM_SUPPORT
> IFUNC_IMPL_ADD (array, i, __memset_chk,
> HAS_ARCH_FEATURE (AVX512F_Usable),
> + __memset_chk_avx512_unaligned_erms)
> + IFUNC_IMPL_ADD (array, i, __memset_chk,
> + HAS_ARCH_FEATURE (AVX512F_Usable),
> + __memset_chk_avx512_unaligned)
> + IFUNC_IMPL_ADD (array, i, __memset_chk,
> + HAS_ARCH_FEATURE (AVX512F_Usable),
> __memset_chk_avx512_no_vzeroupper)
> #endif
> )
> @@ -131,12 +147,29 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> /* Support sysdeps/x86_64/multiarch/memset.S. */
> IFUNC_IMPL (i, name, memset,
> IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
> + IFUNC_IMPL_ADD (array, i, memset, 1,
> + __memset_sse2_unaligned)
> + IFUNC_IMPL_ADD (array, i, memset, 1,
> + __memset_sse2_unaligned_erms)
> + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
> IFUNC_IMPL_ADD (array, i, memset,
> HAS_ARCH_FEATURE (AVX2_Usable),
> __memset_avx2)
> + IFUNC_IMPL_ADD (array, i, memset,
> + HAS_ARCH_FEATURE (AVX2_Usable),
> + __memset_avx2_unaligned)
> + IFUNC_IMPL_ADD (array, i, memset,
> + HAS_ARCH_FEATURE (AVX2_Usable),
> + __memset_avx2_unaligned_erms)
> #ifdef HAVE_AVX512_ASM_SUPPORT
> IFUNC_IMPL_ADD (array, i, memset,
> HAS_ARCH_FEATURE (AVX512F_Usable),
> + __memset_avx512_unaligned_erms)
> + IFUNC_IMPL_ADD (array, i, memset,
> + HAS_ARCH_FEATURE (AVX512F_Usable),
> + __memset_avx512_unaligned)
> + IFUNC_IMPL_ADD (array, i, memset,
> + HAS_ARCH_FEATURE (AVX512F_Usable),
> __memset_avx512_no_vzeroupper)
> #endif
> )
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> new file mode 100644
> index 0000000..e0dc565
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -0,0 +1,14 @@
> +#define VEC_SIZE 32
> +#define VEC(i) ymm##i
> +#define VMOVU vmovdqu
> +#define VMOVA vmovdqa
> +
> +#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> + vmovd d, %xmm0; \
> + movq r, %rax; \
> + vpbroadcastb %xmm0, %ymm0
> +
> +#define SECTION(p) p##.avx
> +#define MEMSET_SYMBOL(p,s) p##_avx2_##s
OK.
> +
> +#include "memset-vec-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> new file mode 100644
> index 0000000..72f4095
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -0,0 +1,17 @@
> +#ifdef HAVE_AVX512_ASM_SUPPORT
> +# define VEC_SIZE 64
> +# define VEC(i) zmm##i
> +# define VMOVU vmovdqu64
> +# define VMOVA vmovdqa64
> +
> +# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> + vmovd d, %xmm0; \
> + movq r, %rax; \
> + vpbroadcastb %xmm0, %xmm0; \
> + vpbroadcastq %xmm0, %zmm0
> +
> +# define SECTION(p) p##.avx512
> +# define MEMSET_SYMBOL(p,s) p##_avx512_##s
OK.
> +
> +# include "memset-vec-unaligned-erms.S"
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> new file mode 100644
> index 0000000..437a858
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> @@ -0,0 +1,16 @@
> +#define VEC_SIZE 16
> +#define VEC(i) xmm##i
> +#define VMOVU movdqu
> +#define VMOVA movdqa
> +
> +#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> + movd d, %xmm0; \
> + movq r, %rax; \
> + punpcklbw %xmm0, %xmm0; \
> + punpcklwd %xmm0, %xmm0; \
> + pshufd $0, %xmm0, %xmm0
> +
> +#define SECTION(p) p
> +#define MEMSET_SYMBOL(p,s) p##_sse2_##s
OK.
> +
> +#include "memset-vec-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> new file mode 100644
> index 0000000..dd04789
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -0,0 +1,246 @@
> +/* memset/bzero with unaligned store and rep stosb
> + Copyright (C) 2016 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +/* memset is implemented as:
> + 1. Use overlapping store to avoid branch.
> + 2. Force 32-bit displacement for branches to avoid long nop between
> + instructions.
> + 3. If size is less than VEC, use integer register stores.
> + 4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
> + 5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
> + 6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
> + 4 VEC stores and store 4 * VEC at a time until done.
> + */
Use GNU formatting please.
e.g.
/* foo */
not
/* foo
*/
> +#include <sysdep.h>
> +
> +#ifndef VZEROUPPER
> +# if VEC_SIZE > 16
> +# define VZEROUPPER vzeroupper
> +# else
> +# define VZEROUPPER
> +# endif
> +#endif
> +
> +#ifndef VZEROUPPER_SHORT_RETURN
> +# if VEC_SIZE > 16
> +# define VZEROUPPER_SHORT_RETURN vzeroupper
> +# else
> +# define VZEROUPPER_SHORT_RETURN rep
> +# endif
> +#endif
> +
> +#ifndef MOVQ
> +# if VEC_SIZE > 16
> +# define MOVQ vmovq
> +# else
> +# define MOVQ movq
> +# endif
> +#endif
> +
> +/* Threshold to use Enhanced REP STOSB. */
> +#ifndef REP_STOSB_THRESHOLD
> +# define REP_STOSB_THRESHOLD (1024 * (VEC_SIZE / 16))
Same question as your other patch. How are we selecting this threshold?
> +#endif
> +
> +#ifndef SECTION
> +# error SECTION is not defined!
> +#endif
> +
> +#if !defined USE_MULTIARCH && IS_IN (libc)
> + .section SECTION(.text),"ax",@progbits
> +ENTRY (__bzero)
> + movq %rdi, %rax /* Set return value. */
> + movq %rsi, %rdx /* Set n. */
> + pxor %xmm0, %xmm0
> + jmp L(entry_from_bzero)
> +END (__bzero)
> +weak_alias (__bzero, bzero)
> +#endif
> +
> +#if defined SHARED && IS_IN (libc)
> +ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
> + cmpq %rdx, %rcx
> + jb HIDDEN_JUMPTARGET (__chk_fail)
> +END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
> +#endif
> +
> +ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> +L(memset_entry):
> + VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> +L(entry_from_bzero):
> + cmpq $VEC_SIZE, %rdx
> + jb L(less_vec)
> + cmpq $(VEC_SIZE * 2), %rdx
> + ja L(more_2x_vec)
> + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
> + VMOVU %VEC(0), (%rdi)
> + VZEROUPPER
> + ret
> +END (MEMSET_SYMBOL (__memset, unaligned))
> +
> +#if VEC_SIZE == 16
> +/* Only used to measure performance of REP STOSB. */
> +ENTRY (__memset_erms)
> +#else
> +/* Provide a symbol to debugger. */
> +ENTRY (MEMSET_SYMBOL (__memset, erms))
> +#endif
> +L(stosb):
> + movq %rdx, %rcx
> + movzbl %sil, %eax
> + movq %rdi, %rdx
> + rep stosb
> + movq %rdx, %rax
> + ret
> +#if VEC_SIZE == 16
> +END (__memset_erms)
> +#else
> +END (MEMSET_SYMBOL (__memset, erms))
> +#endif
> +
> +#if defined SHARED && IS_IN (libc)
> +ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
> + cmpq %rdx, %rcx
> + jb HIDDEN_JUMPTARGET (__chk_fail)
> +END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
> +#endif
> +
> +ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
> + VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> + cmpq $VEC_SIZE, %rdx
> + jb L(less_vec)
> + cmpq $(VEC_SIZE * 2), %rdx
> + ja L(stosb_more_2x_vec)
> + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
> + VMOVU %VEC(0), (%rdi)
> + VZEROUPPER
> + ret
> +
> + .p2align 4
> +L(stosb_more_2x_vec):
> + cmpq $REP_STOSB_THRESHOLD, %rdx
> + /* Force 32-bit displacement to avoid long nop between
> + instructions. */
> + ja.d32 L(stosb)
> + .p2align 4
> +L(more_2x_vec):
> + cmpq $(VEC_SIZE * 4), %rdx
> + ja L(loop_start)
> + VMOVU %VEC(0), (%rdi)
> + VMOVU %VEC(0), VEC_SIZE(%rdi)
> + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
> + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
> +L(return):
> + VZEROUPPER
> + ret
> +
> + .p2align 4
> +L(loop_start):
> + leaq (VEC_SIZE * 4)(%rdi), %rcx
> + VMOVU %VEC(0), (%rdi)
> + andq $-(VEC_SIZE * 4), %rcx
> + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
> + VMOVU %VEC(0), VEC_SIZE(%rdi)
> + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
> + VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
> + VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
> + VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
> + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
> + addq %rdi, %rdx
> + andq $-(VEC_SIZE * 4), %rdx
> + cmpq %rdx, %rcx
> +# if VEC_SIZE == 32 || VEC_SIZE == 64
> + /* Force 32-bit displacement to avoid long nop between
> + instructions. */
> + je.d32 L(return)
> +# else
> + je L(return)
> +# endif
> + .p2align 4
> +L(loop):
> + VMOVA %VEC(0), (%rcx)
> + VMOVA %VEC(0), VEC_SIZE(%rcx)
> + VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
> + VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
> + addq $(VEC_SIZE * 4), %rcx
> + cmpq %rcx, %rdx
> + jne L(loop)
> + VZEROUPPER_SHORT_RETURN
> + ret
> +L(less_vec):
> + /* Less than 1 VEC. */
> +# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> +# error Unsupported VEC_SIZE!
> +# endif
> +# if VEC_SIZE > 32
> + cmpb $32, %dl
> + jae L(between_32_63)
> +# endif
> +# if VEC_SIZE > 16
> + cmpb $16, %dl
> + jae L(between_16_31)
> +# endif
> + MOVQ %xmm0, %rcx
> + cmpb $8, %dl
> + jae L(between_8_15)
> + cmpb $4, %dl
> + jae L(between_4_7)
> + cmpb $1, %dl
> + ja L(between_2_3)
> + jb 1f
> + movb %cl, (%rdi)
> +1:
> + VZEROUPPER
> + ret
> +# if VEC_SIZE > 32
> + /* From 32 to 63. No branch when size == 32. */
> +L(between_32_63):
> + vmovdqu %ymm0, -32(%rdi,%rdx)
> + vmovdqu %ymm0, (%rdi)
> + VZEROUPPER
> + ret
> +# endif
> +# if VEC_SIZE > 16
> + /* From 16 to 31. No branch when size == 16. */
> +L(between_16_31):
> + vmovdqu %xmm0, -16(%rdi,%rdx)
> + vmovdqu %xmm0, (%rdi)
> + VZEROUPPER
> + ret
> +# endif
> + /* From 8 to 15. No branch when size == 8. */
> +L(between_8_15):
> + movq %rcx, -8(%rdi,%rdx)
> + movq %rcx, (%rdi)
> + VZEROUPPER
> + ret
> +L(between_4_7):
> + /* From 4 to 7. No branch when size == 4. */
> + movl %ecx, -4(%rdi,%rdx)
> + movl %ecx, (%rdi)
> + VZEROUPPER
> + ret
> +L(between_2_3):
> + /* From 2 to 3. No branch when size == 2. */
> + movw %cx, -2(%rdi,%rdx)
> + movw %cx, (%rdi)
> + VZEROUPPER
> + ret
> +END (MEMSET_SYMBOL (__memset, unaligned_erms))
> -- 2.5.5
OK.
--
Cheers,
Carlos.
* Re: Subject: [PATCH] Add x86-64 memset with unaligned store and rep stosb
From: H.J. Lu @ 2016-03-31 16:04 UTC
To: Carlos O'Donell; +Cc: GNU C Library
[-- Attachment #1: Type: text/plain, Size: 755 bytes --]
On Wed, Mar 30, 2016 at 4:18 PM, Carlos O'Donell <carlos@redhat.com> wrote:
> On 03/29/2016 03:23 PM, H.J. Lu wrote:
>> The goal of this patch is to replace SSE2 and AVX2 memset.S
>> with faster and smaller alternatives, also support 64-byte vector
>> register size. bench-memset data on various Intel and AMD
>> processors is at
>>
>> https://sourceware.org/bugzilla/show_bug.cgi?id=19881
>>
>> Any comments, feedbacks?
>
> Caveats about Penryn being slower apply here, and I expect your answer
> is the same: the selection of the ifunc will not change, and so Penryn
> will not use the newer versions.
>
> This looks good to me.
>
> Again, same question about thresholding below.
>
Here is the updated patch I am going to check in.
Thanks.
--
H.J.
[-- Attachment #2: 0001-Add-x86-64-memset-with-unaligned-store-and-rep-stosb.patch --]
[-- Type: text/x-patch, Size: 14650 bytes --]
From 7df7c6a195d6bc6ffdd90db0786d5de9c67d037a Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 25 Mar 2016 08:20:17 -0700
Subject: [PATCH] Add x86-64 memset with unaligned store and rep stosb
Implement x86-64 memset with unaligned store and rep stosb.  Support
16-byte, 32-byte and 64-byte vector register sizes.  A single file
provides two implementations of memset, one with rep stosb and the
other without.  They share the same code when the size is between 2
times the vector register size and REP_STOSB_THRESHOLD, which defaults
to 2KB.
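For sizes above REP_STOSB_THRESHOLD the erms variant hands the whole
operation to REP STOSB: the fill byte goes in AL, the count in RCX and
the destination in RDI.  A minimal GNU C inline-asm sketch of that
sequence (an illustration only, not the patch's assembly; the function
name is made up):

#include <stddef.h>

static void *
memset_stosb (void *dst, int c, size_t n)
{
  void *d = dst;
  /* REP STOSB stores AL to (%rdi) RCX times; the System V ABI
     guarantees the direction flag is clear, so %rdi advances upward.  */
  __asm__ volatile ("rep stosb"
                    : "+D" (d), "+c" (n)
                    : "a" (c)
                    : "memory");
  return dst;
}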
Key features:
1. Use overlapping stores to avoid branches.
2. For sizes <= 4 times the vector register size, fully unroll the loop.
3. For sizes > 4 times the vector register size, store 4 times the
vector register size at a time.
[BZ #19881]
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
memset-avx512-unaligned-erms.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
__memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
__memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
__memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
__memset_sse2_unaligned_erms, __memset_erms,
__memset_avx2_unaligned, __memset_avx2_unaligned_erms,
__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New
file.
* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:
Likewise.
---
sysdeps/x86_64/multiarch/Makefile | 5 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 33 +++
.../x86_64/multiarch/memset-avx2-unaligned-erms.S | 14 ++
.../multiarch/memset-avx512-unaligned-erms.S | 17 ++
.../x86_64/multiarch/memset-sse2-unaligned-erms.S | 16 ++
.../x86_64/multiarch/memset-vec-unaligned-erms.S | 251 +++++++++++++++++++++
6 files changed, 335 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
create mode 100644 sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
create mode 100644 sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
create mode 100644 sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index ef4dbc0..8878efb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -23,7 +23,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memset-avx512-no-vzeroupper \
memmove-sse2-unaligned-erms \
memmove-avx-unaligned-erms \
- memmove-avx512-unaligned-erms
+ memmove-avx512-unaligned-erms \
+ memset-sse2-unaligned-erms \
+ memset-avx2-unaligned-erms \
+ memset-avx512-unaligned-erms
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 9204da4..1e880f6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -118,12 +118,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, __memset_chk,
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_sse2)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_chk_avx2)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_chk_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_chk_avx2_unaligned_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memset_chk_avx512_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memset_chk_avx512_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
__memset_chk_avx512_no_vzeroupper)
#endif
)
@@ -131,12 +147,29 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memset.S. */
IFUNC_IMPL (i, name, memset,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
+ IFUNC_IMPL_ADD (array, i, memset, 1,
+ __memset_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset, 1,
+ __memset_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_avx2)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_avx2_unaligned_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memset_avx512_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memset_avx512_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
__memset_avx512_no_vzeroupper)
#endif
)
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
new file mode 100644
index 0000000..e0dc565
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -0,0 +1,14 @@
+#define VEC_SIZE 32
+#define VEC(i) ymm##i
+#define VMOVU vmovdqu
+#define VMOVA vmovdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ vmovd d, %xmm0; \
+ movq r, %rax; \
+ vpbroadcastb %xmm0, %ymm0
+
+#define SECTION(p) p##.avx
+#define MEMSET_SYMBOL(p,s) p##_avx2_##s
+
+#include "memset-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
new file mode 100644
index 0000000..72f4095
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -0,0 +1,17 @@
+#ifdef HAVE_AVX512_ASM_SUPPORT
+# define VEC_SIZE 64
+# define VEC(i) zmm##i
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
+# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ vmovd d, %xmm0; \
+ movq r, %rax; \
+ vpbroadcastb %xmm0, %xmm0; \
+ vpbroadcastq %xmm0, %zmm0
+
+# define SECTION(p) p##.avx512
+# define MEMSET_SYMBOL(p,s) p##_avx512_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
new file mode 100644
index 0000000..437a858
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -0,0 +1,16 @@
+#define VEC_SIZE 16
+#define VEC(i) xmm##i
+#define VMOVU movdqu
+#define VMOVA movdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movd d, %xmm0; \
+ movq r, %rax; \
+ punpcklbw %xmm0, %xmm0; \
+ punpcklwd %xmm0, %xmm0; \
+ pshufd $0, %xmm0, %xmm0
+
+#define SECTION(p) p
+#define MEMSET_SYMBOL(p,s) p##_sse2_##s
+
+#include "memset-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
new file mode 100644
index 0000000..9383517
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -0,0 +1,251 @@
+/* memset/bzero with unaligned store and rep stosb
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* memset is implemented as:
+ 1. Use overlapping store to avoid branch.
+ 2. Force 32-bit displacement for branches to avoid long nop between
+ instructions.
+ 3. If size is less than VEC, use integer register stores.
+ 4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
+ 5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
+ 6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+ 4 VEC stores and store 4 * VEC at a time until done. */
+
+#include <sysdep.h>
+
+#ifndef VZEROUPPER
+# if VEC_SIZE > 16
+# define VZEROUPPER vzeroupper
+# else
+# define VZEROUPPER
+# endif
+#endif
+
+#ifndef VZEROUPPER_SHORT_RETURN
+# if VEC_SIZE > 16
+# define VZEROUPPER_SHORT_RETURN vzeroupper
+# else
+# define VZEROUPPER_SHORT_RETURN rep
+# endif
+#endif
+
+#ifndef MOVQ
+# if VEC_SIZE > 16
+# define MOVQ vmovq
+# else
+# define MOVQ movq
+# endif
+#endif
+
+/* Threshold to use Enhanced REP STOSB. Since there is overhead to set
+ up REP STOSB operation, REP STOSB isn't faster on short data. The
+ memset micro benchmark in glibc shows that 2KB is the approximate
+ value above which REP STOSB becomes faster on processors with
+ Enhanced REP STOSB. Since the stored value is fixed, larger register
+ size has minimal impact on threshold. */
+#ifndef REP_STOSB_THRESHOLD
+# define REP_STOSB_THRESHOLD 2048
+#endif
+
+#ifndef SECTION
+# error SECTION is not defined!
+#endif
+
+#if !defined USE_MULTIARCH && IS_IN (libc)
+ .section SECTION(.text),"ax",@progbits
+ENTRY (__bzero)
+ movq %rdi, %rax /* Set return value. */
+ movq %rsi, %rdx /* Set n. */
+ pxor %xmm0, %xmm0
+ jmp L(entry_from_bzero)
+END (__bzero)
+weak_alias (__bzero, bzero)
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+#endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+L(memset_entry):
+ VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+L(entry_from_bzero):
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(0), (%rdi)
+ VZEROUPPER
+ ret
+END (MEMSET_SYMBOL (__memset, unaligned))
+
+#if VEC_SIZE == 16
+/* Only used to measure performance of REP STOSB. */
+ENTRY (__memset_erms)
+#else
+/* Provide a symbol to debugger. */
+ENTRY (MEMSET_SYMBOL (__memset, erms))
+#endif
+L(stosb):
+ movq %rdx, %rcx
+ movzbl %sil, %eax
+ movq %rdi, %rdx
+ rep stosb
+ movq %rdx, %rax
+ ret
+#if VEC_SIZE == 16
+END (__memset_erms)
+#else
+END (MEMSET_SYMBOL (__memset, erms))
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+#endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(stosb_more_2x_vec)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(0), (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(stosb_more_2x_vec):
+ cmpq $REP_STOSB_THRESHOLD, %rdx
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ ja.d32 L(stosb)
+ .p2align 4
+L(more_2x_vec):
+ cmpq $(VEC_SIZE * 4), %rdx
+ ja L(loop_start)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(0), VEC_SIZE(%rdi)
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+L(return):
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(loop_start):
+ leaq (VEC_SIZE * 4)(%rdi), %rcx
+ VMOVU %VEC(0), (%rdi)
+ andq $-(VEC_SIZE * 4), %rcx
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(0), VEC_SIZE(%rdi)
+ VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+ VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
+ VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
+ addq %rdi, %rdx
+ andq $-(VEC_SIZE * 4), %rdx
+ cmpq %rdx, %rcx
+# if VEC_SIZE == 32 || VEC_SIZE == 64
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ je.d32 L(return)
+# else
+ je L(return)
+# endif
+ .p2align 4
+L(loop):
+ VMOVA %VEC(0), (%rcx)
+ VMOVA %VEC(0), VEC_SIZE(%rcx)
+ VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
+ VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
+ addq $(VEC_SIZE * 4), %rcx
+ cmpq %rcx, %rdx
+ jne L(loop)
+ VZEROUPPER_SHORT_RETURN
+ ret
+L(less_vec):
+ /* Less than 1 VEC. */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 32
+ cmpb $32, %dl
+ jae L(between_32_63)
+# endif
+# if VEC_SIZE > 16
+ cmpb $16, %dl
+ jae L(between_16_31)
+# endif
+ MOVQ %xmm0, %rcx
+ cmpb $8, %dl
+ jae L(between_8_15)
+ cmpb $4, %dl
+ jae L(between_4_7)
+ cmpb $1, %dl
+ ja L(between_2_3)
+ jb 1f
+ movb %cl, (%rdi)
+1:
+ VZEROUPPER
+ ret
+# if VEC_SIZE > 32
+ /* From 32 to 63. No branch when size == 32. */
+L(between_32_63):
+ vmovdqu %ymm0, -32(%rdi,%rdx)
+ vmovdqu %ymm0, (%rdi)
+ VZEROUPPER
+ ret
+# endif
+# if VEC_SIZE > 16
+ /* From 16 to 31. No branch when size == 16. */
+L(between_16_31):
+ vmovdqu %xmm0, -16(%rdi,%rdx)
+ vmovdqu %xmm0, (%rdi)
+ VZEROUPPER
+ ret
+# endif
+ /* From 8 to 15. No branch when size == 8. */
+L(between_8_15):
+ movq %rcx, -8(%rdi,%rdx)
+ movq %rcx, (%rdi)
+ VZEROUPPER
+ ret
+L(between_4_7):
+ /* From 4 to 7. No branch when size == 4. */
+ movl %ecx, -4(%rdi,%rdx)
+ movl %ecx, (%rdi)
+ VZEROUPPER
+ ret
+L(between_2_3):
+ /* From 2 to 3. No branch when size == 2. */
+ movw %cx, -2(%rdi,%rdx)
+ movw %cx, (%rdi)
+ VZEROUPPER
+ ret
+END (MEMSET_SYMBOL (__memset, unaligned_erms))
--
2.5.5