* [PATCH v1] x86: Improve vec generation in memset-vec-unaligned-erms.S
@ 2022-02-05 22:42 Noah Goldstein
2022-02-05 22:50 ` H.J. Lu
2022-02-06 6:54 ` [PATCH v2] " Noah Goldstein
0 siblings, 2 replies; 6+ messages in thread
From: Noah Goldstein @ 2022-02-05 22:42 UTC (permalink / raw)
To: libc-alpha
No bug.
Split vec generation into multiple steps. This allows the
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
case. This saves an expensive lane-cross instruction and removes
the need for 'vzeroupper'.
For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
byte broadcast.
Results for memset-avx2 small (geomean of N = 20 benchset runs).
size, New Time, Old Time, New / Old
1, 5.074, 4.399, 0.867
2, 4.433, 4.411, 0.995
4, 4.487, 4.415, 0.984
8, 4.454, 4.396, 0.987
16, 4.502, 4.443, 0.987
All relevant string/wcsmbs tests are passing.
---
sysdeps/x86_64/memset.S | 21 ++-
.../multiarch/memset-avx2-unaligned-erms.S | 18 +-
.../multiarch/memset-avx512-unaligned-erms.S | 18 +-
.../multiarch/memset-evex-unaligned-erms.S | 18 +-
.../multiarch/memset-vec-unaligned-erms.S | 163 +++++++++++-------
5 files changed, 151 insertions(+), 87 deletions(-)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 65c09bd0ac..ccf036be53 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -28,17 +28,22 @@
#define VMOVU movups
#define VMOVA movaps
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
- movq r, %rax; \
- punpcklbw %xmm0, %xmm0; \
- punpcklwd %xmm0, %xmm0; \
- pshufd $0, %xmm0, %xmm0
+ pxor %xmm1, %xmm1; \
+ pshufb %xmm1, %xmm0; \
+ movq r, %rax
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
- movq r, %rax; \
- pshufd $0, %xmm0, %xmm0
+ pshufd $0, %xmm0, %xmm0; \
+ movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
#define SECTION(p) p
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 1af668af0a..c0bf2875d0 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -10,15 +10,18 @@
# define VMOVU vmovdqu
# define VMOVA vmovdqa
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastb %xmm0, %ymm0
+ movq r, %rax;
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
# ifndef SECTION
# define SECTION(p) p##.avx
@@ -30,5 +33,6 @@
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# endif
+# define USE_XMM_LESS_VEC
# include "memset-vec-unaligned-erms.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index f14d6f8493..5241216a77 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -15,13 +15,19 @@
# define VZEROUPPER
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastb d, %VEC0; \
+ movq r, %rax
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastd d, %VEC0; \
+ movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
# define SECTION(p) p##.evex512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 64b09e77cc..6370021506 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -15,13 +15,19 @@
# define VZEROUPPER
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastb d, %VEC0; \
+ movq r, %rax
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastd d, %VEC0; \
+ movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
# define SECTION(p) p##.evex
# define MEMSET_SYMBOL(p,s) p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 1e0511c79a..fb9053f4d6 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -58,8 +58,10 @@
#ifndef MOVQ
# if VEC_SIZE > 16
# define MOVQ vmovq
+# define MOVD vmovd
# else
# define MOVQ movq
+# define MOVD movd
# endif
#endif
@@ -72,9 +74,17 @@
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
# define END_REG rcx
# define LOOP_REG rdi
+# define LESS_VEC_REG rax
#else
# define END_REG rdi
# define LOOP_REG rdx
+# define LESS_VEC_REG rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL 1
+#else
+# define XMM_SMALL 0
#endif
#define PAGE_SIZE 4096
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
shl $2, %RDX_LP
- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
- jmp L(entry_from_bzero)
+ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+ WMEMSET_VDUP_TO_VEC0_LOW()
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec_no_vdup)
+ WMEMSET_VDUP_TO_VEC0_HIGH()
+ jmp L(entry_from_wmemset)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
mov %edx, %edx
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
L(entry_from_bzero):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
+ MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
cmpq $(VEC_SIZE * 2), %rdx
ja L(more_2x_vec)
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif
ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
mov %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
+ MEMSET_VDUP_TO_VEC0_HIGH ()
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(stosb_more_2x_vec)
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
- */
- VMOVU %VEC(0), (%rax)
- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
VZEROUPPER_RETURN
#endif
- .p2align 4,, 10
+ .p2align 4,, 4
L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
#else
VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
@@ -212,6 +228,7 @@ L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
.p2align 4,, 10
L(less_vec):
+L(less_vec_no_vdup):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
and (4x, 8x] jump to target. */
L(more_2x_vec):
-
- /* Two different methods of setting up pointers / compare. The
- two methods are based on the fact that EVEX/AVX512 mov
- instructions take more bytes then AVX2/SSE2 mov instructions. As
- well that EVEX/AVX512 machines also have fast LEA_BID. Both
- setup and END_REG to avoid complex address mode. For EVEX/AVX512
- this saves code size and keeps a few targets in one fetch block.
- For AVX2/SSE2 this helps prevent AGU bottlenecks. */
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
- LOOP_4X_OFFSET) with LEA_BID. */
-
- /* END_REG is rcx for EVEX/AVX512. */
- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
-#endif
-
- /* Stores to first 2x VEC before cmp as any path forward will
- require it. */
- VMOVU %VEC(0), (%rax)
- VMOVU %VEC(0), VEC_SIZE(%rax)
+ /* Store next 2x vec regardless. */
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
+ /* Two different methods of setting up pointers / compare. The two
+ methods are based on the fact that EVEX/AVX512 mov instructions take
+ more bytes than AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+ machines also have fast LEA_BID. Both setup and END_REG to avoid complex
+ address mode. For EVEX/AVX512 this saves code size and keeps a few
+ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
+ bottlenecks. */
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
/* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
addq %rdx, %END_REG
@@ -292,6 +299,15 @@ L(more_2x_vec):
cmpq $(VEC_SIZE * 4), %rdx
jbe L(last_2x_vec)
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+ LEA_BID. */
+
+ /* END_REG is rcx for EVEX/AVX512. */
+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
/* Store next 2x vec regardless. */
VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
@@ -355,65 +371,92 @@ L(stosb_local):
/* Define L(less_vec) only if not otherwise defined. */
.p2align 4
L(less_vec):
+ /* Broadcast esi to partial register (i.e. VEC_SIZE == 32 broadcast to
+ xmm). This only does anything for AVX2. */
+ MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_no_vdup):
#endif
L(cross_page):
#if VEC_SIZE > 32
cmpl $32, %edx
- jae L(between_32_63)
+ jge L(between_32_63)
#endif
#if VEC_SIZE > 16
cmpl $16, %edx
- jae L(between_16_31)
+ jge L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+ MOVQ %XMM0, %rcx
#endif
- MOVQ %XMM0, %rdi
cmpl $8, %edx
- jae L(between_8_15)
+ jge L(between_8_15)
cmpl $4, %edx
- jae L(between_4_7)
+ jge L(between_4_7)
cmpl $1, %edx
- ja L(between_2_3)
- jb L(return)
- movb %sil, (%rax)
- VZEROUPPER_RETURN
+ jg L(between_2_3)
+ jl L(return)
+ movb %sil, (%LESS_VEC_REG)
+ ret
- /* Align small targets only if not doing so would cross a fetch
- line. */
+ /* Align small targets only if not doing so would cross a fetch line.
+ */
#if VEC_SIZE > 32
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
- VMOVU %YMM0, (%rax)
- VMOVU %YMM0, -32(%rax, %rdx)
+ VMOVU %YMM0, (%LESS_VEC_REG)
+ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
VZEROUPPER_RETURN
#endif
#if VEC_SIZE >= 32
- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
L(between_16_31):
/* From 16 to 31. No branch when size == 16. */
- VMOVU %XMM0, (%rax)
- VMOVU %XMM0, -16(%rax, %rdx)
- VZEROUPPER_RETURN
+ VMOVU %XMM0, (%LESS_VEC_REG)
+ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
+ ret
#endif
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+ */
+ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
- movq %rdi, (%rax)
- movq %rdi, -8(%rax, %rdx)
- VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+ MOVQ %XMM0, (%rdi)
+ MOVQ %XMM0, -8(%rdi, %rdx)
+#else
+ movq %rcx, (%LESS_VEC_REG)
+ movq %rcx, -8(%LESS_VEC_REG, %rdx)
+#endif
+ ret
- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+ */
+ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
- movl %edi, (%rax)
- movl %edi, -4(%rax, %rdx)
- VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+ MOVD %XMM0, (%rdi)
+ MOVD %XMM0, -4(%rdi, %rdx)
+#else
+ movl %ecx, (%LESS_VEC_REG)
+ movl %ecx, -4(%LESS_VEC_REG, %rdx)
+#endif
+ ret
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ /* 4 * XMM_SMALL for the third mov for AVX2. */
+ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
- movw %di, (%rax)
- movb %dil, -1(%rax, %rdx)
- VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+ movb %sil, (%rdi)
+ movb %sil, 1(%rdi)
+ movb %sil, -1(%rdi, %rdx)
+#else
+ movw %cx, (%LESS_VEC_REG)
+ movb %sil, -1(%LESS_VEC_REG, %rdx)
+#endif
+ ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))
--
2.25.1
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v1] x86: Improve vec generation in memset-vec-unaligned-erms.S
2022-02-05 22:42 [PATCH v1] x86: Improve vec generation in memset-vec-unaligned-erms.S Noah Goldstein
@ 2022-02-05 22:50 ` H.J. Lu
2022-02-06 6:54 ` [PATCH v2] " Noah Goldstein
1 sibling, 0 replies; 6+ messages in thread
From: H.J. Lu @ 2022-02-05 22:50 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Sat, Feb 5, 2022 at 2:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug.
>
> Split vec generation into multiple steps. This allows the
> broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
> case. This saves an expensive lane-cross instruction and removes
> the need for 'vzeroupper'.
>
> For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
> byte broadcast.
I am working on the same code. Please give me a few hours.
> Results for memset-avx2 small (geomean of N = 20 benchset runs).
>
> size, New Time, Old Time, New / Old
> 1, 5.074, 4.399, 0.867
> 2, 4.433, 4.411, 0.995
> 4, 4.487, 4.415, 0.984
> 8, 4.454, 4.396, 0.987
> 16, 4.502, 4.443, 0.987
>
> All relevant string/wcsmbs tests are passing.
> ---
> sysdeps/x86_64/memset.S | 21 ++-
> .../multiarch/memset-avx2-unaligned-erms.S | 18 +-
> .../multiarch/memset-avx512-unaligned-erms.S | 18 +-
> .../multiarch/memset-evex-unaligned-erms.S | 18 +-
> .../multiarch/memset-vec-unaligned-erms.S | 163 +++++++++++-------
> 5 files changed, 151 insertions(+), 87 deletions(-)
>
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index 65c09bd0ac..ccf036be53 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -28,17 +28,22 @@
> #define VMOVU movups
> #define VMOVA movaps
>
> -#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> movd d, %xmm0; \
> - movq r, %rax; \
> - punpcklbw %xmm0, %xmm0; \
> - punpcklwd %xmm0, %xmm0; \
> - pshufd $0, %xmm0, %xmm0
> + pxor %xmm1, %xmm1; \
> + pshufb %xmm1, %xmm0; \
> + movq r, %rax
>
> -#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> movd d, %xmm0; \
> - movq r, %rax; \
> - pshufd $0, %xmm0, %xmm0
> + pshufd $0, %xmm0, %xmm0; \
> + movq r, %rax
> +
> +# define MEMSET_VDUP_TO_VEC0_HIGH()
> +# define MEMSET_VDUP_TO_VEC0_LOW()
> +
> +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> +# define WMEMSET_VDUP_TO_VEC0_LOW()
>
> #define SECTION(p) p
>
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index 1af668af0a..c0bf2875d0 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -10,15 +10,18 @@
> # define VMOVU vmovdqu
> # define VMOVA vmovdqa
>
> -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> vmovd d, %xmm0; \
> - movq r, %rax; \
> - vpbroadcastb %xmm0, %ymm0
> + movq r, %rax;
>
> -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> - vmovd d, %xmm0; \
> - movq r, %rax; \
> - vpbroadcastd %xmm0, %ymm0
> +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> + MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
> +
> +# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
> +# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
> +
> +# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
> +# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
>
> # ifndef SECTION
> # define SECTION(p) p##.avx
> @@ -30,5 +33,6 @@
> # define WMEMSET_SYMBOL(p,s) p##_avx2_##s
> # endif
>
> +# define USE_XMM_LESS_VEC
> # include "memset-vec-unaligned-erms.S"
> #endif
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index f14d6f8493..5241216a77 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -15,13 +15,19 @@
>
> # define VZEROUPPER
>
> -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> - movq r, %rax; \
> - vpbroadcastb d, %VEC0
> +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> + vpbroadcastb d, %VEC0; \
> + movq r, %rax
>
> -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> - movq r, %rax; \
> - vpbroadcastd d, %VEC0
> +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> + vpbroadcastd d, %VEC0; \
> + movq r, %rax
> +
> +# define MEMSET_VDUP_TO_VEC0_HIGH()
> +# define MEMSET_VDUP_TO_VEC0_LOW()
> +
> +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> +# define WMEMSET_VDUP_TO_VEC0_LOW()
>
> # define SECTION(p) p##.evex512
> # define MEMSET_SYMBOL(p,s) p##_avx512_##s
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index 64b09e77cc..6370021506 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -15,13 +15,19 @@
>
> # define VZEROUPPER
>
> -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> - movq r, %rax; \
> - vpbroadcastb d, %VEC0
> +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> + vpbroadcastb d, %VEC0; \
> + movq r, %rax
>
> -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> - movq r, %rax; \
> - vpbroadcastd d, %VEC0
> +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> + vpbroadcastd d, %VEC0; \
> + movq r, %rax
> +
> +# define MEMSET_VDUP_TO_VEC0_HIGH()
> +# define MEMSET_VDUP_TO_VEC0_LOW()
> +
> +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> +# define WMEMSET_VDUP_TO_VEC0_LOW()
>
> # define SECTION(p) p##.evex
> # define MEMSET_SYMBOL(p,s) p##_evex_##s
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 1e0511c79a..fb9053f4d6 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -58,8 +58,10 @@
> #ifndef MOVQ
> # if VEC_SIZE > 16
> # define MOVQ vmovq
> +# define MOVD vmovd
> # else
> # define MOVQ movq
> +# define MOVD movd
> # endif
> #endif
>
> @@ -72,9 +74,17 @@
> #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> # define END_REG rcx
> # define LOOP_REG rdi
> +# define LESS_VEC_REG rax
> #else
> # define END_REG rdi
> # define LOOP_REG rdx
> +# define LESS_VEC_REG rdi
> +#endif
> +
> +#ifdef USE_XMM_LESS_VEC
> +# define XMM_SMALL 1
> +#else
> +# define XMM_SMALL 0
> #endif
>
> #define PAGE_SIZE 4096
> @@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
>
> ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> shl $2, %RDX_LP
> - WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> - jmp L(entry_from_bzero)
> + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> + WMEMSET_VDUP_TO_VEC0_LOW()
> + cmpq $VEC_SIZE, %rdx
> + jb L(less_vec_no_vdup)
> + WMEMSET_VDUP_TO_VEC0_HIGH()
> + jmp L(entry_from_wmemset)
> END (WMEMSET_SYMBOL (__wmemset, unaligned))
> #endif
>
> @@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> #endif
>
> ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> mov %edx, %edx
> @@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> L(entry_from_bzero):
> cmpq $VEC_SIZE, %rdx
> jb L(less_vec)
> + MEMSET_VDUP_TO_VEC0_HIGH()
> +L(entry_from_wmemset):
> cmpq $(VEC_SIZE * 2), %rdx
> ja L(more_2x_vec)
> /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> @@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> # endif
>
> ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> mov %edx, %edx
> # endif
> cmp $VEC_SIZE, %RDX_LP
> jb L(less_vec)
> + MEMSET_VDUP_TO_VEC0_HIGH ()
> cmp $(VEC_SIZE * 2), %RDX_LP
> ja L(stosb_more_2x_vec)
> - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
> - */
> - VMOVU %VEC(0), (%rax)
> - VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
> + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> + VMOVU %VEC(0), (%rdi)
> + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> VZEROUPPER_RETURN
> #endif
>
> - .p2align 4,, 10
> + .p2align 4,, 4
> L(last_2x_vec):
> #ifdef USE_LESS_VEC_MASK_STORE
> - VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
> - VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
> + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
> + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> #else
> VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
> VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
> @@ -212,6 +228,7 @@ L(last_2x_vec):
> #ifdef USE_LESS_VEC_MASK_STORE
> .p2align 4,, 10
> L(less_vec):
> +L(less_vec_no_vdup):
> /* Less than 1 VEC. */
> # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> # error Unsupported VEC_SIZE!
> @@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
> /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
> and (4x, 8x] jump to target. */
> L(more_2x_vec):
> -
> - /* Two different methods of setting up pointers / compare. The
> - two methods are based on the fact that EVEX/AVX512 mov
> - instructions take more bytes then AVX2/SSE2 mov instructions. As
> - well that EVEX/AVX512 machines also have fast LEA_BID. Both
> - setup and END_REG to avoid complex address mode. For EVEX/AVX512
> - this saves code size and keeps a few targets in one fetch block.
> - For AVX2/SSE2 this helps prevent AGU bottlenecks. */
> -#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> - /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
> - LOOP_4X_OFFSET) with LEA_BID. */
> -
> - /* END_REG is rcx for EVEX/AVX512. */
> - leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> -#endif
> -
> - /* Stores to first 2x VEC before cmp as any path forward will
> - require it. */
> - VMOVU %VEC(0), (%rax)
> - VMOVU %VEC(0), VEC_SIZE(%rax)
> + /* Store next 2x vec regardless. */
> + VMOVU %VEC(0), (%rdi)
> + VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
>
>
> + /* Two different methods of setting up pointers / compare. The two
> + methods are based on the fact that EVEX/AVX512 mov instructions take
> + more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
> + machines also have fast LEA_BID. Both setup and END_REG to avoid complex
> + address mode. For EVEX/AVX512 this saves code size and keeps a few
> + targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
> + bottlenecks. */
> #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
> /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
> addq %rdx, %END_REG
> @@ -292,6 +299,15 @@ L(more_2x_vec):
> cmpq $(VEC_SIZE * 4), %rdx
> jbe L(last_2x_vec)
>
> +
> +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> + /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
> + LEA_BID. */
> +
> + /* END_REG is rcx for EVEX/AVX512. */
> + leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> +#endif
> +
> /* Store next 2x vec regardless. */
> VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
> VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
> @@ -355,65 +371,92 @@ L(stosb_local):
> /* Define L(less_vec) only if not otherwise defined. */
> .p2align 4
> L(less_vec):
> + /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
> + xmm). This is only does anything for AVX2. */
> + MEMSET_VDUP_TO_VEC0_LOW ()
> +L(less_vec_no_vdup):
> #endif
> L(cross_page):
> #if VEC_SIZE > 32
> cmpl $32, %edx
> - jae L(between_32_63)
> + jge L(between_32_63)
> #endif
> #if VEC_SIZE > 16
> cmpl $16, %edx
> - jae L(between_16_31)
> + jge L(between_16_31)
> +#endif
> +#ifndef USE_XMM_LESS_VEC
> + MOVQ %XMM0, %rcx
> #endif
> - MOVQ %XMM0, %rdi
> cmpl $8, %edx
> - jae L(between_8_15)
> + jge L(between_8_15)
> cmpl $4, %edx
> - jae L(between_4_7)
> + jge L(between_4_7)
> cmpl $1, %edx
> - ja L(between_2_3)
> - jb L(return)
> - movb %sil, (%rax)
> - VZEROUPPER_RETURN
> + jg L(between_2_3)
> + jl L(return)
> + movb %sil, (%LESS_VEC_REG)
> + ret
>
> - /* Align small targets only if not doing so would cross a fetch
> - line. */
> + /* Align small targets only if not doing so would cross a fetch line.
> + */
> #if VEC_SIZE > 32
> .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> /* From 32 to 63. No branch when size == 32. */
> L(between_32_63):
> - VMOVU %YMM0, (%rax)
> - VMOVU %YMM0, -32(%rax, %rdx)
> + VMOVU %YMM0, (%LESS_VEC_REG)
> + VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
> VZEROUPPER_RETURN
> #endif
>
> #if VEC_SIZE >= 32
> - .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
> L(between_16_31):
> /* From 16 to 31. No branch when size == 16. */
> - VMOVU %XMM0, (%rax)
> - VMOVU %XMM0, -16(%rax, %rdx)
> - VZEROUPPER_RETURN
> + VMOVU %XMM0, (%LESS_VEC_REG)
> + VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
> + ret
> #endif
>
> - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> + /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> + */
> + .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
> L(between_8_15):
> /* From 8 to 15. No branch when size == 8. */
> - movq %rdi, (%rax)
> - movq %rdi, -8(%rax, %rdx)
> - VZEROUPPER_RETURN
> +#ifdef USE_XMM_LESS_VEC
> + MOVQ %XMM0, (%rdi)
> + MOVQ %XMM0, -8(%rdi, %rdx)
> +#else
> + movq %rcx, (%LESS_VEC_REG)
> + movq %rcx, -8(%LESS_VEC_REG, %rdx)
> +#endif
> + ret
>
> - .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
> + /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> + */
> + .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
> L(between_4_7):
> /* From 4 to 7. No branch when size == 4. */
> - movl %edi, (%rax)
> - movl %edi, -4(%rax, %rdx)
> - VZEROUPPER_RETURN
> +#ifdef USE_XMM_LESS_VEC
> + MOVD %XMM0, (%rdi)
> + MOVD %XMM0, -4(%rdi, %rdx)
> +#else
> + movl %ecx, (%LESS_VEC_REG)
> + movl %ecx, -4(%LESS_VEC_REG, %rdx)
> +#endif
> + ret
>
> - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> + /* 4 * XMM_SMALL for the third mov for AVX2. */
> + .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
> L(between_2_3):
> /* From 2 to 3. No branch when size == 2. */
> - movw %di, (%rax)
> - movb %dil, -1(%rax, %rdx)
> - VZEROUPPER_RETURN
> +#ifdef USE_XMM_LESS_VEC
> + movb %sil, (%rdi)
> + movb %sil, 1(%rdi)
> + movb %sil, -1(%rdi, %rdx)
> +#else
> + movw %cx, (%LESS_VEC_REG)
> + movb %sil, -1(%LESS_VEC_REG, %rdx)
> +#endif
> + ret
> END (MEMSET_SYMBOL (__memset, unaligned_erms))
> --
> 2.25.1
>
--
H.J.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH v2] x86: Improve vec generation in memset-vec-unaligned-erms.S
2022-02-05 22:42 [PATCH v1] x86: Improve vec generation in memset-vec-unaligned-erms.S Noah Goldstein
2022-02-05 22:50 ` H.J. Lu
@ 2022-02-06 6:54 ` Noah Goldstein
2022-02-06 16:28 ` H.J. Lu
1 sibling, 1 reply; 6+ messages in thread
From: Noah Goldstein @ 2022-02-06 6:54 UTC (permalink / raw)
To: libc-alpha
No bug.
Split vec generation into multiple steps. This allows the
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
case. This saves an expensive lane-cross instruction and removes
the need for 'vzeroupper'.
For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
byte broadcast.
Results for memset-avx2 small (geomean of N = 20 benchset runs).
size, New Time, Old Time, New / Old
0, 4.100, 3.831, 0.934
1, 5.074, 4.399, 0.867
2, 4.433, 4.411, 0.995
4, 4.487, 4.415, 0.984
8, 4.454, 4.396, 0.987
16, 4.502, 4.443, 0.987
All relevant string/wcsmbs tests are passing.
---
sysdeps/x86_64/memset.S | 21 ++-
.../multiarch/memset-avx2-unaligned-erms.S | 18 +-
.../multiarch/memset-avx512-unaligned-erms.S | 18 +-
.../multiarch/memset-evex-unaligned-erms.S | 18 +-
.../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++-------
5 files changed, 152 insertions(+), 87 deletions(-)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 65c09bd0ac..ccf036be53 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -28,17 +28,22 @@
#define VMOVU movups
#define VMOVA movaps
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
- movq r, %rax; \
- punpcklbw %xmm0, %xmm0; \
- punpcklwd %xmm0, %xmm0; \
- pshufd $0, %xmm0, %xmm0
+ pxor %xmm1, %xmm1; \
+ pshufb %xmm1, %xmm0; \
+ movq r, %rax
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
- movq r, %rax; \
- pshufd $0, %xmm0, %xmm0
+ pshufd $0, %xmm0, %xmm0; \
+ movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
#define SECTION(p) p
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 1af668af0a..c0bf2875d0 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -10,15 +10,18 @@
# define VMOVU vmovdqu
# define VMOVA vmovdqa
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastb %xmm0, %ymm0
+ movq r, %rax;
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
# ifndef SECTION
# define SECTION(p) p##.avx
@@ -30,5 +33,6 @@
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# endif
+# define USE_XMM_LESS_VEC
# include "memset-vec-unaligned-erms.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index f14d6f8493..5241216a77 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -15,13 +15,19 @@
# define VZEROUPPER
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastb d, %VEC0; \
+ movq r, %rax
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastd d, %VEC0; \
+ movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
# define SECTION(p) p##.evex512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 64b09e77cc..6370021506 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -15,13 +15,19 @@
# define VZEROUPPER
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastb d, %VEC0; \
+ movq r, %rax
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastd d, %VEC0; \
+ movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
# define SECTION(p) p##.evex
# define MEMSET_SYMBOL(p,s) p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 1e0511c79a..1b502b78e4 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -58,8 +58,10 @@
#ifndef MOVQ
# if VEC_SIZE > 16
# define MOVQ vmovq
+# define MOVD vmovd
# else
# define MOVQ movq
+# define MOVD movd
# endif
#endif
@@ -72,9 +74,17 @@
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
# define END_REG rcx
# define LOOP_REG rdi
+# define LESS_VEC_REG rax
#else
# define END_REG rdi
# define LOOP_REG rdx
+# define LESS_VEC_REG rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL 1
+#else
+# define XMM_SMALL 0
#endif
#define PAGE_SIZE 4096
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
shl $2, %RDX_LP
- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
- jmp L(entry_from_bzero)
+ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+ WMEMSET_VDUP_TO_VEC0_LOW()
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec_no_vdup)
+ WMEMSET_VDUP_TO_VEC0_HIGH()
+ jmp L(entry_from_wmemset)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
mov %edx, %edx
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
L(entry_from_bzero):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
+ MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
cmpq $(VEC_SIZE * 2), %rdx
ja L(more_2x_vec)
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif
ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
mov %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
+ MEMSET_VDUP_TO_VEC0_HIGH ()
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(stosb_more_2x_vec)
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
- */
- VMOVU %VEC(0), (%rax)
- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
VZEROUPPER_RETURN
#endif
- .p2align 4,, 10
+ .p2align 4,, 4
L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
#else
VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
@@ -212,6 +228,7 @@ L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
.p2align 4,, 10
L(less_vec):
+L(less_vec_no_vdup):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
and (4x, 8x] jump to target. */
L(more_2x_vec):
-
- /* Two different methods of setting up pointers / compare. The
- two methods are based on the fact that EVEX/AVX512 mov
- instructions take more bytes then AVX2/SSE2 mov instructions. As
- well that EVEX/AVX512 machines also have fast LEA_BID. Both
- setup and END_REG to avoid complex address mode. For EVEX/AVX512
- this saves code size and keeps a few targets in one fetch block.
- For AVX2/SSE2 this helps prevent AGU bottlenecks. */
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
- LOOP_4X_OFFSET) with LEA_BID. */
-
- /* END_REG is rcx for EVEX/AVX512. */
- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
-#endif
-
- /* Stores to first 2x VEC before cmp as any path forward will
- require it. */
- VMOVU %VEC(0), (%rax)
- VMOVU %VEC(0), VEC_SIZE(%rax)
+ /* Store next 2x vec regardless. */
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
+ /* Two different methods of setting up pointers / compare. The two
+ methods are based on the fact that EVEX/AVX512 mov instructions take
+ more bytes than AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+ machines also have fast LEA_BID. Both setup and END_REG to avoid complex
+ address mode. For EVEX/AVX512 this saves code size and keeps a few
+ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
+ bottlenecks. */
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
/* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
addq %rdx, %END_REG
@@ -292,6 +299,15 @@ L(more_2x_vec):
cmpq $(VEC_SIZE * 4), %rdx
jbe L(last_2x_vec)
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+ LEA_BID. */
+
+ /* END_REG is rcx for EVEX/AVX512. */
+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
/* Store next 2x vec regardless. */
VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
@@ -355,65 +371,93 @@ L(stosb_local):
/* Define L(less_vec) only if not otherwise defined. */
.p2align 4
L(less_vec):
+ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+ xmm). This only does anything for AVX2. */
+ MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_no_vdup):
#endif
L(cross_page):
#if VEC_SIZE > 32
cmpl $32, %edx
- jae L(between_32_63)
+ jge L(between_32_63)
#endif
#if VEC_SIZE > 16
cmpl $16, %edx
- jae L(between_16_31)
+ jge L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+ MOVQ %XMM0, %rcx
#endif
- MOVQ %XMM0, %rdi
cmpl $8, %edx
- jae L(between_8_15)
+ jge L(between_8_15)
cmpl $4, %edx
- jae L(between_4_7)
+ jge L(between_4_7)
cmpl $1, %edx
- ja L(between_2_3)
- jb L(return)
- movb %sil, (%rax)
- VZEROUPPER_RETURN
+ jg L(between_2_3)
+ jl L(between_0_0)
+ movb %sil, (%LESS_VEC_REG)
+L(between_0_0):
+ ret
- /* Align small targets only if not doing so would cross a fetch
- line. */
+ /* Align small targets only if not doing so would cross a fetch line.
+ */
#if VEC_SIZE > 32
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
- VMOVU %YMM0, (%rax)
- VMOVU %YMM0, -32(%rax, %rdx)
+ VMOVU %YMM0, (%LESS_VEC_REG)
+ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
VZEROUPPER_RETURN
#endif
#if VEC_SIZE >= 32
- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
L(between_16_31):
/* From 16 to 31. No branch when size == 16. */
- VMOVU %XMM0, (%rax)
- VMOVU %XMM0, -16(%rax, %rdx)
- VZEROUPPER_RETURN
+ VMOVU %XMM0, (%LESS_VEC_REG)
+ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
+ ret
#endif
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+ */
+ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
- movq %rdi, (%rax)
- movq %rdi, -8(%rax, %rdx)
- VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+ MOVQ %XMM0, (%rdi)
+ MOVQ %XMM0, -8(%rdi, %rdx)
+#else
+ movq %rcx, (%LESS_VEC_REG)
+ movq %rcx, -8(%LESS_VEC_REG, %rdx)
+#endif
+ ret
- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+ */
+ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
- movl %edi, (%rax)
- movl %edi, -4(%rax, %rdx)
- VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+ MOVD %XMM0, (%rdi)
+ MOVD %XMM0, -4(%rdi, %rdx)
+#else
+ movl %ecx, (%LESS_VEC_REG)
+ movl %ecx, -4(%LESS_VEC_REG, %rdx)
+#endif
+ ret
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ /* 4 * XMM_SMALL for the third mov for AVX2. */
+ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
- movw %di, (%rax)
- movb %dil, -1(%rax, %rdx)
- VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+ movb %sil, (%rdi)
+ movb %sil, 1(%rdi)
+ movb %sil, -1(%rdi, %rdx)
+#else
+ movw %cx, (%LESS_VEC_REG)
+ movb %sil, -1(%LESS_VEC_REG, %rdx)
+#endif
+ ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))
--
2.25.1
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] x86: Improve vec generation in memset-vec-unaligned-erms.S
2022-02-06 6:54 ` [PATCH v2] " Noah Goldstein
@ 2022-02-06 16:28 ` H.J. Lu
2022-02-07 3:48 ` Noah Goldstein
0 siblings, 1 reply; 6+ messages in thread
From: H.J. Lu @ 2022-02-06 16:28 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Sat, Feb 5, 2022 at 10:54 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug.
>
> Split vec generation into multiple steps. This allows the
> broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
> case. This saves an expensive lane-cross instruction and removes
> the need for 'vzeroupper'.
>
> For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
> byte broadcast.
>
> Results for memset-avx2 small (geomean of N = 20 benchset runs).
>
> size, New Time, Old Time, New / Old
> 0, 4.100, 3.831, 0.934
> 1, 5.074, 4.399, 0.867
> 2, 4.433, 4.411, 0.995
> 4, 4.487, 4.415, 0.984
> 8, 4.454, 4.396, 0.987
> 16, 4.502, 4.443, 0.987
>
> All relevant string/wcsmbs tests are passing.
> ---
> sysdeps/x86_64/memset.S | 21 ++-
> .../multiarch/memset-avx2-unaligned-erms.S | 18 +-
> .../multiarch/memset-avx512-unaligned-erms.S | 18 +-
> .../multiarch/memset-evex-unaligned-erms.S | 18 +-
> .../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++-------
> 5 files changed, 152 insertions(+), 87 deletions(-)
>
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index 65c09bd0ac..ccf036be53 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -28,17 +28,22 @@
> #define VMOVU movups
> #define VMOVA movaps
>
> -#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> movd d, %xmm0; \
> - movq r, %rax; \
> - punpcklbw %xmm0, %xmm0; \
> - punpcklwd %xmm0, %xmm0; \
> - pshufd $0, %xmm0, %xmm0
> + pxor %xmm1, %xmm1; \
> + pshufb %xmm1, %xmm0; \
> + movq r, %rax
>
> -#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> movd d, %xmm0; \
> - movq r, %rax; \
> - pshufd $0, %xmm0, %xmm0
> + pshufd $0, %xmm0, %xmm0; \
> + movq r, %rax
> +
> +# define MEMSET_VDUP_TO_VEC0_HIGH()
> +# define MEMSET_VDUP_TO_VEC0_LOW()
> +
> +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> +# define WMEMSET_VDUP_TO_VEC0_LOW()
>
> #define SECTION(p) p
>
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index 1af668af0a..c0bf2875d0 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -10,15 +10,18 @@
> # define VMOVU vmovdqu
> # define VMOVA vmovdqa
>
> -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> vmovd d, %xmm0; \
> - movq r, %rax; \
> - vpbroadcastb %xmm0, %ymm0
> + movq r, %rax;
>
> -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> - vmovd d, %xmm0; \
> - movq r, %rax; \
> - vpbroadcastd %xmm0, %ymm0
> +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> + MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
> +
> +# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
> +# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
> +
> +# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
> +# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
>
> # ifndef SECTION
> # define SECTION(p) p##.avx
> @@ -30,5 +33,6 @@
> # define WMEMSET_SYMBOL(p,s) p##_avx2_##s
> # endif
>
> +# define USE_XMM_LESS_VEC
> # include "memset-vec-unaligned-erms.S"
> #endif
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index f14d6f8493..5241216a77 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -15,13 +15,19 @@
>
> # define VZEROUPPER
>
> -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> - movq r, %rax; \
> - vpbroadcastb d, %VEC0
> +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> + vpbroadcastb d, %VEC0; \
> + movq r, %rax
>
> -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> - movq r, %rax; \
> - vpbroadcastd d, %VEC0
> +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> + vpbroadcastd d, %VEC0; \
> + movq r, %rax
> +
> +# define MEMSET_VDUP_TO_VEC0_HIGH()
> +# define MEMSET_VDUP_TO_VEC0_LOW()
> +
> +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> +# define WMEMSET_VDUP_TO_VEC0_LOW()
>
> # define SECTION(p) p##.evex512
> # define MEMSET_SYMBOL(p,s) p##_avx512_##s
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index 64b09e77cc..6370021506 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -15,13 +15,19 @@
>
> # define VZEROUPPER
>
> -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> - movq r, %rax; \
> - vpbroadcastb d, %VEC0
> +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> + vpbroadcastb d, %VEC0; \
> + movq r, %rax
>
> -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> - movq r, %rax; \
> - vpbroadcastd d, %VEC0
> +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> + vpbroadcastd d, %VEC0; \
> + movq r, %rax
> +
> +# define MEMSET_VDUP_TO_VEC0_HIGH()
> +# define MEMSET_VDUP_TO_VEC0_LOW()
> +
> +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> +# define WMEMSET_VDUP_TO_VEC0_LOW()
>
> # define SECTION(p) p##.evex
> # define MEMSET_SYMBOL(p,s) p##_evex_##s
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 1e0511c79a..1b502b78e4 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -58,8 +58,10 @@
> #ifndef MOVQ
> # if VEC_SIZE > 16
> # define MOVQ vmovq
> +# define MOVD vmovd
> # else
> # define MOVQ movq
> +# define MOVD movd
> # endif
> #endif
>
> @@ -72,9 +74,17 @@
> #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> # define END_REG rcx
> # define LOOP_REG rdi
> +# define LESS_VEC_REG rax
> #else
> # define END_REG rdi
> # define LOOP_REG rdx
> +# define LESS_VEC_REG rdi
> +#endif
> +
> +#ifdef USE_XMM_LESS_VEC
> +# define XMM_SMALL 1
> +#else
> +# define XMM_SMALL 0
> #endif
>
> #define PAGE_SIZE 4096
> @@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
>
> ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> shl $2, %RDX_LP
> - WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> - jmp L(entry_from_bzero)
> + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> + WMEMSET_VDUP_TO_VEC0_LOW()
> + cmpq $VEC_SIZE, %rdx
> + jb L(less_vec_no_vdup)
> + WMEMSET_VDUP_TO_VEC0_HIGH()
> + jmp L(entry_from_wmemset)
> END (WMEMSET_SYMBOL (__wmemset, unaligned))
> #endif
>
> @@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> #endif
>
> ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> mov %edx, %edx
> @@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> L(entry_from_bzero):
> cmpq $VEC_SIZE, %rdx
> jb L(less_vec)
> + MEMSET_VDUP_TO_VEC0_HIGH()
> +L(entry_from_wmemset):
> cmpq $(VEC_SIZE * 2), %rdx
> ja L(more_2x_vec)
> /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> @@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> # endif
>
> ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> mov %edx, %edx
> # endif
> cmp $VEC_SIZE, %RDX_LP
> jb L(less_vec)
> + MEMSET_VDUP_TO_VEC0_HIGH ()
> cmp $(VEC_SIZE * 2), %RDX_LP
> ja L(stosb_more_2x_vec)
> - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
> - */
> - VMOVU %VEC(0), (%rax)
> - VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
> + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> + VMOVU %VEC(0), (%rdi)
> + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> VZEROUPPER_RETURN
> #endif
>
> - .p2align 4,, 10
> + .p2align 4,, 4
> L(last_2x_vec):
> #ifdef USE_LESS_VEC_MASK_STORE
> - VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
> - VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
> + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
> + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> #else
> VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
> VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
> @@ -212,6 +228,7 @@ L(last_2x_vec):
> #ifdef USE_LESS_VEC_MASK_STORE
> .p2align 4,, 10
> L(less_vec):
> +L(less_vec_no_vdup):
> /* Less than 1 VEC. */
> # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> # error Unsupported VEC_SIZE!
> @@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
> /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
> and (4x, 8x] jump to target. */
> L(more_2x_vec):
> -
> - /* Two different methods of setting up pointers / compare. The
> - two methods are based on the fact that EVEX/AVX512 mov
> - instructions take more bytes then AVX2/SSE2 mov instructions. As
> - well that EVEX/AVX512 machines also have fast LEA_BID. Both
> - setup and END_REG to avoid complex address mode. For EVEX/AVX512
> - this saves code size and keeps a few targets in one fetch block.
> - For AVX2/SSE2 this helps prevent AGU bottlenecks. */
> -#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> - /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
> - LOOP_4X_OFFSET) with LEA_BID. */
> -
> - /* END_REG is rcx for EVEX/AVX512. */
> - leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> -#endif
> -
> - /* Stores to first 2x VEC before cmp as any path forward will
> - require it. */
> - VMOVU %VEC(0), (%rax)
> - VMOVU %VEC(0), VEC_SIZE(%rax)
> + /* Store next 2x vec regardless. */
> + VMOVU %VEC(0), (%rdi)
> + VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
>
>
> + /* Two different methods of setting up pointers / compare. The two
> + methods are based on the fact that EVEX/AVX512 mov instructions take
> + more bytes than AVX2/SSE2 mov instructions. As well that EVEX/AVX512
> + machines also have fast LEA_BID. Both setup and END_REG to avoid complex
> + address mode. For EVEX/AVX512 this saves code size and keeps a few
> + targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
> + bottlenecks. */
> #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
> /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
> addq %rdx, %END_REG
> @@ -292,6 +299,15 @@ L(more_2x_vec):
> cmpq $(VEC_SIZE * 4), %rdx
> jbe L(last_2x_vec)
>
> +
> +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> + /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
> + LEA_BID. */
> +
> + /* END_REG is rcx for EVEX/AVX512. */
> + leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> +#endif
> +
> /* Store next 2x vec regardless. */
> VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
> VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
> @@ -355,65 +371,93 @@ L(stosb_local):
> /* Define L(less_vec) only if not otherwise defined. */
> .p2align 4
> L(less_vec):
> + /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
> + xmm). This only does anything for AVX2. */
> + MEMSET_VDUP_TO_VEC0_LOW ()
> +L(less_vec_no_vdup):
> #endif
> L(cross_page):
> #if VEC_SIZE > 32
> cmpl $32, %edx
> - jae L(between_32_63)
> + jge L(between_32_63)
> #endif
> #if VEC_SIZE > 16
> cmpl $16, %edx
> - jae L(between_16_31)
> + jge L(between_16_31)
> +#endif
> +#ifndef USE_XMM_LESS_VEC
> + MOVQ %XMM0, %rcx
> #endif
> - MOVQ %XMM0, %rdi
> cmpl $8, %edx
> - jae L(between_8_15)
> + jge L(between_8_15)
> cmpl $4, %edx
> - jae L(between_4_7)
> + jge L(between_4_7)
> cmpl $1, %edx
> - ja L(between_2_3)
> - jb L(return)
> - movb %sil, (%rax)
> - VZEROUPPER_RETURN
> + jg L(between_2_3)
> + jl L(between_0_0)
> + movb %sil, (%LESS_VEC_REG)
> +L(between_0_0):
> + ret
>
> - /* Align small targets only if not doing so would cross a fetch
> - line. */
> + /* Align small targets only if not doing so would cross a fetch line.
> + */
> #if VEC_SIZE > 32
> .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> /* From 32 to 63. No branch when size == 32. */
> L(between_32_63):
> - VMOVU %YMM0, (%rax)
> - VMOVU %YMM0, -32(%rax, %rdx)
> + VMOVU %YMM0, (%LESS_VEC_REG)
> + VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
> VZEROUPPER_RETURN
> #endif
>
> #if VEC_SIZE >= 32
> - .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
> L(between_16_31):
> /* From 16 to 31. No branch when size == 16. */
> - VMOVU %XMM0, (%rax)
> - VMOVU %XMM0, -16(%rax, %rdx)
> - VZEROUPPER_RETURN
> + VMOVU %XMM0, (%LESS_VEC_REG)
> + VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
> + ret
> #endif
>
> - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> + /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> + */
> + .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
> L(between_8_15):
> /* From 8 to 15. No branch when size == 8. */
> - movq %rdi, (%rax)
> - movq %rdi, -8(%rax, %rdx)
> - VZEROUPPER_RETURN
> +#ifdef USE_XMM_LESS_VEC
> + MOVQ %XMM0, (%rdi)
> + MOVQ %XMM0, -8(%rdi, %rdx)
> +#else
> + movq %rcx, (%LESS_VEC_REG)
> + movq %rcx, -8(%LESS_VEC_REG, %rdx)
> +#endif
> + ret
>
> - .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
> + /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> + */
> + .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
> L(between_4_7):
> /* From 4 to 7. No branch when size == 4. */
> - movl %edi, (%rax)
> - movl %edi, -4(%rax, %rdx)
> - VZEROUPPER_RETURN
> +#ifdef USE_XMM_LESS_VEC
> + MOVD %XMM0, (%rdi)
> + MOVD %XMM0, -4(%rdi, %rdx)
> +#else
> + movl %ecx, (%LESS_VEC_REG)
> + movl %ecx, -4(%LESS_VEC_REG, %rdx)
> +#endif
> + ret
>
> - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> + /* 4 * XMM_SMALL for the third mov for AVX2. */
> + .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
> L(between_2_3):
> /* From 2 to 3. No branch when size == 2. */
> - movw %di, (%rax)
> - movb %dil, -1(%rax, %rdx)
> - VZEROUPPER_RETURN
> +#ifdef USE_XMM_LESS_VEC
> + movb %sil, (%rdi)
> + movb %sil, 1(%rdi)
> + movb %sil, -1(%rdi, %rdx)
> +#else
> + movw %cx, (%LESS_VEC_REG)
> + movb %sil, -1(%LESS_VEC_REG, %rdx)
> +#endif
> + ret
> END (MEMSET_SYMBOL (__memset, unaligned_erms))
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] x86: Improve vec generation in memset-vec-unaligned-erms.S
2022-02-06 16:28 ` H.J. Lu
@ 2022-02-07 3:48 ` Noah Goldstein
2022-05-04 5:46 ` Sunil Pandey
0 siblings, 1 reply; 6+ messages in thread
From: Noah Goldstein @ 2022-02-07 3:48 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Sun, Feb 6, 2022 at 10:29 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Feb 5, 2022 at 10:54 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > No bug.
> >
> > Split vec generation into multiple steps. This allows the
> > broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
> > case. This saves an expensive lane-cross instruction and removes
> > the need for 'vzeroupper'.
> >
> > For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
> > byte broadcast.
> >
> > Results for memset-avx2 small (geomean of N = 20 benchset runs).
> >
> > size, New Time, Old Time, New / Old
> > 0, 4.100, 3.831, 0.934
> > 1, 5.074, 4.399, 0.867
> > 2, 4.433, 4.411, 0.995
> > 4, 4.487, 4.415, 0.984
> > 8, 4.454, 4.396, 0.987
> > 16, 4.502, 4.443, 0.987
> >
> > All relevant string/wcsmbs tests are passing.
> > ---
> > sysdeps/x86_64/memset.S | 21 ++-
> > .../multiarch/memset-avx2-unaligned-erms.S | 18 +-
> > .../multiarch/memset-avx512-unaligned-erms.S | 18 +-
> > .../multiarch/memset-evex-unaligned-erms.S | 18 +-
> > .../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++-------
> > 5 files changed, 152 insertions(+), 87 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > index 65c09bd0ac..ccf036be53 100644
> > --- a/sysdeps/x86_64/memset.S
> > +++ b/sysdeps/x86_64/memset.S
> > @@ -28,17 +28,22 @@
> > #define VMOVU movups
> > #define VMOVA movaps
> >
> > -#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > movd d, %xmm0; \
> > - movq r, %rax; \
> > - punpcklbw %xmm0, %xmm0; \
> > - punpcklwd %xmm0, %xmm0; \
> > - pshufd $0, %xmm0, %xmm0
> > + pxor %xmm1, %xmm1; \
> > + pshufb %xmm1, %xmm0; \
> > + movq r, %rax
> >
> > -#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > movd d, %xmm0; \
> > - movq r, %rax; \
> > - pshufd $0, %xmm0, %xmm0
> > + pshufd $0, %xmm0, %xmm0; \
> > + movq r, %rax
> > +
> > +# define MEMSET_VDUP_TO_VEC0_HIGH()
> > +# define MEMSET_VDUP_TO_VEC0_LOW()
> > +
> > +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> > +# define WMEMSET_VDUP_TO_VEC0_LOW()
> >
> > #define SECTION(p) p
> >
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > index 1af668af0a..c0bf2875d0 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > @@ -10,15 +10,18 @@
> > # define VMOVU vmovdqu
> > # define VMOVA vmovdqa
> >
> > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > vmovd d, %xmm0; \
> > - movq r, %rax; \
> > - vpbroadcastb %xmm0, %ymm0
> > + movq r, %rax;
> >
> > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > - vmovd d, %xmm0; \
> > - movq r, %rax; \
> > - vpbroadcastd %xmm0, %ymm0
> > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > + MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
> > +
> > +# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
> > +# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
> > +# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
> >
> > # ifndef SECTION
> > # define SECTION(p) p##.avx
> > @@ -30,5 +33,6 @@
> > # define WMEMSET_SYMBOL(p,s) p##_avx2_##s
> > # endif
> >
> > +# define USE_XMM_LESS_VEC
> > # include "memset-vec-unaligned-erms.S"
> > #endif
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > index f14d6f8493..5241216a77 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > @@ -15,13 +15,19 @@
> >
> > # define VZEROUPPER
> >
> > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > - movq r, %rax; \
> > - vpbroadcastb d, %VEC0
> > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > + vpbroadcastb d, %VEC0; \
> > + movq r, %rax
> >
> > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > - movq r, %rax; \
> > - vpbroadcastd d, %VEC0
> > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > + vpbroadcastd d, %VEC0; \
> > + movq r, %rax
> > +
> > +# define MEMSET_VDUP_TO_VEC0_HIGH()
> > +# define MEMSET_VDUP_TO_VEC0_LOW()
> > +
> > +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> > +# define WMEMSET_VDUP_TO_VEC0_LOW()
> >
> > # define SECTION(p) p##.evex512
> > # define MEMSET_SYMBOL(p,s) p##_avx512_##s
> > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > index 64b09e77cc..6370021506 100644
> > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > @@ -15,13 +15,19 @@
> >
> > # define VZEROUPPER
> >
> > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > - movq r, %rax; \
> > - vpbroadcastb d, %VEC0
> > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > + vpbroadcastb d, %VEC0; \
> > + movq r, %rax
> >
> > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > - movq r, %rax; \
> > - vpbroadcastd d, %VEC0
> > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > + vpbroadcastd d, %VEC0; \
> > + movq r, %rax
> > +
> > +# define MEMSET_VDUP_TO_VEC0_HIGH()
> > +# define MEMSET_VDUP_TO_VEC0_LOW()
> > +
> > +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> > +# define WMEMSET_VDUP_TO_VEC0_LOW()
> >
> > # define SECTION(p) p##.evex
> > # define MEMSET_SYMBOL(p,s) p##_evex_##s
> > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > index 1e0511c79a..1b502b78e4 100644
> > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > @@ -58,8 +58,10 @@
> > #ifndef MOVQ
> > # if VEC_SIZE > 16
> > # define MOVQ vmovq
> > +# define MOVD vmovd
> > # else
> > # define MOVQ movq
> > +# define MOVD movd
> > # endif
> > #endif
> >
> > @@ -72,9 +74,17 @@
> > #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> > # define END_REG rcx
> > # define LOOP_REG rdi
> > +# define LESS_VEC_REG rax
> > #else
> > # define END_REG rdi
> > # define LOOP_REG rdx
> > +# define LESS_VEC_REG rdi
> > +#endif
> > +
> > +#ifdef USE_XMM_LESS_VEC
> > +# define XMM_SMALL 1
> > +#else
> > +# define XMM_SMALL 0
> > #endif
> >
> > #define PAGE_SIZE 4096
> > @@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
> >
> > ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> > shl $2, %RDX_LP
> > - WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > - jmp L(entry_from_bzero)
> > + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> > + WMEMSET_VDUP_TO_VEC0_LOW()
> > + cmpq $VEC_SIZE, %rdx
> > + jb L(less_vec_no_vdup)
> > + WMEMSET_VDUP_TO_VEC0_HIGH()
> > + jmp L(entry_from_wmemset)
> > END (WMEMSET_SYMBOL (__wmemset, unaligned))
> > #endif
> >
> > @@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> > #endif
> >
> > ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> > - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> > # ifdef __ILP32__
> > /* Clear the upper 32 bits. */
> > mov %edx, %edx
> > @@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> > L(entry_from_bzero):
> > cmpq $VEC_SIZE, %rdx
> > jb L(less_vec)
> > + MEMSET_VDUP_TO_VEC0_HIGH()
> > +L(entry_from_wmemset):
> > cmpq $(VEC_SIZE * 2), %rdx
> > ja L(more_2x_vec)
> > /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> > @@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> > # endif
> >
> > ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> > # ifdef __ILP32__
> > /* Clear the upper 32 bits. */
> > mov %edx, %edx
> > # endif
> > cmp $VEC_SIZE, %RDX_LP
> > jb L(less_vec)
> > + MEMSET_VDUP_TO_VEC0_HIGH ()
> > cmp $(VEC_SIZE * 2), %RDX_LP
> > ja L(stosb_more_2x_vec)
> > - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
> > - */
> > - VMOVU %VEC(0), (%rax)
> > - VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
> > + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> > + VMOVU %VEC(0), (%rdi)
> > + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > VZEROUPPER_RETURN
> > #endif
> >
> > - .p2align 4,, 10
> > + .p2align 4,, 4
> > L(last_2x_vec):
> > #ifdef USE_LESS_VEC_MASK_STORE
> > - VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
> > - VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
> > + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
> > + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > #else
> > VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
> > VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
> > @@ -212,6 +228,7 @@ L(last_2x_vec):
> > #ifdef USE_LESS_VEC_MASK_STORE
> > .p2align 4,, 10
> > L(less_vec):
> > +L(less_vec_no_vdup):
> > /* Less than 1 VEC. */
> > # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> > # error Unsupported VEC_SIZE!
> > @@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
> > /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
> > and (4x, 8x] jump to target. */
> > L(more_2x_vec):
> > -
> > - /* Two different methods of setting up pointers / compare. The
> > - two methods are based on the fact that EVEX/AVX512 mov
> > - instructions take more bytes then AVX2/SSE2 mov instructions. As
> > - well that EVEX/AVX512 machines also have fast LEA_BID. Both
> > - setup and END_REG to avoid complex address mode. For EVEX/AVX512
> > - this saves code size and keeps a few targets in one fetch block.
> > - For AVX2/SSE2 this helps prevent AGU bottlenecks. */
> > -#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> > - /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
> > - LOOP_4X_OFFSET) with LEA_BID. */
> > -
> > - /* END_REG is rcx for EVEX/AVX512. */
> > - leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> > -#endif
> > -
> > - /* Stores to first 2x VEC before cmp as any path forward will
> > - require it. */
> > - VMOVU %VEC(0), (%rax)
> > - VMOVU %VEC(0), VEC_SIZE(%rax)
> > + /* Store next 2x vec regardless. */
> > + VMOVU %VEC(0), (%rdi)
> > + VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
> >
> >
> > + /* Two different methods of setting up pointers / compare. The two
> > + methods are based on the fact that EVEX/AVX512 mov instructions take
> > + more bytes than AVX2/SSE2 mov instructions. As well that EVEX/AVX512
> > + machines also have fast LEA_BID. Both setup and END_REG to avoid complex
> > + address mode. For EVEX/AVX512 this saves code size and keeps a few
> > + targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
> > + bottlenecks. */
> > #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
> > /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
> > addq %rdx, %END_REG
> > @@ -292,6 +299,15 @@ L(more_2x_vec):
> > cmpq $(VEC_SIZE * 4), %rdx
> > jbe L(last_2x_vec)
> >
> > +
> > +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> > + /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
> > + LEA_BID. */
> > +
> > + /* END_REG is rcx for EVEX/AVX512. */
> > + leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> > +#endif
> > +
> > /* Store next 2x vec regardless. */
> > VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
> > VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
> > @@ -355,65 +371,93 @@ L(stosb_local):
> > /* Define L(less_vec) only if not otherwise defined. */
> > .p2align 4
> > L(less_vec):
> > + /* Broadcast esi to partial register (i.e. VEC_SIZE == 32 broadcast to
> > + xmm). This only does anything for AVX2. */
> > + MEMSET_VDUP_TO_VEC0_LOW ()
> > +L(less_vec_no_vdup):
> > #endif
> > L(cross_page):
> > #if VEC_SIZE > 32
> > cmpl $32, %edx
> > - jae L(between_32_63)
> > + jge L(between_32_63)
> > #endif
> > #if VEC_SIZE > 16
> > cmpl $16, %edx
> > - jae L(between_16_31)
> > + jge L(between_16_31)
> > +#endif
> > +#ifndef USE_XMM_LESS_VEC
> > + MOVQ %XMM0, %rcx
> > #endif
> > - MOVQ %XMM0, %rdi
> > cmpl $8, %edx
> > - jae L(between_8_15)
> > + jge L(between_8_15)
> > cmpl $4, %edx
> > - jae L(between_4_7)
> > + jge L(between_4_7)
> > cmpl $1, %edx
> > - ja L(between_2_3)
> > - jb L(return)
> > - movb %sil, (%rax)
> > - VZEROUPPER_RETURN
> > + jg L(between_2_3)
> > + jl L(between_0_0)
> > + movb %sil, (%LESS_VEC_REG)
> > +L(between_0_0):
> > + ret
> >
> > - /* Align small targets only if not doing so would cross a fetch
> > - line. */
> > + /* Align small targets only if not doing so would cross a fetch line.
> > + */
> > #if VEC_SIZE > 32
> > .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> > /* From 32 to 63. No branch when size == 32. */
> > L(between_32_63):
> > - VMOVU %YMM0, (%rax)
> > - VMOVU %YMM0, -32(%rax, %rdx)
> > + VMOVU %YMM0, (%LESS_VEC_REG)
> > + VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
> > VZEROUPPER_RETURN
> > #endif
> >
> > #if VEC_SIZE >= 32
> > - .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> > + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
> > L(between_16_31):
> > /* From 16 to 31. No branch when size == 16. */
> > - VMOVU %XMM0, (%rax)
> > - VMOVU %XMM0, -16(%rax, %rdx)
> > - VZEROUPPER_RETURN
> > + VMOVU %XMM0, (%LESS_VEC_REG)
> > + VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
> > + ret
> > #endif
> >
> > - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> > + /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> > + */
> > + .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
> > L(between_8_15):
> > /* From 8 to 15. No branch when size == 8. */
> > - movq %rdi, (%rax)
> > - movq %rdi, -8(%rax, %rdx)
> > - VZEROUPPER_RETURN
> > +#ifdef USE_XMM_LESS_VEC
> > + MOVQ %XMM0, (%rdi)
> > + MOVQ %XMM0, -8(%rdi, %rdx)
> > +#else
> > + movq %rcx, (%LESS_VEC_REG)
> > + movq %rcx, -8(%LESS_VEC_REG, %rdx)
> > +#endif
> > + ret
> >
> > - .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
> > + /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> > + */
> > + .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
> > L(between_4_7):
> > /* From 4 to 7. No branch when size == 4. */
> > - movl %edi, (%rax)
> > - movl %edi, -4(%rax, %rdx)
> > - VZEROUPPER_RETURN
> > +#ifdef USE_XMM_LESS_VEC
> > + MOVD %XMM0, (%rdi)
> > + MOVD %XMM0, -4(%rdi, %rdx)
> > +#else
> > + movl %ecx, (%LESS_VEC_REG)
> > + movl %ecx, -4(%LESS_VEC_REG, %rdx)
> > +#endif
> > + ret
> >
> > - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> > + /* 4 * XMM_SMALL for the third mov for AVX2. */
> > + .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
> > L(between_2_3):
> > /* From 2 to 3. No branch when size == 2. */
> > - movw %di, (%rax)
> > - movb %dil, -1(%rax, %rdx)
> > - VZEROUPPER_RETURN
> > +#ifdef USE_XMM_LESS_VEC
> > + movb %sil, (%rdi)
> > + movb %sil, 1(%rdi)
> > + movb %sil, -1(%rdi, %rdx)
> > +#else
> > + movw %cx, (%LESS_VEC_REG)
> > + movb %sil, -1(%LESS_VEC_REG, %rdx)
> > +#endif
> > + ret
> > END (MEMSET_SYMBOL (__memset, unaligned_erms))
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
Thanks pushed.
>
> --
> H.J.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] x86: Improve vec generation in memset-vec-unaligned-erms.S
2022-02-07 3:48 ` Noah Goldstein
@ 2022-05-04 5:46 ` Sunil Pandey
0 siblings, 0 replies; 6+ messages in thread
From: Sunil Pandey @ 2022-05-04 5:46 UTC (permalink / raw)
To: Noah Goldstein, Libc-stable Mailing List; +Cc: H.J. Lu, GNU C Library
On Sun, Feb 6, 2022 at 7:48 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Sun, Feb 6, 2022 at 10:29 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sat, Feb 5, 2022 at 10:54 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > No bug.
> > >
> > > Split vec generation into multiple steps. This allows the
> > > broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
> > > case. This saves an expensive lane-cross instruction and removes
> > > the need for 'vzeroupper'.
> > >
> > > For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
> > > byte broadcast.
> > >
> > > Results for memset-avx2 small (geomean of N = 20 benchset runs).
> > >
> > > size, New Time, Old Time, New / Old
> > > 0, 4.100, 3.831, 0.934
> > > 1, 5.074, 4.399, 0.867
> > > 2, 4.433, 4.411, 0.995
> > > 4, 4.487, 4.415, 0.984
> > > 8, 4.454, 4.396, 0.987
> > > 16, 4.502, 4.443, 0.987
> > >
> > > All relevant string/wcsmbs tests are passing.
> > > ---
> > > sysdeps/x86_64/memset.S | 21 ++-
> > > .../multiarch/memset-avx2-unaligned-erms.S | 18 +-
> > > .../multiarch/memset-avx512-unaligned-erms.S | 18 +-
> > > .../multiarch/memset-evex-unaligned-erms.S | 18 +-
> > > .../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++-------
> > > 5 files changed, 152 insertions(+), 87 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > > index 65c09bd0ac..ccf036be53 100644
> > > --- a/sysdeps/x86_64/memset.S
> > > +++ b/sysdeps/x86_64/memset.S
> > > @@ -28,17 +28,22 @@
> > > #define VMOVU movups
> > > #define VMOVA movaps
> > >
> > > -#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > movd d, %xmm0; \
> > > - movq r, %rax; \
> > > - punpcklbw %xmm0, %xmm0; \
> > > - punpcklwd %xmm0, %xmm0; \
> > > - pshufd $0, %xmm0, %xmm0
> > > + pxor %xmm1, %xmm1; \
> > > + pshufb %xmm1, %xmm0; \
> > > + movq r, %rax
> > >
> > > -#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > movd d, %xmm0; \
> > > - movq r, %rax; \
> > > - pshufd $0, %xmm0, %xmm0
> > > + pshufd $0, %xmm0, %xmm0; \
> > > + movq r, %rax
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0_HIGH()
> > > +# define MEMSET_VDUP_TO_VEC0_LOW()
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> > > +# define WMEMSET_VDUP_TO_VEC0_LOW()
> > >
> > > #define SECTION(p) p
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > index 1af668af0a..c0bf2875d0 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > @@ -10,15 +10,18 @@
> > > # define VMOVU vmovdqu
> > > # define VMOVA vmovdqa
> > >
> > > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > vmovd d, %xmm0; \
> > > - movq r, %rax; \
> > > - vpbroadcastb %xmm0, %ymm0
> > > + movq r, %rax;
> > >
> > > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - vmovd d, %xmm0; \
> > > - movq r, %rax; \
> > > - vpbroadcastd %xmm0, %ymm0
> > > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > + MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
> > > +# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
> > > +# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
> > >
> > > # ifndef SECTION
> > > # define SECTION(p) p##.avx
> > > @@ -30,5 +33,6 @@
> > > # define WMEMSET_SYMBOL(p,s) p##_avx2_##s
> > > # endif
> > >
> > > +# define USE_XMM_LESS_VEC
> > > # include "memset-vec-unaligned-erms.S"
> > > #endif
> > > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > index f14d6f8493..5241216a77 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > @@ -15,13 +15,19 @@
> > >
> > > # define VZEROUPPER
> > >
> > > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - movq r, %rax; \
> > > - vpbroadcastb d, %VEC0
> > > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > + vpbroadcastb d, %VEC0; \
> > > + movq r, %rax
> > >
> > > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - movq r, %rax; \
> > > - vpbroadcastd d, %VEC0
> > > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > + vpbroadcastd d, %VEC0; \
> > > + movq r, %rax
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0_HIGH()
> > > +# define MEMSET_VDUP_TO_VEC0_LOW()
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> > > +# define WMEMSET_VDUP_TO_VEC0_LOW()
> > >
> > > # define SECTION(p) p##.evex512
> > > # define MEMSET_SYMBOL(p,s) p##_avx512_##s
> > > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > index 64b09e77cc..6370021506 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > @@ -15,13 +15,19 @@
> > >
> > > # define VZEROUPPER
> > >
> > > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - movq r, %rax; \
> > > - vpbroadcastb d, %VEC0
> > > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > + vpbroadcastb d, %VEC0; \
> > > + movq r, %rax
> > >
> > > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - movq r, %rax; \
> > > - vpbroadcastd d, %VEC0
> > > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > + vpbroadcastd d, %VEC0; \
> > > + movq r, %rax
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0_HIGH()
> > > +# define MEMSET_VDUP_TO_VEC0_LOW()
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> > > +# define WMEMSET_VDUP_TO_VEC0_LOW()
> > >
> > > # define SECTION(p) p##.evex
> > > # define MEMSET_SYMBOL(p,s) p##_evex_##s
> > > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > index 1e0511c79a..1b502b78e4 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > @@ -58,8 +58,10 @@
> > > #ifndef MOVQ
> > > # if VEC_SIZE > 16
> > > # define MOVQ vmovq
> > > +# define MOVD vmovd
> > > # else
> > > # define MOVQ movq
> > > +# define MOVD movd
> > > # endif
> > > #endif
> > >
> > > @@ -72,9 +74,17 @@
> > > #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> > > # define END_REG rcx
> > > # define LOOP_REG rdi
> > > +# define LESS_VEC_REG rax
> > > #else
> > > # define END_REG rdi
> > > # define LOOP_REG rdx
> > > +# define LESS_VEC_REG rdi
> > > +#endif
> > > +
> > > +#ifdef USE_XMM_LESS_VEC
> > > +# define XMM_SMALL 1
> > > +#else
> > > +# define XMM_SMALL 0
> > > #endif
> > >
> > > #define PAGE_SIZE 4096
> > > @@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
> > >
> > > ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > shl $2, %RDX_LP
> > > - WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > - jmp L(entry_from_bzero)
> > > + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > + WMEMSET_VDUP_TO_VEC0_LOW()
> > > + cmpq $VEC_SIZE, %rdx
> > > + jb L(less_vec_no_vdup)
> > > + WMEMSET_VDUP_TO_VEC0_HIGH()
> > > + jmp L(entry_from_wmemset)
> > > END (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > #endif
> > >
> > > @@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> > > #endif
> > >
> > > ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> > > - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > # ifdef __ILP32__
> > > /* Clear the upper 32 bits. */
> > > mov %edx, %edx
> > > @@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> > > L(entry_from_bzero):
> > > cmpq $VEC_SIZE, %rdx
> > > jb L(less_vec)
> > > + MEMSET_VDUP_TO_VEC0_HIGH()
> > > +L(entry_from_wmemset):
> > > cmpq $(VEC_SIZE * 2), %rdx
> > > ja L(more_2x_vec)
> > > /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> > > @@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> > > # endif
> > >
> > > ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > > - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > # ifdef __ILP32__
> > > /* Clear the upper 32 bits. */
> > > mov %edx, %edx
> > > # endif
> > > cmp $VEC_SIZE, %RDX_LP
> > > jb L(less_vec)
> > > + MEMSET_VDUP_TO_VEC0_HIGH ()
> > > cmp $(VEC_SIZE * 2), %RDX_LP
> > > ja L(stosb_more_2x_vec)
> > > - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
> > > - */
> > > - VMOVU %VEC(0), (%rax)
> > > - VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
> > > + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> > > + VMOVU %VEC(0), (%rdi)
> > > + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > > VZEROUPPER_RETURN
> > > #endif
> > >
> > > - .p2align 4,, 10
> > > + .p2align 4,, 4
> > > L(last_2x_vec):
> > > #ifdef USE_LESS_VEC_MASK_STORE
> > > - VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
> > > - VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
> > > + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
> > > + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > > #else
> > > VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
> > > VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
> > > @@ -212,6 +228,7 @@ L(last_2x_vec):
> > > #ifdef USE_LESS_VEC_MASK_STORE
> > > .p2align 4,, 10
> > > L(less_vec):
> > > +L(less_vec_no_vdup):
> > > /* Less than 1 VEC. */
> > > # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> > > # error Unsupported VEC_SIZE!
> > > @@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
> > > /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
> > > and (4x, 8x] jump to target. */
> > > L(more_2x_vec):
> > > -
> > > - /* Two different methods of setting up pointers / compare. The
> > > - two methods are based on the fact that EVEX/AVX512 mov
> > > - instructions take more bytes then AVX2/SSE2 mov instructions. As
> > > - well that EVEX/AVX512 machines also have fast LEA_BID. Both
> > > - setup and END_REG to avoid complex address mode. For EVEX/AVX512
> > > - this saves code size and keeps a few targets in one fetch block.
> > > - For AVX2/SSE2 this helps prevent AGU bottlenecks. */
> > > -#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> > > - /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
> > > - LOOP_4X_OFFSET) with LEA_BID. */
> > > -
> > > - /* END_REG is rcx for EVEX/AVX512. */
> > > - leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> > > -#endif
> > > -
> > > - /* Stores to first 2x VEC before cmp as any path forward will
> > > - require it. */
> > > - VMOVU %VEC(0), (%rax)
> > > - VMOVU %VEC(0), VEC_SIZE(%rax)
> > > + /* Store next 2x vec regardless. */
> > > + VMOVU %VEC(0), (%rdi)
> > > + VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
> > >
> > >
> > > + /* Two different methods of setting up pointers / compare. The two
> > > + methods are based on the fact that EVEX/AVX512 mov instructions take
> > > + more bytes than AVX2/SSE2 mov instructions. As well that EVEX/AVX512
> > > + machines also have fast LEA_BID. Both setup and END_REG to avoid complex
> > > + address mode. For EVEX/AVX512 this saves code size and keeps a few
> > > + targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
> > > + bottlenecks. */
> > > #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
> > > /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
> > > addq %rdx, %END_REG
> > > @@ -292,6 +299,15 @@ L(more_2x_vec):
> > > cmpq $(VEC_SIZE * 4), %rdx
> > > jbe L(last_2x_vec)
> > >
> > > +
> > > +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> > > + /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
> > > + LEA_BID. */
> > > +
> > > + /* END_REG is rcx for EVEX/AVX512. */
> > > + leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> > > +#endif
> > > +
> > > /* Store next 2x vec regardless. */
> > > VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
> > > VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
> > > @@ -355,65 +371,93 @@ L(stosb_local):
> > > /* Define L(less_vec) only if not otherwise defined. */
> > > .p2align 4
> > > L(less_vec):
> > > + /* Broadcast esi to partial register (i.e. VEC_SIZE == 32 broadcast to
> > > + xmm). This only does anything for AVX2. */
> > > + MEMSET_VDUP_TO_VEC0_LOW ()
> > > +L(less_vec_no_vdup):
> > > #endif
> > > L(cross_page):
> > > #if VEC_SIZE > 32
> > > cmpl $32, %edx
> > > - jae L(between_32_63)
> > > + jge L(between_32_63)
> > > #endif
> > > #if VEC_SIZE > 16
> > > cmpl $16, %edx
> > > - jae L(between_16_31)
> > > + jge L(between_16_31)
> > > +#endif
> > > +#ifndef USE_XMM_LESS_VEC
> > > + MOVQ %XMM0, %rcx
> > > #endif
> > > - MOVQ %XMM0, %rdi
> > > cmpl $8, %edx
> > > - jae L(between_8_15)
> > > + jge L(between_8_15)
> > > cmpl $4, %edx
> > > - jae L(between_4_7)
> > > + jge L(between_4_7)
> > > cmpl $1, %edx
> > > - ja L(between_2_3)
> > > - jb L(return)
> > > - movb %sil, (%rax)
> > > - VZEROUPPER_RETURN
> > > + jg L(between_2_3)
> > > + jl L(between_0_0)
> > > + movb %sil, (%LESS_VEC_REG)
> > > +L(between_0_0):
> > > + ret
> > >
> > > - /* Align small targets only if not doing so would cross a fetch
> > > - line. */
> > > + /* Align small targets only if not doing so would cross a fetch line.
> > > + */
> > > #if VEC_SIZE > 32
> > > .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> > > /* From 32 to 63. No branch when size == 32. */
> > > L(between_32_63):
> > > - VMOVU %YMM0, (%rax)
> > > - VMOVU %YMM0, -32(%rax, %rdx)
> > > + VMOVU %YMM0, (%LESS_VEC_REG)
> > > + VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
> > > VZEROUPPER_RETURN
> > > #endif
> > >
> > > #if VEC_SIZE >= 32
> > > - .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> > > + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
> > > L(between_16_31):
> > > /* From 16 to 31. No branch when size == 16. */
> > > - VMOVU %XMM0, (%rax)
> > > - VMOVU %XMM0, -16(%rax, %rdx)
> > > - VZEROUPPER_RETURN
> > > + VMOVU %XMM0, (%LESS_VEC_REG)
> > > + VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
> > > + ret
> > > #endif
> > >
> > > - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> > > + /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> > > + */
> > > + .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
> > > L(between_8_15):
> > > /* From 8 to 15. No branch when size == 8. */
> > > - movq %rdi, (%rax)
> > > - movq %rdi, -8(%rax, %rdx)
> > > - VZEROUPPER_RETURN
> > > +#ifdef USE_XMM_LESS_VEC
> > > + MOVQ %XMM0, (%rdi)
> > > + MOVQ %XMM0, -8(%rdi, %rdx)
> > > +#else
> > > + movq %rcx, (%LESS_VEC_REG)
> > > + movq %rcx, -8(%LESS_VEC_REG, %rdx)
> > > +#endif
> > > + ret
> > >
> > > - .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
> > > + /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> > > + */
> > > + .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
> > > L(between_4_7):
> > > /* From 4 to 7. No branch when size == 4. */
> > > - movl %edi, (%rax)
> > > - movl %edi, -4(%rax, %rdx)
> > > - VZEROUPPER_RETURN
> > > +#ifdef USE_XMM_LESS_VEC
> > > + MOVD %XMM0, (%rdi)
> > > + MOVD %XMM0, -4(%rdi, %rdx)
> > > +#else
> > > + movl %ecx, (%LESS_VEC_REG)
> > > + movl %ecx, -4(%LESS_VEC_REG, %rdx)
> > > +#endif
> > > + ret
> > >
> > > - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> > > + /* 4 * XMM_SMALL for the third mov for AVX2. */
> > > + .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
> > > L(between_2_3):
> > > /* From 2 to 3. No branch when size == 2. */
> > > - movw %di, (%rax)
> > > - movb %dil, -1(%rax, %rdx)
> > > - VZEROUPPER_RETURN
> > > +#ifdef USE_XMM_LESS_VEC
> > > + movb %sil, (%rdi)
> > > + movb %sil, 1(%rdi)
> > > + movb %sil, -1(%rdi, %rdx)
> > > +#else
> > > + movw %cx, (%LESS_VEC_REG)
> > > + movb %sil, -1(%LESS_VEC_REG, %rdx)
> > > +#endif
> > > + ret
> > > END (MEMSET_SYMBOL (__memset, unaligned_erms))
> > > --
> > > 2.25.1
> > >
> >
> > LGTM.
> >
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> >
> > Thanks.
>
> Thanks pushed.
> >
> > --
> > H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2022-05-04 5:47 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-05 22:42 [PATCH v1] x86: Improve vec generation in memset-vec-unaligned-erms.S Noah Goldstein
2022-02-05 22:50 ` H.J. Lu
2022-02-06 6:54 ` [PATCH v2] " Noah Goldstein
2022-02-06 16:28 ` H.J. Lu
2022-02-07 3:48 ` Noah Goldstein
2022-05-04 5:46 ` Sunil Pandey
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).