public inbox for libc-alpha@sourceware.org
* [PATCH v1] x86: Improve vec generation in memset-vec-unaligned-erms.S
@ 2022-02-05 22:42 Noah Goldstein
  2022-02-05 22:50 ` H.J. Lu
  2022-02-06  6:54 ` [PATCH v2] " Noah Goldstein
  0 siblings, 2 replies; 6+ messages in thread
From: Noah Goldstein @ 2022-02-05 22:42 UTC (permalink / raw)
  To: libc-alpha

No bug.

Split vec generation into multiple steps. This allows the
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
case. This saves an expensive lane-cross instruction and removes
the need for 'vzeroupper'.
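
(Illustrative only, not part of the patch: a minimal C sketch of the
split using AVX2 intrinsics. The function names are invented for this
example; build with -mavx2.)

    #include <immintrin.h>
    #include <stdio.h>

    /* Low broadcast: stays in xmm.  A VEX-encoded xmm write zeroes the
       upper ymm lanes, so the L(less_vec) path never dirties ymm state
       and can return without 'vzeroupper'.  */
    static __m128i
    broadcast_low (int c)
    {
      return _mm_broadcastb_epi8 (_mm_cvtsi32_si128 (c));
    }

    /* High broadcast: the lane-crossing ymm step, now paid for only
       after the size check shows a full VEC store is needed.  */
    static __m256i
    broadcast_high (int c)
    {
      return _mm256_broadcastb_epi8 (_mm_cvtsi32_si128 (c));
    }

    int
    main (void)
    {
      unsigned char buf[32];
      _mm256_storeu_si256 ((__m256i *) buf, broadcast_high (0xab));
      printf ("%#x %#x\n", buf[0], buf[31]);  /* 0xab 0xab */
      return 0;
    }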

For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
byte broadcast.
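
(Again illustrative, not part of the patch: the old and new byte
broadcast sequences written as intrinsics; build with -mssse3, since
'pshufb' itself is an SSSE3 instruction rather than SSE2.)

    #include <immintrin.h>

    /* Old sequence: three shuffles, each dependent on the previous.  */
    static __m128i
    bcast_punpck (int c)
    {
      __m128i x = _mm_cvtsi32_si128 (c);
      x = _mm_unpacklo_epi8 (x, x);      /* punpcklbw */
      x = _mm_unpacklo_epi16 (x, x);     /* punpcklwd */
      return _mm_shuffle_epi32 (x, 0);   /* pshufd $0 */
    }

    /* New sequence: the all-zero shuffle mask comes from a zero idiom
       (pxor of a register with itself, no input dependency), leaving a
       single pshufb on the critical path.  */
    static __m128i
    bcast_pshufb (int c)
    {
      const __m128i zero = _mm_setzero_si128 ();  /* pxor %xmm1, %xmm1 */
      return _mm_shuffle_epi8 (_mm_cvtsi32_si128 (c), zero);
    }

    /* Both produce 16 copies of the low byte of c.  */
    int
    main (void)
    {
      __m128i a = bcast_punpck (0x5a);
      __m128i b = bcast_pshufb (0x5a);
      return _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b)) != 0xffff;
    }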

Results for memset-avx2 small (geomean of N = 20 benchset runs).

size, New Time, Old Time, New / Old
   1,    5.074,    4.399,     0.867
   2,    4.433,    4.411,     0.995
   4,    4.487,    4.415,     0.984
   8,    4.454,    4.396,     0.987
  16,    4.502,    4.443,     0.987

All relevant string/wcsmbs tests are passing.
---
 sysdeps/x86_64/memset.S                       |  21 ++-
 .../multiarch/memset-avx2-unaligned-erms.S    |  18 +-
 .../multiarch/memset-avx512-unaligned-erms.S  |  18 +-
 .../multiarch/memset-evex-unaligned-erms.S    |  18 +-
 .../multiarch/memset-vec-unaligned-erms.S     | 163 +++++++++++-------
 5 files changed, 151 insertions(+), 87 deletions(-)

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 65c09bd0ac..ccf036be53 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -28,17 +28,22 @@
 #define VMOVU     movups
 #define VMOVA     movaps
 
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  punpcklbw %xmm0, %xmm0; \
-  punpcklwd %xmm0, %xmm0; \
-  pshufd $0, %xmm0, %xmm0
+  pxor %xmm1, %xmm1; \
+  pshufb %xmm1, %xmm0; \
+  movq r, %rax
 
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  pshufd $0, %xmm0, %xmm0
+  pshufd $0, %xmm0, %xmm0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 #define SECTION(p)		p
 
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 1af668af0a..c0bf2875d0 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -10,15 +10,18 @@
 # define VMOVU     vmovdqu
 # define VMOVA     vmovdqa
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastb %xmm0, %ymm0
+  movq r, %rax;
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
 # ifndef SECTION
 #  define SECTION(p)		p##.avx
@@ -30,5 +33,6 @@
 #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
 
+# define USE_XMM_LESS_VEC
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index f14d6f8493..5241216a77 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 64b09e77cc..6370021506 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 1e0511c79a..fb9053f4d6 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -58,8 +58,10 @@
 #ifndef MOVQ
 # if VEC_SIZE > 16
 #  define MOVQ				vmovq
+#  define MOVD				vmovd
 # else
 #  define MOVQ				movq
+#  define MOVD				movd
 # endif
 #endif
 
@@ -72,9 +74,17 @@
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 # define END_REG	rcx
 # define LOOP_REG	rdi
+# define LESS_VEC_REG	rax
 #else
 # define END_REG	rdi
 # define LOOP_REG	rdx
+# define LESS_VEC_REG	rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL	1
+#else
+# define XMM_SMALL	0
 #endif
 
 #define PAGE_SIZE 4096
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 
 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 	shl	$2, %RDX_LP
-	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
-	jmp	L(entry_from_bzero)
+	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+	WMEMSET_VDUP_TO_VEC0_LOW()
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec_no_vdup)
+	WMEMSET_VDUP_TO_VEC0_HIGH()
+	jmp	L(entry_from_wmemset)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
 ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH ()
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
-	 */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
-	.p2align 4,, 10
+	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
 	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
 	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
@@ -212,6 +228,7 @@ L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
 	.p2align 4,, 10
 L(less_vec):
+L(less_vec_no_vdup):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
 	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
-
-	/* Two different methods of setting up pointers / compare. The
-	   two methods are based on the fact that EVEX/AVX512 mov
-	   instructions take more bytes then AVX2/SSE2 mov instructions. As
-	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
-	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
-	   this saves code size and keeps a few targets in one fetch block.
-	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
-	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
-	   LOOP_4X_OFFSET) with LEA_BID.  */
-
-	/* END_REG is rcx for EVEX/AVX512.  */
-	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
-#endif
-
-	/* Stores to first 2x VEC before cmp as any path forward will
-	   require it.  */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), VEC_SIZE(%rax)
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
 
 
+	/* Two different methods of setting up pointers / compare. The two
+	   methods are based on the fact that EVEX/AVX512 mov instructions take
+	   more bytes than AVX2/SSE2 mov instructions, and that EVEX/AVX512
+	   machines also have fast LEA_BID. Both set up END_REG to avoid a
+	   complex address mode. For EVEX/AVX512 this saves code size and keeps
+	   a few targets in one fetch block. For AVX2/SSE2 this helps prevent
+	   AGU bottlenecks.  */
 #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
 	addq	%rdx, %END_REG
@@ -292,6 +299,15 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_2x_vec)
 
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+	   LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
 	/* Store next 2x vec regardless.  */
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
@@ -355,65 +371,92 @@ L(stosb_local):
 	/* Define L(less_vec) only if not otherwise defined.  */
 	.p2align 4
 L(less_vec):
+	/* Broadcast esi to partial register (i.e. VEC_SIZE == 32 broadcasts to
+	   xmm). This only does anything for AVX2.  */
+	MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_no_vdup):
 #endif
 L(cross_page):
 #if VEC_SIZE > 32
 	cmpl	$32, %edx
-	jae	L(between_32_63)
+	jge	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
 	cmpl	$16, %edx
-	jae	L(between_16_31)
+	jge	L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, %rcx
 #endif
-	MOVQ	%XMM0, %rdi
 	cmpl	$8, %edx
-	jae	L(between_8_15)
+	jge	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
+	jge	L(between_4_7)
 	cmpl	$1, %edx
-	ja	L(between_2_3)
-	jb	L(return)
-	movb	%sil, (%rax)
-	VZEROUPPER_RETURN
+	jg	L(between_2_3)
+	jl	L(return)
+	movb	%sil, (%LESS_VEC_REG)
+	ret
 
-	/* Align small targets only if not doing so would cross a fetch
-	   line.  */
+	/* Align small targets only if not doing so would cross a fetch line.
+	 */
 #if VEC_SIZE > 32
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%rax)
-	VMOVU	%YMM0, -32(%rax, %rdx)
+	VMOVU	%YMM0, (%LESS_VEC_REG)
+	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 #if VEC_SIZE >= 32
-	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%rax)
-	VMOVU	%XMM0, -16(%rax, %rdx)
-	VZEROUPPER_RETURN
+	VMOVU	%XMM0, (%LESS_VEC_REG)
+	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	ret
 #endif
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	%rdi, (%rax)
-	movq	%rdi, -8(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, (%rdi)
+	MOVQ	%XMM0, -8(%rdi, %rdx)
+#else
+	movq	%rcx, (%LESS_VEC_REG)
+	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%edi, (%rax)
-	movl	%edi, -4(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVD	%XMM0, (%rdi)
+	MOVD	%XMM0, -4(%rdi, %rdx)
+#else
+	movl	%ecx, (%LESS_VEC_REG)
+	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* 4 * XMM_SMALL for the third mov for AVX2.  */
+	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%di, (%rax)
-	movb	%dil, -1(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	movb	%sil, (%rdi)
+	movb	%sil, 1(%rdi)
+	movb	%sil, -1(%rdi, %rdx)
+#else
+	movw	%cx, (%LESS_VEC_REG)
+	movb	%sil, -1(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
-- 
2.25.1



* Re: [PATCH v1] x86: Improve vec generation in memset-vec-unaligned-erms.S
  2022-02-05 22:42 [PATCH v1] x86: Improve vec generation in memset-vec-unaligned-erms.S Noah Goldstein
@ 2022-02-05 22:50 ` H.J. Lu
  2022-02-06  6:54 ` [PATCH v2] " Noah Goldstein
  1 sibling, 0 replies; 6+ messages in thread
From: H.J. Lu @ 2022-02-05 22:50 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Sat, Feb 5, 2022 at 2:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug.
>
> Split vec generation into multiple steps. This allows the
> broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
> case. This saves an expensive lane-cross instruction and removes
> the need for 'vzeroupper'.
>
> For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
> byte broadcast.

I am working on the same code.  Please give me a few hours.

-- 
H.J.


* [PATCH v2] x86: Improve vec generation in memset-vec-unaligned-erms.S
  2022-02-05 22:42 [PATCH v1] x86: Improve vec generation in memset-vec-unaligned-erms.S Noah Goldstein
  2022-02-05 22:50 ` H.J. Lu
@ 2022-02-06  6:54 ` Noah Goldstein
  2022-02-06 16:28   ` H.J. Lu
  1 sibling, 1 reply; 6+ messages in thread
From: Noah Goldstein @ 2022-02-06  6:54 UTC (permalink / raw)
  To: libc-alpha

No bug.

Split vec generation into multiple steps. This allows the
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
case. This saves an expensive lane-cross instruction and removes
the need for 'vzeroupper'.

For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
byte broadcast.

Results for memset-avx2 small (geomean of N = 20 benchset runs).

size, New Time, Old Time, New / Old
   0,    4.100,    3.831,     0.934
   1,    5.074,    4.399,     0.867
   2,    4.433,    4.411,     0.995
   4,    4.487,    4.415,     0.984
   8,    4.454,    4.396,     0.987
  16,    4.502,    4.443,     0.987

All relevant string/wcsmbs tests are passing.
---
 sysdeps/x86_64/memset.S                       |  21 ++-
 .../multiarch/memset-avx2-unaligned-erms.S    |  18 +-
 .../multiarch/memset-avx512-unaligned-erms.S  |  18 +-
 .../multiarch/memset-evex-unaligned-erms.S    |  18 +-
 .../multiarch/memset-vec-unaligned-erms.S     | 164 +++++++++++-------
 5 files changed, 152 insertions(+), 87 deletions(-)

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 65c09bd0ac..ccf036be53 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -28,17 +28,22 @@
 #define VMOVU     movups
 #define VMOVA     movaps
 
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  punpcklbw %xmm0, %xmm0; \
-  punpcklwd %xmm0, %xmm0; \
-  pshufd $0, %xmm0, %xmm0
+  pxor %xmm1, %xmm1; \
+  pshufb %xmm1, %xmm0; \
+  movq r, %rax
 
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  pshufd $0, %xmm0, %xmm0
+  pshufd $0, %xmm0, %xmm0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 #define SECTION(p)		p
 
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 1af668af0a..c0bf2875d0 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -10,15 +10,18 @@
 # define VMOVU     vmovdqu
 # define VMOVA     vmovdqa
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastb %xmm0, %ymm0
+  movq r, %rax;
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
 # ifndef SECTION
 #  define SECTION(p)		p##.avx
@@ -30,5 +33,6 @@
 #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
 
+# define USE_XMM_LESS_VEC
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index f14d6f8493..5241216a77 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 64b09e77cc..6370021506 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 1e0511c79a..1b502b78e4 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -58,8 +58,10 @@
 #ifndef MOVQ
 # if VEC_SIZE > 16
 #  define MOVQ				vmovq
+#  define MOVD				vmovd
 # else
 #  define MOVQ				movq
+#  define MOVD				movd
 # endif
 #endif
 
@@ -72,9 +74,17 @@
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 # define END_REG	rcx
 # define LOOP_REG	rdi
+# define LESS_VEC_REG	rax
 #else
 # define END_REG	rdi
 # define LOOP_REG	rdx
+# define LESS_VEC_REG	rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL	1
+#else
+# define XMM_SMALL	0
 #endif
 
 #define PAGE_SIZE 4096
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 
 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 	shl	$2, %RDX_LP
-	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
-	jmp	L(entry_from_bzero)
+	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+	WMEMSET_VDUP_TO_VEC0_LOW()
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec_no_vdup)
+	WMEMSET_VDUP_TO_VEC0_HIGH()
+	jmp	L(entry_from_wmemset)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
 ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH ()
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
-	 */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
-	.p2align 4,, 10
+	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
 	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
 	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
@@ -212,6 +228,7 @@ L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
 	.p2align 4,, 10
 L(less_vec):
+L(less_vec_no_vdup):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
 	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
-
-	/* Two different methods of setting up pointers / compare. The
-	   two methods are based on the fact that EVEX/AVX512 mov
-	   instructions take more bytes then AVX2/SSE2 mov instructions. As
-	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
-	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
-	   this saves code size and keeps a few targets in one fetch block.
-	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
-	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
-	   LOOP_4X_OFFSET) with LEA_BID.  */
-
-	/* END_REG is rcx for EVEX/AVX512.  */
-	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
-#endif
-
-	/* Stores to first 2x VEC before cmp as any path forward will
-	   require it.  */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), VEC_SIZE(%rax)
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
 
 
+	/* Two different methods of setting up pointers / compare. The two
+	   methods are based on the fact that EVEX/AVX512 mov instructions take
+	   more bytes than AVX2/SSE2 mov instructions, and that EVEX/AVX512
+	   machines also have fast LEA_BID. Both set up END_REG to avoid a
+	   complex address mode. For EVEX/AVX512 this saves code size and keeps
+	   a few targets in one fetch block. For AVX2/SSE2 this helps prevent
+	   AGU bottlenecks.  */
 #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
 	addq	%rdx, %END_REG
@@ -292,6 +299,15 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_2x_vec)
 
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+	   LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
 	/* Store next 2x vec regardless.  */
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
@@ -355,65 +371,93 @@ L(stosb_local):
 	/* Define L(less_vec) only if not otherwise defined.  */
 	.p2align 4
 L(less_vec):
+	/* Broadcast esi to partial register (i.e. VEC_SIZE == 32 broadcasts to
+	   xmm). This only does anything for AVX2.  */
+	MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_no_vdup):
 #endif
 L(cross_page):
 #if VEC_SIZE > 32
 	cmpl	$32, %edx
-	jae	L(between_32_63)
+	jge	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
 	cmpl	$16, %edx
-	jae	L(between_16_31)
+	jge	L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, %rcx
 #endif
-	MOVQ	%XMM0, %rdi
 	cmpl	$8, %edx
-	jae	L(between_8_15)
+	jge	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
+	jge	L(between_4_7)
 	cmpl	$1, %edx
-	ja	L(between_2_3)
-	jb	L(return)
-	movb	%sil, (%rax)
-	VZEROUPPER_RETURN
+	jg	L(between_2_3)
+	jl	L(between_0_0)
+	movb	%sil, (%LESS_VEC_REG)
+L(between_0_0):
+	ret
 
-	/* Align small targets only if not doing so would cross a fetch
-	   line.  */
+	/* Align small targets only if not doing so would cross a fetch line.
+	 */
 #if VEC_SIZE > 32
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%rax)
-	VMOVU	%YMM0, -32(%rax, %rdx)
+	VMOVU	%YMM0, (%LESS_VEC_REG)
+	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 #if VEC_SIZE >= 32
-	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%rax)
-	VMOVU	%XMM0, -16(%rax, %rdx)
-	VZEROUPPER_RETURN
+	VMOVU	%XMM0, (%LESS_VEC_REG)
+	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	ret
 #endif
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	%rdi, (%rax)
-	movq	%rdi, -8(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, (%rdi)
+	MOVQ	%XMM0, -8(%rdi, %rdx)
+#else
+	movq	%rcx, (%LESS_VEC_REG)
+	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%edi, (%rax)
-	movl	%edi, -4(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVD	%XMM0, (%rdi)
+	MOVD	%XMM0, -4(%rdi, %rdx)
+#else
+	movl	%ecx, (%LESS_VEC_REG)
+	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* 4 * XMM_SMALL for the third mov for AVX2.  */
+	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%di, (%rax)
-	movb	%dil, -1(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	movb	%sil, (%rdi)
+	movb	%sil, 1(%rdi)
+	movb	%sil, -1(%rdi, %rdx)
+#else
+	movw	%cx, (%LESS_VEC_REG)
+	movb	%sil, -1(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
-- 
2.25.1



* Re: [PATCH v2] x86: Improve vec generation in memset-vec-unaligned-erms.S
  2022-02-06  6:54 ` [PATCH v2] " Noah Goldstein
@ 2022-02-06 16:28   ` H.J. Lu
  2022-02-07  3:48     ` Noah Goldstein
  0 siblings, 1 reply; 6+ messages in thread
From: H.J. Lu @ 2022-02-06 16:28 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Sat, Feb 5, 2022 at 10:54 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug.
>
> Split vec generation into multiple steps. This allows the
> broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
> case. This saves an expensive lane-cross instruction and removes
> the need for 'vzeroupper'.
>
> For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
> byte broadcast.
>
> Results for memset-avx2 small (geomean of N = 20 benchset runs).
>
> size, New Time, Old Time, New / Old
>    0,    4.100,    3.831,     0.934
>    1,    5.074,    4.399,     0.867
>    2,    4.433,    4.411,     0.995
>    4,    4.487,    4.415,     0.984
>    8,    4.454,    4.396,     0.987
>   16,    4.502,    4.443,     0.987
>
> All relevant string/wcsmbs tests are passing.
> ---
>  sysdeps/x86_64/memset.S                       |  21 ++-
>  .../multiarch/memset-avx2-unaligned-erms.S    |  18 +-
>  .../multiarch/memset-avx512-unaligned-erms.S  |  18 +-
>  .../multiarch/memset-evex-unaligned-erms.S    |  18 +-
>  .../multiarch/memset-vec-unaligned-erms.S     | 164 +++++++++++-------
>  5 files changed, 152 insertions(+), 87 deletions(-)
>
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index 65c09bd0ac..ccf036be53 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -28,17 +28,22 @@
>  #define VMOVU     movups
>  #define VMOVA     movaps
>
> -#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    movd d, %xmm0; \
> -  movq r, %rax; \
> -  punpcklbw %xmm0, %xmm0; \
> -  punpcklwd %xmm0, %xmm0; \
> -  pshufd $0, %xmm0, %xmm0
> +  pxor %xmm1, %xmm1; \
> +  pshufb %xmm1, %xmm0; \
> +  movq r, %rax
>
> -#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    movd d, %xmm0; \
> -  movq r, %rax; \
> -  pshufd $0, %xmm0, %xmm0
> +  pshufd $0, %xmm0, %xmm0; \
> +  movq r, %rax
> +
> +# define MEMSET_VDUP_TO_VEC0_HIGH()
> +# define MEMSET_VDUP_TO_VEC0_LOW()
> +
> +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> +# define WMEMSET_VDUP_TO_VEC0_LOW()
>
>  #define SECTION(p)             p
>
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index 1af668af0a..c0bf2875d0 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -10,15 +10,18 @@
>  # define VMOVU     vmovdqu
>  # define VMOVA     vmovdqa
>
> -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    vmovd d, %xmm0; \
> -  movq r, %rax; \
> -  vpbroadcastb %xmm0, %ymm0
> +  movq r, %rax;
>
> -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> -  vmovd d, %xmm0; \
> -  movq r, %rax; \
> -  vpbroadcastd %xmm0, %ymm0
> +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> +  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
> +
> +# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
> +# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
> +
> +# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
> +# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
>
>  # ifndef SECTION
>  #  define SECTION(p)           p##.avx
> @@ -30,5 +33,6 @@
>  #  define WMEMSET_SYMBOL(p,s)  p##_avx2_##s
>  # endif
>
> +# define USE_XMM_LESS_VEC
>  # include "memset-vec-unaligned-erms.S"
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index f14d6f8493..5241216a77 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -15,13 +15,19 @@
>
>  # define VZEROUPPER
>
> -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> -  movq r, %rax; \
> -  vpbroadcastb d, %VEC0
> +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> +  vpbroadcastb d, %VEC0; \
> +  movq r, %rax
>
> -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> -  movq r, %rax; \
> -  vpbroadcastd d, %VEC0
> +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> +  vpbroadcastd d, %VEC0; \
> +  movq r, %rax
> +
> +# define MEMSET_VDUP_TO_VEC0_HIGH()
> +# define MEMSET_VDUP_TO_VEC0_LOW()
> +
> +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> +# define WMEMSET_VDUP_TO_VEC0_LOW()
>
>  # define SECTION(p)            p##.evex512
>  # define MEMSET_SYMBOL(p,s)    p##_avx512_##s
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index 64b09e77cc..6370021506 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -15,13 +15,19 @@
>
>  # define VZEROUPPER
>
> -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> -  movq r, %rax; \
> -  vpbroadcastb d, %VEC0
> +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> +  vpbroadcastb d, %VEC0; \
> +  movq r, %rax
>
> -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> -  movq r, %rax; \
> -  vpbroadcastd d, %VEC0
> +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> +  vpbroadcastd d, %VEC0; \
> +  movq r, %rax
> +
> +# define MEMSET_VDUP_TO_VEC0_HIGH()
> +# define MEMSET_VDUP_TO_VEC0_LOW()
> +
> +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> +# define WMEMSET_VDUP_TO_VEC0_LOW()
>
>  # define SECTION(p)            p##.evex
>  # define MEMSET_SYMBOL(p,s)    p##_evex_##s
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 1e0511c79a..1b502b78e4 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -58,8 +58,10 @@
>  #ifndef MOVQ
>  # if VEC_SIZE > 16
>  #  define MOVQ                         vmovq
> +#  define MOVD                         vmovd
>  # else
>  #  define MOVQ                         movq
> +#  define MOVD                         movd
>  # endif
>  #endif
>
> @@ -72,9 +74,17 @@
>  #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
>  # define END_REG       rcx
>  # define LOOP_REG      rdi
> +# define LESS_VEC_REG  rax
>  #else
>  # define END_REG       rdi
>  # define LOOP_REG      rdx
> +# define LESS_VEC_REG  rdi
> +#endif
> +
> +#ifdef USE_XMM_LESS_VEC
> +# define XMM_SMALL     1
> +#else
> +# define XMM_SMALL     0
>  #endif
>
>  #define PAGE_SIZE 4096
> @@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
>
>  ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
>         shl     $2, %RDX_LP
> -       WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> -       jmp     L(entry_from_bzero)
> +       WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> +       WMEMSET_VDUP_TO_VEC0_LOW()
> +       cmpq    $VEC_SIZE, %rdx
> +       jb      L(less_vec_no_vdup)
> +       WMEMSET_VDUP_TO_VEC0_HIGH()
> +       jmp     L(entry_from_wmemset)
>  END (WMEMSET_SYMBOL (__wmemset, unaligned))
>  #endif
>
> @@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
>  #endif
>
>  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> -       MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> +       MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
>  # ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         mov     %edx, %edx
> @@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
>  L(entry_from_bzero):
>         cmpq    $VEC_SIZE, %rdx
>         jb      L(less_vec)
> +       MEMSET_VDUP_TO_VEC0_HIGH()
> +L(entry_from_wmemset):
>         cmpq    $(VEC_SIZE * 2), %rdx
>         ja      L(more_2x_vec)
>         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> @@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
>  # endif
>
>  ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> -       MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> +       MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
>  # ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         mov     %edx, %edx
>  # endif
>         cmp     $VEC_SIZE, %RDX_LP
>         jb      L(less_vec)
> +       MEMSET_VDUP_TO_VEC0_HIGH ()
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(stosb_more_2x_vec)
> -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
> -        */
> -       VMOVU   %VEC(0), (%rax)
> -       VMOVU   %VEC(0), -VEC_SIZE(%rax, %rdx)
> +       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +       VMOVU   %VEC(0), (%rdi)
> +       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
>         VZEROUPPER_RETURN
>  #endif
>
> -       .p2align 4,, 10
> +       .p2align 4,, 4
>  L(last_2x_vec):
>  #ifdef USE_LESS_VEC_MASK_STORE
> -       VMOVU   %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
> -       VMOVU   %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
> +       VMOVU   %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
> +       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
>  #else
>         VMOVU   %VEC(0), (VEC_SIZE * -2)(%rdi)
>         VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi)
> @@ -212,6 +228,7 @@ L(last_2x_vec):
>  #ifdef USE_LESS_VEC_MASK_STORE
>         .p2align 4,, 10
>  L(less_vec):
> +L(less_vec_no_vdup):
>         /* Less than 1 VEC.  */
>  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
>  #  error Unsupported VEC_SIZE!
> @@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
>         /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
>            and (4x, 8x] jump to target.  */
>  L(more_2x_vec):
> -
> -       /* Two different methods of setting up pointers / compare. The
> -          two methods are based on the fact that EVEX/AVX512 mov
> -          instructions take more bytes then AVX2/SSE2 mov instructions. As
> -          well that EVEX/AVX512 machines also have fast LEA_BID. Both
> -          setup and END_REG to avoid complex address mode. For EVEX/AVX512
> -          this saves code size and keeps a few targets in one fetch block.
> -          For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
> -#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> -       /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
> -          LOOP_4X_OFFSET) with LEA_BID.  */
> -
> -       /* END_REG is rcx for EVEX/AVX512.  */
> -       leaq    -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> -#endif
> -
> -       /* Stores to first 2x VEC before cmp as any path forward will
> -          require it.  */
> -       VMOVU   %VEC(0), (%rax)
> -       VMOVU   %VEC(0), VEC_SIZE(%rax)
> +       /* Store next 2x vec regardless.  */
> +       VMOVU   %VEC(0), (%rdi)
> +       VMOVU   %VEC(0), (VEC_SIZE * 1)(%rdi)
>
>
> +       /* Two different methods of setting up pointers / compare. The two
> +          methods are based on the fact that EVEX/AVX512 mov instructions take
> +          more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
> +          machines also have fast LEA_BID. Both setup and END_REG to avoid complex
> +          address mode. For EVEX/AVX512 this saves code size and keeps a few
> +          targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
> +          bottlenecks.  */
>  #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
>         /* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
>         addq    %rdx, %END_REG
> @@ -292,6 +299,15 @@ L(more_2x_vec):
>         cmpq    $(VEC_SIZE * 4), %rdx
>         jbe     L(last_2x_vec)
>
> +
> +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> +       /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
> +          LEA_BID.  */
> +
> +       /* END_REG is rcx for EVEX/AVX512.  */
> +       leaq    -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> +#endif
> +
>         /* Store next 2x vec regardless.  */
>         VMOVU   %VEC(0), (VEC_SIZE * 2)(%rax)
>         VMOVU   %VEC(0), (VEC_SIZE * 3)(%rax)
> @@ -355,65 +371,93 @@ L(stosb_local):
>         /* Define L(less_vec) only if not otherwise defined.  */
>         .p2align 4
>  L(less_vec):
> +       /* Broadcast esi to a partial register (i.e. VEC_SIZE == 32 broadcasts
> +          to xmm). This only does anything for AVX2.  */
> +       MEMSET_VDUP_TO_VEC0_LOW ()
> +L(less_vec_no_vdup):
>  #endif
>  L(cross_page):
>  #if VEC_SIZE > 32
>         cmpl    $32, %edx
> -       jae     L(between_32_63)
> +       jge     L(between_32_63)
>  #endif
>  #if VEC_SIZE > 16
>         cmpl    $16, %edx
> -       jae     L(between_16_31)
> +       jge     L(between_16_31)
> +#endif
> +#ifndef USE_XMM_LESS_VEC
> +       MOVQ    %XMM0, %rcx
>  #endif
> -       MOVQ    %XMM0, %rdi
>         cmpl    $8, %edx
> -       jae     L(between_8_15)
> +       jge     L(between_8_15)
>         cmpl    $4, %edx
> -       jae     L(between_4_7)
> +       jge     L(between_4_7)
>         cmpl    $1, %edx
> -       ja      L(between_2_3)
> -       jb      L(return)
> -       movb    %sil, (%rax)
> -       VZEROUPPER_RETURN
> +       jg      L(between_2_3)
> +       jl      L(between_0_0)
> +       movb    %sil, (%LESS_VEC_REG)
> +L(between_0_0):
> +       ret
>
> -       /* Align small targets only if not doing so would cross a fetch
> -          line.  */
> +       /* Align small targets only if not doing so would cross a fetch line.
> +        */
>  #if VEC_SIZE > 32
>         .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
>         /* From 32 to 63.  No branch when size == 32.  */
>  L(between_32_63):
> -       VMOVU   %YMM0, (%rax)
> -       VMOVU   %YMM0, -32(%rax, %rdx)
> +       VMOVU   %YMM0, (%LESS_VEC_REG)
> +       VMOVU   %YMM0, -32(%LESS_VEC_REG, %rdx)
>         VZEROUPPER_RETURN
>  #endif
>
>  #if VEC_SIZE >= 32
> -       .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> +       .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
>  L(between_16_31):
>         /* From 16 to 31.  No branch when size == 16.  */
> -       VMOVU   %XMM0, (%rax)
> -       VMOVU   %XMM0, -16(%rax, %rdx)
> -       VZEROUPPER_RETURN
> +       VMOVU   %XMM0, (%LESS_VEC_REG)
> +       VMOVU   %XMM0, -16(%LESS_VEC_REG, %rdx)
> +       ret
>  #endif
>
> -       .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> +       /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> +        */
> +       .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
>  L(between_8_15):
>         /* From 8 to 15.  No branch when size == 8.  */
> -       movq    %rdi, (%rax)
> -       movq    %rdi, -8(%rax, %rdx)
> -       VZEROUPPER_RETURN
> +#ifdef USE_XMM_LESS_VEC
> +       MOVQ    %XMM0, (%rdi)
> +       MOVQ    %XMM0, -8(%rdi, %rdx)
> +#else
> +       movq    %rcx, (%LESS_VEC_REG)
> +       movq    %rcx, -8(%LESS_VEC_REG, %rdx)
> +#endif
> +       ret
>
> -       .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
> +       /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> +        */
> +       .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
>  L(between_4_7):
>         /* From 4 to 7.  No branch when size == 4.  */
> -       movl    %edi, (%rax)
> -       movl    %edi, -4(%rax, %rdx)
> -       VZEROUPPER_RETURN
> +#ifdef USE_XMM_LESS_VEC
> +       MOVD    %XMM0, (%rdi)
> +       MOVD    %XMM0, -4(%rdi, %rdx)
> +#else
> +       movl    %ecx, (%LESS_VEC_REG)
> +       movl    %ecx, -4(%LESS_VEC_REG, %rdx)
> +#endif
> +       ret
>
> -       .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> +       /* 4 * XMM_SMALL for the third mov for AVX2.  */
> +       .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
>  L(between_2_3):
>         /* From 2 to 3.  No branch when size == 2.  */
> -       movw    %di, (%rax)
> -       movb    %dil, -1(%rax, %rdx)
> -       VZEROUPPER_RETURN
> +#ifdef USE_XMM_LESS_VEC
> +       movb    %sil, (%rdi)
> +       movb    %sil, 1(%rdi)
> +       movb    %sil, -1(%rdi, %rdx)
> +#else
> +       movw    %cx, (%LESS_VEC_REG)
> +       movb    %sil, -1(%LESS_VEC_REG, %rdx)
> +#endif
> +       ret
>  END (MEMSET_SYMBOL (__memset, unaligned_erms))
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 6+ messages in thread
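
A note for readers following the small-size paths quoted above: the
L(between_8_15), L(between_4_7), and L(between_32_63) blocks all rely on
the same branchless pattern, one store anchored at the start of the
buffer and one anchored at its end, overlapping whenever the length is
below the bucket's maximum. A minimal C sketch of the 8-to-15-byte
bucket (the helper name is invented; the authoritative version is the
assembly in the patch):

  #include <stdint.h>
  #include <string.h>

  /* Set n bytes (8 <= n <= 15) to c with two possibly overlapping
     8-byte stores and no branch.  Illustration only.  */
  static void
  set_between_8_15 (unsigned char *p, size_t n, unsigned char c)
  {
    uint64_t v = 0x0101010101010101ULL * c; /* broadcast c to 8 bytes */
    memcpy (p, &v, 8);         /* store anchored at the start */
    memcpy (p + n - 8, &v, 8); /* store anchored at the end */
  }

When n == 8 the two stores coincide exactly, which is why the comments
above say "No branch when size == 8".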

* Re: [PATCH v2] x86: Improve vec generation in memset-vec-unaligned-erms.S
  2022-02-06 16:28   ` H.J. Lu
@ 2022-02-07  3:48     ` Noah Goldstein
  2022-05-04  5:46       ` Sunil Pandey
  0 siblings, 1 reply; 6+ messages in thread
From: Noah Goldstein @ 2022-02-07  3:48 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Sun, Feb 6, 2022 at 10:29 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Feb 5, 2022 at 10:54 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > No bug.
> >
> > Split vec generation into multiple steps. This allows the
> > broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
> > case. This saves an expensive lane-cross instruction and removes
> > the need for 'vzeroupper'.
> >
> > For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
> > byte broadcast.
> >
> > Results for memset-avx2 small (geomean of N = 20 benchset runs).
> >
> > size, Old Time, New Time, New / Old
> >    0,    4.100,    3.831,     0.934
> >    1,    5.074,    4.399,     0.867
> >    2,    4.433,    4.411,     0.995
> >    4,    4.487,    4.415,     0.984
> >    8,    4.454,    4.396,     0.987
> >   16,    4.502,    4.443,     0.987
> >
> > All relevant string/wcsmbs tests are passing.
> > ---
> >  sysdeps/x86_64/memset.S                       |  21 ++-
> >  .../multiarch/memset-avx2-unaligned-erms.S    |  18 +-
> >  .../multiarch/memset-avx512-unaligned-erms.S  |  18 +-
> >  .../multiarch/memset-evex-unaligned-erms.S    |  18 +-
> >  .../multiarch/memset-vec-unaligned-erms.S     | 164 +++++++++++-------
> >  5 files changed, 152 insertions(+), 87 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > index 65c09bd0ac..ccf036be53 100644
> > --- a/sysdeps/x86_64/memset.S
> > +++ b/sysdeps/x86_64/memset.S
> > @@ -28,17 +28,22 @@
> >  #define VMOVU     movups
> >  #define VMOVA     movaps
> >
> > -#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    movd d, %xmm0; \
> > -  movq r, %rax; \
> > -  punpcklbw %xmm0, %xmm0; \
> > -  punpcklwd %xmm0, %xmm0; \
> > -  pshufd $0, %xmm0, %xmm0
> > +  pxor %xmm1, %xmm1; \
> > +  pshufb %xmm1, %xmm0; \
> > +  movq r, %rax
> >
> > -#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    movd d, %xmm0; \
> > -  movq r, %rax; \
> > -  pshufd $0, %xmm0, %xmm0
> > +  pshufd $0, %xmm0, %xmm0; \
> > +  movq r, %rax
> > +
> > +# define MEMSET_VDUP_TO_VEC0_HIGH()
> > +# define MEMSET_VDUP_TO_VEC0_LOW()
> > +
> > +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> > +# define WMEMSET_VDUP_TO_VEC0_LOW()
> >
> >  #define SECTION(p)             p
> >
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > index 1af668af0a..c0bf2875d0 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > @@ -10,15 +10,18 @@
> >  # define VMOVU     vmovdqu
> >  # define VMOVA     vmovdqa
> >
> > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    vmovd d, %xmm0; \
> > -  movq r, %rax; \
> > -  vpbroadcastb %xmm0, %ymm0
> > +  movq r, %rax;
> >
> > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  vmovd d, %xmm0; \
> > -  movq r, %rax; \
> > -  vpbroadcastd %xmm0, %ymm0
> > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > +  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
> > +
> > +# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
> > +# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
> > +# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
> >
> >  # ifndef SECTION
> >  #  define SECTION(p)           p##.avx
> > @@ -30,5 +33,6 @@
> >  #  define WMEMSET_SYMBOL(p,s)  p##_avx2_##s
> >  # endif
> >
> > +# define USE_XMM_LESS_VEC
> >  # include "memset-vec-unaligned-erms.S"
> >  #endif
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > index f14d6f8493..5241216a77 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > @@ -15,13 +15,19 @@
> >
> >  # define VZEROUPPER
> >
> > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  movq r, %rax; \
> > -  vpbroadcastb d, %VEC0
> > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > +  vpbroadcastb d, %VEC0; \
> > +  movq r, %rax
> >
> > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  movq r, %rax; \
> > -  vpbroadcastd d, %VEC0
> > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > +  vpbroadcastd d, %VEC0; \
> > +  movq r, %rax
> > +
> > +# define MEMSET_VDUP_TO_VEC0_HIGH()
> > +# define MEMSET_VDUP_TO_VEC0_LOW()
> > +
> > +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> > +# define WMEMSET_VDUP_TO_VEC0_LOW()
> >
> >  # define SECTION(p)            p##.evex512
> >  # define MEMSET_SYMBOL(p,s)    p##_avx512_##s
> > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > index 64b09e77cc..6370021506 100644
> > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > @@ -15,13 +15,19 @@
> >
> >  # define VZEROUPPER
> >
> > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  movq r, %rax; \
> > -  vpbroadcastb d, %VEC0
> > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > +  vpbroadcastb d, %VEC0; \
> > +  movq r, %rax
> >
> > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  movq r, %rax; \
> > -  vpbroadcastd d, %VEC0
> > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > +  vpbroadcastd d, %VEC0; \
> > +  movq r, %rax
> > +
> > +# define MEMSET_VDUP_TO_VEC0_HIGH()
> > +# define MEMSET_VDUP_TO_VEC0_LOW()
> > +
> > +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> > +# define WMEMSET_VDUP_TO_VEC0_LOW()
> >
> >  # define SECTION(p)            p##.evex
> >  # define MEMSET_SYMBOL(p,s)    p##_evex_##s
> > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > index 1e0511c79a..1b502b78e4 100644
> > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > @@ -58,8 +58,10 @@
> >  #ifndef MOVQ
> >  # if VEC_SIZE > 16
> >  #  define MOVQ                         vmovq
> > +#  define MOVD                         vmovd
> >  # else
> >  #  define MOVQ                         movq
> > +#  define MOVD                         movd
> >  # endif
> >  #endif
> >
> > @@ -72,9 +74,17 @@
> >  #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> >  # define END_REG       rcx
> >  # define LOOP_REG      rdi
> > +# define LESS_VEC_REG  rax
> >  #else
> >  # define END_REG       rdi
> >  # define LOOP_REG      rdx
> > +# define LESS_VEC_REG  rdi
> > +#endif
> > +
> > +#ifdef USE_XMM_LESS_VEC
> > +# define XMM_SMALL     1
> > +#else
> > +# define XMM_SMALL     0
> >  #endif
> >
> >  #define PAGE_SIZE 4096
> > @@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
> >
> >  ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> >         shl     $2, %RDX_LP
> > -       WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > -       jmp     L(entry_from_bzero)
> > +       WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> > +       WMEMSET_VDUP_TO_VEC0_LOW()
> > +       cmpq    $VEC_SIZE, %rdx
> > +       jb      L(less_vec_no_vdup)
> > +       WMEMSET_VDUP_TO_VEC0_HIGH()
> > +       jmp     L(entry_from_wmemset)
> >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> >  #endif
> >
> > @@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> >  #endif
> >
> >  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> > -       MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > +       MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> >  # ifdef __ILP32__
> >         /* Clear the upper 32 bits.  */
> >         mov     %edx, %edx
> > @@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> >  L(entry_from_bzero):
> >         cmpq    $VEC_SIZE, %rdx
> >         jb      L(less_vec)
> > +       MEMSET_VDUP_TO_VEC0_HIGH()
> > +L(entry_from_wmemset):
> >         cmpq    $(VEC_SIZE * 2), %rdx
> >         ja      L(more_2x_vec)
> >         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > @@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> >  # endif
> >
> >  ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > -       MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > +       MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> >  # ifdef __ILP32__
> >         /* Clear the upper 32 bits.  */
> >         mov     %edx, %edx
> >  # endif
> >         cmp     $VEC_SIZE, %RDX_LP
> >         jb      L(less_vec)
> > +       MEMSET_VDUP_TO_VEC0_HIGH ()
> >         cmp     $(VEC_SIZE * 2), %RDX_LP
> >         ja      L(stosb_more_2x_vec)
> > -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
> > -        */
> > -       VMOVU   %VEC(0), (%rax)
> > -       VMOVU   %VEC(0), -VEC_SIZE(%rax, %rdx)
> > +       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > +       VMOVU   %VEC(0), (%rdi)
> > +       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> >         VZEROUPPER_RETURN
> >  #endif
> >
> > -       .p2align 4,, 10
> > +       .p2align 4,, 4
> >  L(last_2x_vec):
> >  #ifdef USE_LESS_VEC_MASK_STORE
> > -       VMOVU   %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
> > -       VMOVU   %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
> > +       VMOVU   %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
> > +       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> >  #else
> >         VMOVU   %VEC(0), (VEC_SIZE * -2)(%rdi)
> >         VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi)
> > @@ -212,6 +228,7 @@ L(last_2x_vec):
> >  #ifdef USE_LESS_VEC_MASK_STORE
> >         .p2align 4,, 10
> >  L(less_vec):
> > +L(less_vec_no_vdup):
> >         /* Less than 1 VEC.  */
> >  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> >  #  error Unsupported VEC_SIZE!
> > @@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
> >         /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
> >            and (4x, 8x] jump to target.  */
> >  L(more_2x_vec):
> > -
> > -       /* Two different methods of setting up pointers / compare. The
> > -          two methods are based on the fact that EVEX/AVX512 mov
> > -          instructions take more bytes then AVX2/SSE2 mov instructions. As
> > -          well that EVEX/AVX512 machines also have fast LEA_BID. Both
> > -          setup and END_REG to avoid complex address mode. For EVEX/AVX512
> > -          this saves code size and keeps a few targets in one fetch block.
> > -          For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
> > -#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> > -       /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
> > -          LOOP_4X_OFFSET) with LEA_BID.  */
> > -
> > -       /* END_REG is rcx for EVEX/AVX512.  */
> > -       leaq    -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> > -#endif
> > -
> > -       /* Stores to first 2x VEC before cmp as any path forward will
> > -          require it.  */
> > -       VMOVU   %VEC(0), (%rax)
> > -       VMOVU   %VEC(0), VEC_SIZE(%rax)
> > +       /* Store next 2x vec regardless.  */
> > +       VMOVU   %VEC(0), (%rdi)
> > +       VMOVU   %VEC(0), (VEC_SIZE * 1)(%rdi)
> >
> >
> > +       /* Two different methods of setting up pointers / compare. The two
> > +          methods are based on the fact that EVEX/AVX512 mov instructions take
> > +          more bytes than AVX2/SSE2 mov instructions, and on the fact that
> > +          EVEX/AVX512 machines also have fast LEA_BID. Both set up END_REG to
> > +          avoid a complex address mode. For EVEX/AVX512 this saves code size
> > +          and keeps a few targets in one fetch block. For AVX2/SSE2 this helps
> > +          prevent AGU bottlenecks.  */
> >  #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
> >         /* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
> >         addq    %rdx, %END_REG
> > @@ -292,6 +299,15 @@ L(more_2x_vec):
> >         cmpq    $(VEC_SIZE * 4), %rdx
> >         jbe     L(last_2x_vec)
> >
> > +
> > +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> > +       /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
> > +          LEA_BID.  */
> > +
> > +       /* END_REG is rcx for EVEX/AVX512.  */
> > +       leaq    -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> > +#endif
> > +
> >         /* Store next 2x vec regardless.  */
> >         VMOVU   %VEC(0), (VEC_SIZE * 2)(%rax)
> >         VMOVU   %VEC(0), (VEC_SIZE * 3)(%rax)
> > @@ -355,65 +371,93 @@ L(stosb_local):
> >         /* Define L(less_vec) only if not otherwise defined.  */
> >         .p2align 4
> >  L(less_vec):
> > +       /* Broadcast esi to a partial register (i.e. VEC_SIZE == 32 broadcasts
> > +          to xmm). This only does anything for AVX2.  */
> > +       MEMSET_VDUP_TO_VEC0_LOW ()
> > +L(less_vec_no_vdup):
> >  #endif
> >  L(cross_page):
> >  #if VEC_SIZE > 32
> >         cmpl    $32, %edx
> > -       jae     L(between_32_63)
> > +       jge     L(between_32_63)
> >  #endif
> >  #if VEC_SIZE > 16
> >         cmpl    $16, %edx
> > -       jae     L(between_16_31)
> > +       jge     L(between_16_31)
> > +#endif
> > +#ifndef USE_XMM_LESS_VEC
> > +       MOVQ    %XMM0, %rcx
> >  #endif
> > -       MOVQ    %XMM0, %rdi
> >         cmpl    $8, %edx
> > -       jae     L(between_8_15)
> > +       jge     L(between_8_15)
> >         cmpl    $4, %edx
> > -       jae     L(between_4_7)
> > +       jge     L(between_4_7)
> >         cmpl    $1, %edx
> > -       ja      L(between_2_3)
> > -       jb      L(return)
> > -       movb    %sil, (%rax)
> > -       VZEROUPPER_RETURN
> > +       jg      L(between_2_3)
> > +       jl      L(between_0_0)
> > +       movb    %sil, (%LESS_VEC_REG)
> > +L(between_0_0):
> > +       ret
> >
> > -       /* Align small targets only if not doing so would cross a fetch
> > -          line.  */
> > +       /* Align small targets only if not doing so would cross a fetch line.
> > +        */
> >  #if VEC_SIZE > 32
> >         .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> >         /* From 32 to 63.  No branch when size == 32.  */
> >  L(between_32_63):
> > -       VMOVU   %YMM0, (%rax)
> > -       VMOVU   %YMM0, -32(%rax, %rdx)
> > +       VMOVU   %YMM0, (%LESS_VEC_REG)
> > +       VMOVU   %YMM0, -32(%LESS_VEC_REG, %rdx)
> >         VZEROUPPER_RETURN
> >  #endif
> >
> >  #if VEC_SIZE >= 32
> > -       .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> > +       .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
> >  L(between_16_31):
> >         /* From 16 to 31.  No branch when size == 16.  */
> > -       VMOVU   %XMM0, (%rax)
> > -       VMOVU   %XMM0, -16(%rax, %rdx)
> > -       VZEROUPPER_RETURN
> > +       VMOVU   %XMM0, (%LESS_VEC_REG)
> > +       VMOVU   %XMM0, -16(%LESS_VEC_REG, %rdx)
> > +       ret
> >  #endif
> >
> > -       .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> > +       /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> > +        */
> > +       .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
> >  L(between_8_15):
> >         /* From 8 to 15.  No branch when size == 8.  */
> > -       movq    %rdi, (%rax)
> > -       movq    %rdi, -8(%rax, %rdx)
> > -       VZEROUPPER_RETURN
> > +#ifdef USE_XMM_LESS_VEC
> > +       MOVQ    %XMM0, (%rdi)
> > +       MOVQ    %XMM0, -8(%rdi, %rdx)
> > +#else
> > +       movq    %rcx, (%LESS_VEC_REG)
> > +       movq    %rcx, -8(%LESS_VEC_REG, %rdx)
> > +#endif
> > +       ret
> >
> > -       .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
> > +       /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> > +        */
> > +       .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
> >  L(between_4_7):
> >         /* From 4 to 7.  No branch when size == 4.  */
> > -       movl    %edi, (%rax)
> > -       movl    %edi, -4(%rax, %rdx)
> > -       VZEROUPPER_RETURN
> > +#ifdef USE_XMM_LESS_VEC
> > +       MOVD    %XMM0, (%rdi)
> > +       MOVD    %XMM0, -4(%rdi, %rdx)
> > +#else
> > +       movl    %ecx, (%LESS_VEC_REG)
> > +       movl    %ecx, -4(%LESS_VEC_REG, %rdx)
> > +#endif
> > +       ret
> >
> > -       .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> > +       /* 4 * XMM_SMALL for the third mov for AVX2.  */
> > +       .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
> >  L(between_2_3):
> >         /* From 2 to 3.  No branch when size == 2.  */
> > -       movw    %di, (%rax)
> > -       movb    %dil, -1(%rax, %rdx)
> > -       VZEROUPPER_RETURN
> > +#ifdef USE_XMM_LESS_VEC
> > +       movb    %sil, (%rdi)
> > +       movb    %sil, 1(%rdi)
> > +       movb    %sil, -1(%rdi, %rdx)
> > +#else
> > +       movw    %cx, (%LESS_VEC_REG)
> > +       movb    %sil, -1(%LESS_VEC_REG, %rdx)
> > +#endif
> > +       ret
> >  END (MEMSET_SYMBOL (__memset, unaligned_erms))
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.

Thanks, pushed.
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 6+ messages in thread
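
To make the broadcast rework in the commit message above concrete, the
sketches below restate the three strategies as C intrinsics. They are
illustrations only (the function names are invented; compile with
-mssse3 or -mavx2 as appropriate), and the authoritative code is the
assembly in the patch.

  #include <immintrin.h>

  /* Old baseline sequence: widen the byte stepwise.  */
  static __m128i
  bcast_byte_punpck (int c)
  {
    __m128i v = _mm_cvtsi32_si128 (c);
    v = _mm_unpacklo_epi8 (v, v);    /* punpcklbw: byte -> word */
    v = _mm_unpacklo_epi16 (v, v);   /* punpcklwd: word -> dword */
    return _mm_shuffle_epi32 (v, 0); /* pshufd $0: dword -> all */
  }

  /* New sequence: one shuffle with an all-zero control vector; the
     pxor that produces the control is a zero idiom.  Note that pshufb
     (_mm_shuffle_epi8) is SSSE3 rather than plain SSE2.  */
  static __m128i
  bcast_byte_pshufb (int c)
  {
    __m128i v = _mm_cvtsi32_si128 (c);
    return _mm_shuffle_epi8 (v, _mm_setzero_si128 ());
  }

  /* AVX2 split: broadcast within xmm for the short path, and widen to
     ymm only when at least VEC_SIZE bytes will be stored.  VEX-encoded
     xmm operations zero the upper ymm bits, so the short path can
     return without vzeroupper.  */
  static __m128i
  bcast_low (__m128i v)
  {
    return _mm_broadcastb_epi8 (v);    /* vpbroadcastb %xmm0, %xmm0 */
  }

  static __m256i
  bcast_high (__m128i v)
  {
    return _mm256_broadcastb_epi8 (v); /* vpbroadcastb %xmm0, %ymm0 */
  }

In the patch this split is what the MEMSET_VDUP_TO_VEC0_LOW/HIGH macro
pair expresses: the lane-crossing HIGH broadcast is deferred until the
size check has ruled out the L(less_vec) path.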

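The "geomean of N = 20 benchset runs" aggregation quoted in the commit
message is the exponential of the mean of logs; a sketch of that
computation (not the actual benchtests scripts):

  #include <math.h>
  #include <stddef.h>

  static double
  geomean (const double *x, size_t n)
  {
    double sum = 0.0;         /* accumulate the log of each sample */
    for (size_t i = 0; i < n; i++)
      sum += log (x[i]);
    return exp (sum / n);     /* exp of the mean of the logs */
  }
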
* Re: [PATCH v2] x86: Improve vec generation in memset-vec-unaligned-erms.S
  2022-02-07  3:48     ` Noah Goldstein
@ 2022-05-04  5:46       ` Sunil Pandey
  0 siblings, 0 replies; 6+ messages in thread
From: Sunil Pandey @ 2022-05-04  5:46 UTC (permalink / raw)
  To: Noah Goldstein, Libc-stable Mailing List; +Cc: H.J. Lu, GNU C Library

On Sun, Feb 6, 2022 at 7:48 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Sun, Feb 6, 2022 at 10:29 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sat, Feb 5, 2022 at 10:54 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > [v2 patch and commit message snipped here; they are quoted in full
> > > in the previous message of this thread.]
> >
> > LGTM.
> >
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> >
> > Thanks.
>
> Thanks, pushed.
> >
> > --
> > H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2022-05-04  5:47 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-05 22:42 [PATCH v1] x86: Improve vec generation in memset-vec-unaligned-erms.S Noah Goldstein
2022-02-05 22:50 ` H.J. Lu
2022-02-06  6:54 ` [PATCH v2] " Noah Goldstein
2022-02-06 16:28   ` H.J. Lu
2022-02-07  3:48     ` Noah Goldstein
2022-05-04  5:46       ` Sunil Pandey

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).