* [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library @ 2022-06-03 4:42 Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein ` (7 more replies) 0 siblings, 8 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 4:42 UTC (permalink / raw) To: libc-alpha This patch does not touch any existing code and is only meant to be a tool for future patches so that simple source files can more easily be maintained to target multiple VEC classes. There is no difference in the objdump of libc.so before and after this patch. --- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 33 +++++++++ sysdeps/x86_64/multiarch/avx-vecs.h | 53 ++++++++++++++ sysdeps/x86_64/multiarch/avx2-rtm-vecs.h | 33 +++++++++ sysdeps/x86_64/multiarch/avx2-vecs.h | 30 ++++++++ sysdeps/x86_64/multiarch/evex256-vecs.h | 50 +++++++++++++ sysdeps/x86_64/multiarch/evex512-vecs.h | 49 +++++++++++++ sysdeps/x86_64/multiarch/sse2-vecs.h | 48 +++++++++++++ sysdeps/x86_64/multiarch/vec-macros.h | 90 ++++++++++++++++++++++++ 8 files changed, 386 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h create mode 100644 sysdeps/x86_64/multiarch/avx2-rtm-vecs.h create mode 100644 sysdeps/x86_64/multiarch/avx2-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h new file mode 100644 index 0000000000..c00b83ea0e --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -0,0 +1,33 @@ +/* Common config for AVX-RTM VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX_RTM_VECS_H +#define _AVX_RTM_VECS_H 1 + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#define USE_WITH_RTM 1 +#include "avx-vecs.h" + +#endif diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h new file mode 100644 index 0000000000..3b84d7e8b2 --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-vecs.h @@ -0,0 +1,53 @@ +/* Common config for AVX VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX_VECS_H +#define _AVX_VECS_H 1 + +#ifdef HAS_VEC +# error "Multiple VEC configs included!" +#endif + +#define HAS_VEC 1 +#include "vec-macros.h" + +#ifndef USE_WITH_AVX2 +# define USE_WITH_AVX 1 +#endif +/* Included by RTM version. */ +#ifndef SECTION +# define SECTION(p) p##.avx +#endif + +#define VEC_SIZE 32 +/* 4-byte mov instructions with AVX2. */ +#define MOV_SIZE 4 +/* 1 (ret) + 3 (vzeroupper). */ +#define RET_SIZE 4 +#define VZEROUPPER vzeroupper + +#define VMOVU vmovdqu +#define VMOVA vmovdqa +#define VMOVNT vmovntdq + +/* Often need to access xmm portion. */ +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h new file mode 100644 index 0000000000..a5d46e8c66 --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h @@ -0,0 +1,33 @@ +/* Common config for AVX2-RTM VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX2_RTM_VECS_H +#define _AVX2_RTM_VECS_H 1 + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#define USE_WITH_RTM 1 +#include "avx2-vecs.h" + +#endif diff --git a/sysdeps/x86_64/multiarch/avx2-vecs.h b/sysdeps/x86_64/multiarch/avx2-vecs.h new file mode 100644 index 0000000000..4c029b4621 --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx2-vecs.h @@ -0,0 +1,30 @@ +/* Common config for AVX2 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX2_VECS_H +#define _AVX2_VECS_H 1 + +#define USE_WITH_AVX2 1 +/* Included by RTM version. */ +#ifndef SECTION +# define SECTION(p) p##.avx +#endif +#include "avx-vecs.h" + +#endif diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h new file mode 100644 index 0000000000..ed7a32b0ec --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h @@ -0,0 +1,50 @@ +/* Common config for EVEX256 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX256_VECS_H +#define _EVEX256_VECS_H 1 + +#ifdef HAS_VEC +# error "Multiple VEC configs included!" +#endif + +#define HAS_VEC 1 +#include "vec-macros.h" + +#define USE_WITH_EVEX256 1 +#ifndef SECTION +# define SECTION(p) p##.evex +#endif + +#define VEC_SIZE 32 +/* 6-byte mov instructions with EVEX. */ +#define MOV_SIZE 6 +/* No vzeroupper needed. */ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU vmovdqu64 +#define VMOVA vmovdqa64 +#define VMOVNT vmovntdq + +/* Often need to access xmm portion. */ +#define VEC_xmm VEC_hi_xmm +#define VEC VEC_hi_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h new file mode 100644 index 0000000000..53597734fc --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h @@ -0,0 +1,49 @@ +/* Common config for EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX512_VECS_H +#define _EVEX512_VECS_H 1 + +#ifdef HAS_VEC +# error "Multiple VEC configs included!" +#endif + +#define HAS_VEC 1 +#include "vec-macros.h" + +#define USE_WITH_EVEX512 1 +#define SECTION(p) p##.evex512 + +#define VEC_SIZE 64 +/* 6-byte mov instructions with EVEX. */ +#define MOV_SIZE 6 +/* No vzeroupper needed. 
*/ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU vmovdqu64 +#define VMOVA vmovdqa64 +#define VMOVNT vmovntdq + +/* Often need to access xmm/ymm portion. */ +#define VEC_xmm VEC_hi_xmm +#define VEC_ymm VEC_hi_ymm +#define VEC VEC_hi_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h new file mode 100644 index 0000000000..b645b93e3d --- /dev/null +++ b/sysdeps/x86_64/multiarch/sse2-vecs.h @@ -0,0 +1,48 @@ +/* Common config for SSE2 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _SSE2_VECS_H +#define _SSE2_VECS_H 1 + +#ifdef HAS_VEC +# error "Multiple VEC configs included!" +#endif + +#define HAS_VEC 1 +#include "vec-macros.h" + +#define USE_WITH_SSE2 1 +#define SECTION(p) p + +#define VEC_SIZE 16 +/* 3-byte mov instructions with SSE2. */ +#define MOV_SIZE 3 +/* No vzeroupper needed. */ +#define RET_SIZE 1 + +#define VMOVU movups +#define VMOVA movaps +#define VMOVNT movntdq +#define VZEROUPPER + +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_xmm + + +#endif diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h new file mode 100644 index 0000000000..4dae4503c8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/vec-macros.h @@ -0,0 +1,90 @@ +/* Macro helpers for VEC_{type}({vec_num}) + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _VEC_MACROS_H +# define _VEC_MACROS_H 1 + +# ifndef HAS_VEC +# error "Never include this file directly. Always include a vector config." +# endif + +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same + VEC(N) values. 
*/ +#define VEC_hi_xmm0 xmm16 +#define VEC_hi_xmm1 xmm17 +#define VEC_hi_xmm2 xmm18 +#define VEC_hi_xmm3 xmm19 +#define VEC_hi_xmm4 xmm20 +#define VEC_hi_xmm5 xmm21 +#define VEC_hi_xmm6 xmm22 +#define VEC_hi_xmm7 xmm23 +#define VEC_hi_xmm8 xmm24 +#define VEC_hi_xmm9 xmm25 +#define VEC_hi_xmm10 xmm26 +#define VEC_hi_xmm11 xmm27 +#define VEC_hi_xmm12 xmm28 +#define VEC_hi_xmm13 xmm29 +#define VEC_hi_xmm14 xmm30 +#define VEC_hi_xmm15 xmm31 + +#define VEC_hi_ymm0 ymm16 +#define VEC_hi_ymm1 ymm17 +#define VEC_hi_ymm2 ymm18 +#define VEC_hi_ymm3 ymm19 +#define VEC_hi_ymm4 ymm20 +#define VEC_hi_ymm5 ymm21 +#define VEC_hi_ymm6 ymm22 +#define VEC_hi_ymm7 ymm23 +#define VEC_hi_ymm8 ymm24 +#define VEC_hi_ymm9 ymm25 +#define VEC_hi_ymm10 ymm26 +#define VEC_hi_ymm11 ymm27 +#define VEC_hi_ymm12 ymm28 +#define VEC_hi_ymm13 ymm29 +#define VEC_hi_ymm14 ymm30 +#define VEC_hi_ymm15 ymm31 + +#define VEC_hi_zmm0 zmm16 +#define VEC_hi_zmm1 zmm17 +#define VEC_hi_zmm2 zmm18 +#define VEC_hi_zmm3 zmm19 +#define VEC_hi_zmm4 zmm20 +#define VEC_hi_zmm5 zmm21 +#define VEC_hi_zmm6 zmm22 +#define VEC_hi_zmm7 zmm23 +#define VEC_hi_zmm8 zmm24 +#define VEC_hi_zmm9 zmm25 +#define VEC_hi_zmm10 zmm26 +#define VEC_hi_zmm11 zmm27 +#define VEC_hi_zmm12 zmm28 +#define VEC_hi_zmm13 zmm29 +#define VEC_hi_zmm14 zmm30 +#define VEC_hi_zmm15 zmm31 + +# define PRIMITIVE_VEC(vec, num) vec##num + +# define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) +# define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) +# define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) + +# define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) +# define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) +# define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) + +#endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
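[Illustration of the intended usage pattern -- a hedged sketch, not code from this series. The file name "copy-one-vec.S" and the function "copy_one_vec" are invented. The idea the headers enable: a generic source includes exactly one VEC class config and writes its vector code purely in terms of the class macros, so swapping the #include retargets it to another vector extension without source changes.

/* copy-one-vec.S: hypothetical demo built against the headers above.  */
#include <sysdep.h>
#include "evex256-vecs.h"	/* Or sse2-vecs.h, avx2-vecs.h, ...  */

	/* SECTION(.text) expands to .text.evex for this config.  */
	.section SECTION(.text), "ax", @progbits
/* void copy_one_vec (void *dst, const void *src) -- invented demo.  */
ENTRY (copy_one_vec)
	VMOVU	(%rsi), %VEC(0)	/* vmovdqu64 (%rsi), %ymm16 here.  */
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER		/* Expands to nothing for EVEX/SSE2.  */
	ret
END (copy_one_vec)

The same source built against sse2-vecs.h assembles to movups on %xmm0 in plain .text, with no source changes.]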
* [PATCH v1 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` 2022-06-03 4:42 [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein @ 2022-06-03 4:42 ` Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (4 more replies) 2022-06-03 4:42 ` [PATCH v1 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein ` (6 subsequent siblings) 7 siblings, 5 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 4:42 UTC (permalink / raw) To: libc-alpha The RTM vzeroupper mitigation has no way of replacing inline vzeroupper not before a return. This code does not change any existing functionality. There is no difference in the objdump of libc.so before and after this patch. --- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 1 + sysdeps/x86_64/multiarch/avx2-rtm-vecs.h | 1 + sysdeps/x86_64/sysdep.h | 16 ++++++++++++++++ 3 files changed, 18 insertions(+) diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h index c00b83ea0e..e954b8e1b0 100644 --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -20,6 +20,7 @@ #ifndef _AVX_RTM_VECS_H #define _AVX_RTM_VECS_H 1 +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h index a5d46e8c66..e20c3635a0 100644 --- a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h +++ b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h @@ -20,6 +20,7 @@ #ifndef _AVX2_RTM_VECS_H #define _AVX2_RTM_VECS_H 1 +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index f14d50786d..2cb31a558b 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -106,6 +106,22 @@ lose: \ vzeroupper; \ ret +/* Can be used to replace vzeroupper that is not directly before a + return. */ +#define COND_VZEROUPPER_XTEST \ + xtest; \ + jz 1f; \ + vzeroall; \ + jmp 2f; \ +1: \ + vzeroupper; \ +2: + +/* In RTM define this as COND_VZEROUPPER_XTEST. */ +#ifndef COND_VZEROUPPER +# define COND_VZEROUPPER vzeroupper +#endif + /* Zero upper vector registers and return. */ #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN # define ZERO_UPPER_VEC_REGISTERS_RETURN \ -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
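[Illustration of the call pattern this macro enables -- a hedged sketch, not code from this series; the function name and body are invented. A function can clear the upper vector state once, mid-body, and then fall into scalar tail code that ends in a plain ret; under the RTM configs COND_VZEROUPPER expands to the xtest-guarded sequence (vzeroall inside a transaction, vzeroupper otherwise), matching the existing ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST mitigation.

#include <sysdep.h>
#include "avx2-rtm-vecs.h"	/* Selects COND_VZEROUPPER_XTEST.  */

	.section SECTION(.text), "ax", @progbits
/* int demo_has_zero_byte (const void *p) -- invented: nonzero if any
   of the first 32 bytes at p is zero.  */
ENTRY (demo_has_zero_byte)
	vpxor	%xmm0, %xmm0, %xmm0	/* ymm0 = 0 (VEX zeroes high).  */
	vpcmpeqb (%rdi), %ymm0, %ymm1	/* Match zero bytes.  */
	vpmovmskb %ymm1, %eax
	/* Done with ymm registers; under RTM this is the xtest/vzeroall/
	   vzeroupper sequence, otherwise a bare vzeroupper.  */
	COND_VZEROUPPER
	/* Scalar tail; a plain ret is now safe.  */
	testl	%eax, %eax
	setnz	%al
	movzbl	%al, %eax
	ret
END (demo_has_zero_byte)]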
* [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library 2022-06-03 4:42 ` [PATCH v1 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein @ 2022-06-03 20:04 ` Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein ` (7 more replies) 2022-06-03 23:49 ` [PATCH v3 " Noah Goldstein ` (3 subsequent siblings) 4 siblings, 8 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 20:04 UTC (permalink / raw) To: libc-alpha This patch does not touch any existing code and is only meant to be a tool for future patches so that simple source files can more easily be maintained to target multiple VEC classes. There is no difference in the objdump of libc.so before and after this patch. --- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 33 +++++++++ sysdeps/x86_64/multiarch/avx-vecs.h | 53 ++++++++++++++ sysdeps/x86_64/multiarch/avx2-rtm-vecs.h | 33 +++++++++ sysdeps/x86_64/multiarch/avx2-vecs.h | 30 ++++++++ sysdeps/x86_64/multiarch/evex256-vecs.h | 50 +++++++++++++ sysdeps/x86_64/multiarch/evex512-vecs.h | 49 +++++++++++++ sysdeps/x86_64/multiarch/sse2-vecs.h | 48 +++++++++++++ sysdeps/x86_64/multiarch/vec-macros.h | 90 ++++++++++++++++++++++++ 8 files changed, 386 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h create mode 100644 sysdeps/x86_64/multiarch/avx2-rtm-vecs.h create mode 100644 sysdeps/x86_64/multiarch/avx2-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h new file mode 100644 index 0000000000..c00b83ea0e --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -0,0 +1,33 @@ +/* Common config for AVX-RTM VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX_RTM_VECS_H +#define _AVX_RTM_VECS_H 1 + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#define USE_WITH_RTM 1 +#include "avx-vecs.h" + +#endif diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h new file mode 100644 index 0000000000..3b84d7e8b2 --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-vecs.h @@ -0,0 +1,53 @@ +/* Common config for AVX VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX_VECS_H +#define _AVX_VECS_H 1 + +#ifdef HAS_VEC +# error "Multiple VEC configs included!" +#endif + +#define HAS_VEC 1 +#include "vec-macros.h" + +#ifndef USE_WITH_AVX2 +# define USE_WITH_AVX 1 +#endif +/* Included by RTM version. */ +#ifndef SECTION +# define SECTION(p) p##.avx +#endif + +#define VEC_SIZE 32 +/* 4-byte mov instructions with AVX2. */ +#define MOV_SIZE 4 +/* 1 (ret) + 3 (vzeroupper). */ +#define RET_SIZE 4 +#define VZEROUPPER vzeroupper + +#define VMOVU vmovdqu +#define VMOVA vmovdqa +#define VMOVNT vmovntdq + +/* Often need to access xmm portion. */ +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h new file mode 100644 index 0000000000..a5d46e8c66 --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h @@ -0,0 +1,33 @@ +/* Common config for AVX2-RTM VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX2_RTM_VECS_H +#define _AVX2_RTM_VECS_H 1 + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#define USE_WITH_RTM 1 +#include "avx2-vecs.h" + +#endif diff --git a/sysdeps/x86_64/multiarch/avx2-vecs.h b/sysdeps/x86_64/multiarch/avx2-vecs.h new file mode 100644 index 0000000000..4c029b4621 --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx2-vecs.h @@ -0,0 +1,30 @@ +/* Common config for AVX2 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX2_VECS_H +#define _AVX2_VECS_H 1 + +#define USE_WITH_AVX2 1 +/* Included by RTM version. */ +#ifndef SECTION +# define SECTION(p) p##.avx +#endif +#include "avx-vecs.h" + +#endif diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h new file mode 100644 index 0000000000..ed7a32b0ec --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h @@ -0,0 +1,50 @@ +/* Common config for EVEX256 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX256_VECS_H +#define _EVEX256_VECS_H 1 + +#ifdef HAS_VEC +# error "Multiple VEC configs included!" +#endif + +#define HAS_VEC 1 +#include "vec-macros.h" + +#define USE_WITH_EVEX256 1 +#ifndef SECTION +# define SECTION(p) p##.evex +#endif + +#define VEC_SIZE 32 +/* 6-byte mov instructions with EVEX. */ +#define MOV_SIZE 6 +/* No vzeroupper needed. */ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU vmovdqu64 +#define VMOVA vmovdqa64 +#define VMOVNT vmovntdq + +/* Often need to access xmm portion. */ +#define VEC_xmm VEC_hi_xmm +#define VEC VEC_hi_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h new file mode 100644 index 0000000000..53597734fc --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h @@ -0,0 +1,49 @@ +/* Common config for EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX512_VECS_H +#define _EVEX512_VECS_H 1 + +#ifdef HAS_VEC +# error "Multiple VEC configs included!" +#endif + +#define HAS_VEC 1 +#include "vec-macros.h" + +#define USE_WITH_EVEX512 1 +#define SECTION(p) p##.evex512 + +#define VEC_SIZE 64 +/* 6-byte mov instructions with EVEX. */ +#define MOV_SIZE 6 +/* No vzeroupper needed. 
*/ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU vmovdqu64 +#define VMOVA vmovdqa64 +#define VMOVNT vmovntdq + +/* Often need to access xmm/ymm portion. */ +#define VEC_xmm VEC_hi_xmm +#define VEC_ymm VEC_hi_ymm +#define VEC VEC_hi_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h new file mode 100644 index 0000000000..b645b93e3d --- /dev/null +++ b/sysdeps/x86_64/multiarch/sse2-vecs.h @@ -0,0 +1,48 @@ +/* Common config for SSE2 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _SSE2_VECS_H +#define _SSE2_VECS_H 1 + +#ifdef HAS_VEC +# error "Multiple VEC configs included!" +#endif + +#define HAS_VEC 1 +#include "vec-macros.h" + +#define USE_WITH_SSE2 1 +#define SECTION(p) p + +#define VEC_SIZE 16 +/* 3-byte mov instructions with SSE2. */ +#define MOV_SIZE 3 +/* No vzeroupper needed. */ +#define RET_SIZE 1 + +#define VMOVU movups +#define VMOVA movaps +#define VMOVNT movntdq +#define VZEROUPPER + +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_xmm + + +#endif diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h new file mode 100644 index 0000000000..4dae4503c8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/vec-macros.h @@ -0,0 +1,90 @@ +/* Macro helpers for VEC_{type}({vec_num}) + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _VEC_MACROS_H +# define _VEC_MACROS_H 1 + +# ifndef HAS_VEC +# error "Never include this file directly. Always include a vector config." +# endif + +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same + VEC(N) values. 
*/ +#define VEC_hi_xmm0 xmm16 +#define VEC_hi_xmm1 xmm17 +#define VEC_hi_xmm2 xmm18 +#define VEC_hi_xmm3 xmm19 +#define VEC_hi_xmm4 xmm20 +#define VEC_hi_xmm5 xmm21 +#define VEC_hi_xmm6 xmm22 +#define VEC_hi_xmm7 xmm23 +#define VEC_hi_xmm8 xmm24 +#define VEC_hi_xmm9 xmm25 +#define VEC_hi_xmm10 xmm26 +#define VEC_hi_xmm11 xmm27 +#define VEC_hi_xmm12 xmm28 +#define VEC_hi_xmm13 xmm29 +#define VEC_hi_xmm14 xmm30 +#define VEC_hi_xmm15 xmm31 + +#define VEC_hi_ymm0 ymm16 +#define VEC_hi_ymm1 ymm17 +#define VEC_hi_ymm2 ymm18 +#define VEC_hi_ymm3 ymm19 +#define VEC_hi_ymm4 ymm20 +#define VEC_hi_ymm5 ymm21 +#define VEC_hi_ymm6 ymm22 +#define VEC_hi_ymm7 ymm23 +#define VEC_hi_ymm8 ymm24 +#define VEC_hi_ymm9 ymm25 +#define VEC_hi_ymm10 ymm26 +#define VEC_hi_ymm11 ymm27 +#define VEC_hi_ymm12 ymm28 +#define VEC_hi_ymm13 ymm29 +#define VEC_hi_ymm14 ymm30 +#define VEC_hi_ymm15 ymm31 + +#define VEC_hi_zmm0 zmm16 +#define VEC_hi_zmm1 zmm17 +#define VEC_hi_zmm2 zmm18 +#define VEC_hi_zmm3 zmm19 +#define VEC_hi_zmm4 zmm20 +#define VEC_hi_zmm5 zmm21 +#define VEC_hi_zmm6 zmm22 +#define VEC_hi_zmm7 zmm23 +#define VEC_hi_zmm8 zmm24 +#define VEC_hi_zmm9 zmm25 +#define VEC_hi_zmm10 zmm26 +#define VEC_hi_zmm11 zmm27 +#define VEC_hi_zmm12 zmm28 +#define VEC_hi_zmm13 zmm29 +#define VEC_hi_zmm14 zmm30 +#define VEC_hi_zmm15 zmm31 + +# define PRIMITIVE_VEC(vec, num) vec##num + +# define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) +# define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) +# define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) + +# define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) +# define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) +# define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) + +#endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH v2 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` 2022-06-03 20:04 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein @ 2022-06-03 20:04 ` Noah Goldstein 2022-06-03 23:12 ` H.J. Lu 2022-06-03 20:04 ` [PATCH v2 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein ` (6 subsequent siblings) 7 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 20:04 UTC (permalink / raw) To: libc-alpha The RTM vzeroupper mitigation has no way of replacing inline vzeroupper not before a return. This code does not change any existing functionality. There is no difference in the objdump of libc.so before and after this patch. --- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 1 + sysdeps/x86_64/multiarch/avx2-rtm-vecs.h | 1 + sysdeps/x86_64/sysdep.h | 16 ++++++++++++++++ 3 files changed, 18 insertions(+) diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h index c00b83ea0e..e954b8e1b0 100644 --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -20,6 +20,7 @@ #ifndef _AVX_RTM_VECS_H #define _AVX_RTM_VECS_H 1 +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h index a5d46e8c66..e20c3635a0 100644 --- a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h +++ b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h @@ -20,6 +20,7 @@ #ifndef _AVX2_RTM_VECS_H #define _AVX2_RTM_VECS_H 1 +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index f14d50786d..2cb31a558b 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -106,6 +106,22 @@ lose: \ vzeroupper; \ ret +/* Can be used to replace vzeroupper that is not directly before a + return. */ +#define COND_VZEROUPPER_XTEST \ + xtest; \ + jz 1f; \ + vzeroall; \ + jmp 2f; \ +1: \ + vzeroupper; \ +2: + +/* In RTM define this as COND_VZEROUPPER_XTEST. */ +#ifndef COND_VZEROUPPER +# define COND_VZEROUPPER vzeroupper +#endif + /* Zero upper vector registers and return. */ #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN # define ZERO_UPPER_VEC_REGISTERS_RETURN \ -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH v2 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` 2022-06-03 20:04 ` [PATCH v2 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein @ 2022-06-03 23:12 ` H.J. Lu 2022-06-03 23:33 ` Noah Goldstein 0 siblings, 1 reply; 82+ messages in thread From: H.J. Lu @ 2022-06-03 23:12 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Fri, Jun 3, 2022 at 1:04 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The RTM vzeroupper mitigation has no way of replacing inline > vzeroupper not before a return. > > This code does not change any existing functionality. > > There is no difference in the objdump of libc.so before and after this > patch. > --- > sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 1 + > sysdeps/x86_64/multiarch/avx2-rtm-vecs.h | 1 + > sysdeps/x86_64/sysdep.h | 16 ++++++++++++++++ > 3 files changed, 18 insertions(+) > > diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > index c00b83ea0e..e954b8e1b0 100644 > --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > @@ -20,6 +20,7 @@ > #ifndef _AVX_RTM_VECS_H > #define _AVX_RTM_VECS_H 1 > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > diff --git a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > index a5d46e8c66..e20c3635a0 100644 > --- a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > +++ b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > @@ -20,6 +20,7 @@ > #ifndef _AVX2_RTM_VECS_H > #define _AVX2_RTM_VECS_H 1 > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h > index f14d50786d..2cb31a558b 100644 > --- a/sysdeps/x86_64/sysdep.h > +++ b/sysdeps/x86_64/sysdep.h > @@ -106,6 +106,22 @@ lose: \ > vzeroupper; \ > ret > > +/* Can be used to replace vzeroupper that is not directly before a > + return. */ > +#define COND_VZEROUPPER_XTEST \ > + xtest; \ > + jz 1f; \ > + vzeroall; \ > + jmp 2f; \ > +1: \ > + vzeroupper; \ > +2: Will "ret" always be after "2:"? > +/* In RTM define this as COND_VZEROUPPER_XTEST. */ > +#ifndef COND_VZEROUPPER > +# define COND_VZEROUPPER vzeroupper > +#endif > + > /* Zero upper vector registers and return. */ > #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN > # define ZERO_UPPER_VEC_REGISTERS_RETURN \ > -- > 2.34.1 > -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH v2 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` 2022-06-03 23:12 ` H.J. Lu @ 2022-06-03 23:33 ` Noah Goldstein 0 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 23:33 UTC (permalink / raw) To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell On Fri, Jun 3, 2022 at 6:12 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Fri, Jun 3, 2022 at 1:04 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > The RTM vzeroupper mitigation has no way of replacing inline > > vzeroupper not before a return. > > > > This code does not change any existing functionality. > > > > There is no difference in the objdump of libc.so before and after this > > patch. > > --- > > sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 1 + > > sysdeps/x86_64/multiarch/avx2-rtm-vecs.h | 1 + > > sysdeps/x86_64/sysdep.h | 16 ++++++++++++++++ > > 3 files changed, 18 insertions(+) > > > > diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > index c00b83ea0e..e954b8e1b0 100644 > > --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > @@ -20,6 +20,7 @@ > > #ifndef _AVX_RTM_VECS_H > > #define _AVX_RTM_VECS_H 1 > > > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > > > diff --git a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > > index a5d46e8c66..e20c3635a0 100644 > > --- a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > > +++ b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > > @@ -20,6 +20,7 @@ > > #ifndef _AVX2_RTM_VECS_H > > #define _AVX2_RTM_VECS_H 1 > > > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > > > diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h > > index f14d50786d..2cb31a558b 100644 > > --- a/sysdeps/x86_64/sysdep.h > > +++ b/sysdeps/x86_64/sysdep.h > > @@ -106,6 +106,22 @@ lose: \ > > vzeroupper; \ > > ret > > > > +/* Can be used to replace vzeroupper that is not directly before a > > + return. */ > > +#define COND_VZEROUPPER_XTEST \ > > + xtest; \ > > + jz 1f; \ > > + vzeroall; \ > > + jmp 2f; \ > > +1: \ > > + vzeroupper; \ > > +2: > > Will "ret" always be after "2:"? At some point but not immediately afterwards. For example: L(zero): xorl %eax, %eax VZEROUPPER_RETURN L(check): tzcntl %eax, %eax cmpl %eax, %edx jle L(zero) addq %rdi, %rax VZEROUPPER_RETURN Can become: L(zero): xorl %eax, %eax ret L(check): tzcntl %eax, %eax COND_VZEROUPPER cmpl %eax, %edx jle L(zero) addq %rdi, %rax ret Which saves code size. > > > +/* In RTM define this as COND_VZEROUPPER_XTEST. */ > > +#ifndef COND_VZEROUPPER > > +# define COND_VZEROUPPER vzeroupper > > +#endif > > + > > /* Zero upper vector registers and return. */ > > #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN > > # define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > -- > > 2.34.1 > > > > > -- > H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH v2 3/8] Benchtests: Improve memrchr benchmarks 2022-06-03 20:04 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein @ 2022-06-03 20:04 ` Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein ` (5 subsequent siblings) 7 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 20:04 UTC (permalink / raw) To: libc-alpha Add a second iteration for memrchr to set `pos` starting from the end of the buffer. Previously `pos` was only set relative to the beginning of the buffer. This isn't really useful for memrchr because the beginning of the search space is (buf + len). --- benchtests/bench-memchr.c | 110 ++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 45 deletions(-) diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c index 4d7212332f..0facda2fa0 100644 --- a/benchtests/bench-memchr.c +++ b/benchtests/bench-memchr.c @@ -76,7 +76,7 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c, static void do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, - int seek_char) + int seek_char, int invert_pos) { size_t i; @@ -96,7 +96,10 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, if (pos < len) { - buf[align + pos] = seek_char; + if (invert_pos) + buf[align + len - pos] = seek_char; + else + buf[align + pos] = seek_char; buf[align + len] = -seek_char; } else @@ -109,6 +112,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, json_attr_uint (json_ctx, "pos", pos); json_attr_uint (json_ctx, "len", len); json_attr_uint (json_ctx, "seek_char", seek_char); + json_attr_uint (json_ctx, "invert_pos", invert_pos); json_array_begin (json_ctx, "timings"); @@ -123,6 +127,7 @@ int test_main (void) { size_t i; + int repeats; json_ctx_t json_ctx; test_init (); @@ -142,53 +147,68 @@ test_main (void) json_array_begin (&json_ctx, "results"); - for (i = 1; i < 8; ++i) + for (repeats = 0; repeats < 2; ++repeats) { - do_test (&json_ctx, 0, 16 << i, 2048, 23); - do_test (&json_ctx, i, 64, 256, 23); - do_test (&json_ctx, 0, 16 << i, 2048, 0); - do_test (&json_ctx, i, 64, 256, 0); - - do_test (&json_ctx, getpagesize () - 15, 64, 256, 0); + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats); + do_test (&json_ctx, i, 64, 256, 23, repeats); + do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats); + do_test (&json_ctx, i, 64, 256, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats); #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. */ - do_test (&json_ctx, 0, i, 256, 23); - do_test (&json_ctx, 0, i, 256, 0); - do_test (&json_ctx, i, i, 256, 23); - do_test (&json_ctx, i, i, 256, 0); + /* Also test the position close to the beginning for memrchr.
*/ + do_test (&json_ctx, 0, i, 256, 23, repeats); + do_test (&json_ctx, 0, i, 256, 0, repeats); + do_test (&json_ctx, i, i, 256, 23, repeats); + do_test (&json_ctx, i, i, 256, 0, repeats); #endif - } - for (i = 1; i < 8; ++i) - { - do_test (&json_ctx, i, i << 5, 192, 23); - do_test (&json_ctx, i, i << 5, 192, 0); - do_test (&json_ctx, i, i << 5, 256, 23); - do_test (&json_ctx, i, i << 5, 256, 0); - do_test (&json_ctx, i, i << 5, 512, 23); - do_test (&json_ctx, i, i << 5, 512, 0); - - do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23); - } - for (i = 1; i < 32; ++i) - { - do_test (&json_ctx, 0, i, i + 1, 23); - do_test (&json_ctx, 0, i, i + 1, 0); - do_test (&json_ctx, i, i, i + 1, 23); - do_test (&json_ctx, i, i, i + 1, 0); - do_test (&json_ctx, 0, i, i - 1, 23); - do_test (&json_ctx, 0, i, i - 1, 0); - do_test (&json_ctx, i, i, i - 1, 23); - do_test (&json_ctx, i, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0); + } + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, i, i << 5, 192, 23, repeats); + do_test (&json_ctx, i, i << 5, 192, 0, repeats); + do_test (&json_ctx, i, i << 5, 256, 23, repeats); + do_test (&json_ctx, i, i << 5, 256, 0, repeats); + do_test (&json_ctx, i, i << 5, 512, 23, repeats); + do_test (&json_ctx, i, i << 5, 512, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats); + } + for (i = 1; i < 32; ++i) + { + do_test (&json_ctx, 0, i, i + 1, 23, repeats); + do_test (&json_ctx, 0, i, i + 1, 0, repeats); + do_test (&json_ctx, i, i, i + 1, 23, repeats); + do_test (&json_ctx, i, i, i + 1, 0, repeats); + do_test (&json_ctx, 0, i, i - 1, 23, repeats); + do_test (&json_ctx, 0, i, i - 1, 0, repeats); + do_test (&json_ctx, i, i, i - 1, 23, repeats); + do_test (&json_ctx, i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0, repeats); + #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. */ - do_test (&json_ctx, 0, 1, i + 1, 23); - do_test (&json_ctx, 0, 2, i + 1, 0); + do_test (&json_ctx, 0, 1, i + 1, 23, repeats); + do_test (&json_ctx, 0, 2, i + 1, 0, repeats); +#endif + } +#ifndef USE_AS_MEMRCHR + break; #endif } -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
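[Standalone C illustration of the new parameter -- not part of the benchmark harness; the buffer size and values are invented. It mirrors the invert_pos placement logic in do_test above: invert_pos == 1 places the seek character relative to the end of the buffer, which for memrchr (scanning backwards from buf + len) puts the match in the first vector loaded, a hot path the old harness never measured.

#define _GNU_SOURCE	/* For memrchr.  */
#include <assert.h>
#include <string.h>

int
main (void)
{
  char buf[512];
  const size_t align = 0, pos = 16, len = 256;
  for (int invert_pos = 0; invert_pos < 2; ++invert_pos)
    {
      memset (buf, 1, sizeof buf);
      if (invert_pos)
	buf[align + len - pos] = 23;	/* 16 bytes from the end.  */
      else
	buf[align + pos] = 23;		/* 16 bytes from the start.  */
      assert (memrchr (buf + align, 23, len)
	      == (invert_pos ? buf + align + len - pos
			     : buf + align + pos));
    }
  return 0;
}]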
* [PATCH v2 4/8] x86: Optimize memrchr-sse2.S 2022-06-03 20:04 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (2 preceding siblings ...) 2022-06-03 20:04 ` [PATCH v2 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein @ 2022-06-03 20:04 ` Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 5/8] x86: Optimize memrchr-evex.S Noah Goldstein ` (4 subsequent siblings) 7 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 20:04 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller lengths more. 2. optimizes target placement more carefully. 3. reuses logic more. 4. fixes up various inefficiencies in the logic. The total code size saving is: 394 bytes Geometric Mean of all benchmarks New / Old: 0.874 Regressions: 1. The page cross case is now colder, especially re-entry from the page cross case if a match is not found in the first VEC (roughly 50%). My general opinion with this patch is this is acceptable given the "coldness" of this case (less than 4%) and the general performance improvement in the other, far more common cases. 2. There are some regressions of 5-15% for medium/large user-arg lengths that have a match in the first VEC. This is because the logic was rewritten to optimize finds in the first VEC if the user-arg length is shorter (where we see roughly 20-50% performance improvements). It is not always the case that this is a regression. My intuition is that some frontend quirk partially explains the data, although I haven't been able to find the root cause. Full xcheck passes on x86_64. --- sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++-------------------- 1 file changed, 292 insertions(+), 321 deletions(-) diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index d1a9f47911..b0dffd2ae2 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -18,362 +18,333 @@ <https://www.gnu.org/licenses/>. */ #include <sysdep.h> +#define VEC_SIZE 16 +#define PAGE_SIZE 4096 .text -ENTRY (__memrchr) - movd %esi, %xmm1 - - sub $16, %RDX_LP - jbe L(length_less16) - - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add %RDX_LP, %RDI_LP - pshufd $0, %xmm1, %xmm1 - - movdqu (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - -/* Check if there is a match.
*/ - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - mov %edi, %ecx - and $15, %ecx - jz L(loop_prolog) - - add $16, %rdi - add $16, %rdx - and $-16, %rdi - sub %rcx, %rdx - - .p2align 4 -L(loop_prolog): - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches0) - - mov %edi, %ecx - and $63, %ecx - jz L(align64_loop) - - add $64, %rdi - add $64, %rdx - and $-64, %rdi - sub %rcx, %rdx - - .p2align 4 -L(align64_loop): - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm3, %xmm0 - pmaxub %xmm4, %xmm2 - pmaxub %xmm0, %xmm2 - pmovmskb %xmm2, %eax - - test %eax, %eax - jz L(align64_loop) - - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches48) - - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm2 - - pcmpeqb %xmm1, %xmm2 - pcmpeqb (%rdi), %xmm1 - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - pmovmskb %xmm1, %eax - bsr %eax, %eax - - add %rdi, %rax +ENTRY_P2ALIGN(__memrchr, 6) +#ifdef __ILP32__ + /* Clear upper bits. */ + mov %RDX_LP, %RDX_LP +#endif + movd %esi, %xmm0 + + /* Get end pointer. */ + leaq (%rdx, %rdi), %rcx + + punpcklbw %xmm0, %xmm0 + punpcklwd %xmm0, %xmm0 + pshufd $0, %xmm0, %xmm0 + + /* Check if we can load 1x VEC without crossing a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %ecx + jz L(page_cross) + + /* NB: This load happens regardless of whether rdx (len) is zero. Since + it doesn't cross a page and the standard guarantees any pointer has + at least one valid byte, this load must be safe. For the entire + history of the x86 memrchr implementation this has been possible, so + no code "should" be relying on a zero-length check before this load. + The zero-length check is moved to the page cross case because it is + 1) pretty cold and 2) including it pushes the hot case len <= VEC_SIZE + onto 2 cache lines. */ + movups -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is + zero. */ + bsrl %eax, %eax + jz L(ret_0) + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here + if out of bounds. */ + addl %edx, %eax + jl L(zero_0) + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base + ptr.
*/ + addq %rdi, %rax +L(ret_0): ret - .p2align 4 -L(exit_loop): - add $64, %edx - cmp $32, %edx - jbe L(exit_loop_32) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16_1) - cmp $48, %edx - jbe L(return_null) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches0_1) - xor %eax, %eax + .p2align 4,, 5 +L(ret_vec_x0): + bsrl %eax, %eax + leaq -(VEC_SIZE)(%rcx, %rax), %rax ret - .p2align 4 -L(exit_loop_32): - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48_1) - cmp $16, %edx - jbe L(return_null) - - pcmpeqb 32(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches32_1) - xor %eax, %eax + .p2align 4,, 2 +L(zero_0): + xorl %eax, %eax ret - .p2align 4 -L(matches0): - bsr %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax - ret - .p2align 4 -L(matches32): - bsr %eax, %eax - lea 32(%rax, %rdi), %rax + .p2align 4,, 8 +L(more_1x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) + + /* Align rcx (pointer to string). */ + decq %rcx + andq $-VEC_SIZE, %rcx + + movq %rcx, %rdx + /* NB: We could consistently save 1 byte in this pattern with `movaps + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is + that it adds more frontend uops (even if the moves can be eliminated) and + some percentage of the time actual backend uops. */ + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + subq %rdi, %rdx + pmovmskb %xmm1, %eax + + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +L(last_2x_vec): + subl $VEC_SIZE, %edx + jbe L(ret_vec_x0_test) + + testl %eax, %eax + jnz L(ret_vec_x0) + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_1) + addl %edx, %eax + jl L(zero_0) + addq %rdi, %rax +L(ret_1): ret + /* Don't align. Otherwise losing the 2-byte encoding in the jump to L(page_cross) + causes the hot path (length <= VEC_SIZE) to span multiple cache + lines. Naturally aligned % 16 to 8 bytes. */ +L(page_cross): + /* Zero length check. */ + testq %rdx, %rdx + jz L(zero_0) + + leaq -1(%rcx), %r8 + andq $-(VEC_SIZE), %r8 + + movaps (%r8), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + negl %ecx + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count + explicitly. */ + andl $(VEC_SIZE - 1), %ecx + shl %cl, %esi + movzwl %si, %eax + leaq (%rdi, %rdx), %rcx + cmpq %rdi, %r8 + ja L(more_1x_vec) + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_2) + addl %edx, %eax + jl L(zero_1) + addq %rdi, %rax +L(ret_2): ret + /* Fits in the aligning bytes.
*/ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(matches16_1): - bsr %eax, %eax - sub $48, %rdx - add %rax, %rdx - jl L(return_null) - lea 16(%rdi, %rax), %rax + .p2align 4,, 5 +L(ret_vec_x1): + bsrl %eax, %eax + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(matches32_1): - bsr %eax, %eax - sub $32, %rdx - add %rax, %rdx - jl L(return_null) - lea 32(%rdi, %rax), %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) - .p2align 4 -L(matches48_1): - bsr %eax, %eax - sub $16, %rdx - add %rax, %rdx - jl L(return_null) - lea 48(%rdi, %rax), %rax - ret + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + testl %eax, %eax + jnz L(ret_vec_x1) - .p2align 4 -L(return_null): - xor %eax, %eax - ret - .p2align 4 -L(length_less16_offset0): - test %edx, %edx - jz L(return_null) + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - mov %dl, %cl - pcmpeqb (%rdi), %xmm1 + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) - mov $1, %edx - sal %cl, %edx - sub $1, %edx + addl $(VEC_SIZE), %edx + jle L(ret_vec_x2_test) - pmovmskb %xmm1, %eax +L(last_vec): + testl %eax, %eax + jnz L(ret_vec_x2) - and %edx, %eax - test %eax, %eax - jz L(return_null) + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - bsr %eax, %eax - add %rdi, %rax + subl $(VEC_SIZE), %edx + bsrl %eax, %eax + jz L(ret_3) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax +L(ret_3): ret - .p2align 4 -L(length_less16): - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add $16, %edx - - pshufd $0, %xmm1, %xmm1 - - mov %edi, %ecx - and $15, %ecx - jz L(length_less16_offset0) - - mov %cl, %dh - mov %ecx, %esi - add %dl, %dh - and $-16, %rdi - - sub $16, %dh - ja L(length_less16_part2) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - - sar %cl, %eax - mov %dl, %cl - - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax - test %eax, %eax - jz L(return_null) - - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 6 +L(ret_vec_x2_test): + bsrl %eax, %eax + jz L(zero_2) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax ret - .p2align 4 -L(length_less16_part2): - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - - mov %dh, %cl - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax +L(zero_2): + xorl %eax, %eax + ret - test %eax, %eax - jnz L(length_less16_part2_return) - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax + .p2align 4,, 5 +L(ret_vec_x2): + bsrl %eax, %eax + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - mov %esi, %ecx - sar %cl, %eax - test %eax, %eax - jz L(return_null) + .p2align 4,, 5 +L(ret_vec_x3): + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 8 +L(more_4x_vec): + testl %eax, %eax + jnz L(ret_vec_x2) + + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_x3) + + addq $-(VEC_SIZE * 4), %rcx + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) + + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end + keeping the code from spilling to the next cache line. 
*/ + addq $(VEC_SIZE * 4 - 1), %rcx + andq $-(VEC_SIZE * 4), %rcx + leaq (VEC_SIZE * 4)(%rdi), %rdx + andq $-(VEC_SIZE * 4), %rdx + + .p2align 4,, 11 +L(loop_4x_vec): + movaps (VEC_SIZE * -1)(%rcx), %xmm1 + movaps (VEC_SIZE * -2)(%rcx), %xmm2 + movaps (VEC_SIZE * -3)(%rcx), %xmm3 + movaps (VEC_SIZE * -4)(%rcx), %xmm4 + pcmpeqb %xmm0, %xmm1 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm0, %xmm4 + + por %xmm1, %xmm2 + por %xmm3, %xmm4 + por %xmm2, %xmm4 + + pmovmskb %xmm4, %esi + testl %esi, %esi + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rcx + cmpq %rdx, %rcx + jne L(loop_4x_vec) + + subl %edi, %edx + + /* Ends up being 1-byte nop. */ + .p2align 4,, 2 +L(last_4x_vec): + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + testl %eax, %eax + jnz L(ret_vec_x0) + + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_end) + + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $(VEC_SIZE * 3), %edx + ja L(last_vec) + bsrl %eax, %eax + jz L(ret_4) + addl %edx, %eax + jl L(zero_3) + addq %rdi, %rax +L(ret_4): ret - .p2align 4 -L(length_less16_part2_return): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax + /* Ends up being 1-byte nop. */ + .p2align 4,, 3 +L(loop_end): + pmovmskb %xmm1, %eax + sall $16, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm2, %eax + testl %eax, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm3, %eax + /* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If eax is non-zero + then there is a CHAR in VEC3 and bsrl will use that position. */ + sall $16, %eax + orl %esi, %eax + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax ret -END (__memrchr) +L(ret_vec_end): + bsrl %eax, %eax + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax + ret + /* Used in L(last_4x_vec). In the same cache line. These are just spare + aligning bytes. */ +L(zero_3): + xorl %eax, %eax + ret + /* 2 bytes from next cache line. */ +END(__memrchr) weak_alias (__memrchr, memrchr) -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
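A minimal C sketch of the L(page_cross) mask trick above, for readers following along. This is illustrative only: the function name and scaffolding are this note's assumptions, not part of the patch, VEC_SIZE is fixed at 16, and zero length is assumed to be handled beforehand, as the assembly does.

    #include <emmintrin.h>  /* SSE2 */
    #include <stdint.h>

    /* endptr points one past the last byte to search.  The aligned load
       cannot fault: it stays inside the page containing endptr - 1.  */
    static unsigned int
    page_cross_mask (const unsigned char *endptr, __m128i match)
    {
      const __m128i *aligned
        = (const __m128i *) ((uintptr_t) (endptr - 1) & ~(uintptr_t) 15);
      unsigned int raw
        = _mm_movemask_epi8 (_mm_cmpeq_epi8 (_mm_load_si128 (aligned),
                                             match));
      /* negl %ecx; andl $(VEC_SIZE - 1), %ecx: the count of in-vector
         bytes at or past endptr.  Shifting left drops exactly those
         bytes; the movzwl in the assembly is the cast back to 16 bits
         here.  */
      unsigned int shift = (unsigned int) (-(uintptr_t) endptr) & 15;
      return (unsigned short) (raw << shift);
    }

A non-zero result feeds bsr exactly as an in-bounds vector's mask would, which is why the code can jump straight back into L(more_1x_vec) afterwards.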
* [PATCH v2 5/8] x86: Optimize memrchr-evex.S 2022-06-03 20:04 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (2 preceding siblings ...) 2022-06-03 20:04 ` [PATCH v2 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein @ 2022-06-03 20:04 ` Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein ` (3 subsequent siblings) 7 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 20:04 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller user-arg lengths more. 2. optimizes target placement more carefully. 3. reuses logic more. 4. fixes up various inefficiencies in the logic. The biggest case here is the `lzcnt` logic for checking returns, which saves either a branch or multiple instructions. The total code size saving is: 263 bytes Geometric Mean of all benchmarks New / Old: 0.755 Regressions: There are some regressions. Particularly where the length (user arg length) is large but the position of the match char is near the beginning of the string (in first VEC). This case has roughly a 20% regression. This is because the new logic gives the hot path for immediate matches to shorter lengths (the more common input). The shorter-length cases see roughly a 35% speedup. Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++------------ 1 file changed, 268 insertions(+), 271 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S index 0b99709c6b..ad541c0e50 100644 --- a/sysdeps/x86_64/multiarch/memrchr-evex.S +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S @@ -19,319 +19,316 @@ #if IS_IN (libc) # include <sysdep.h> +# include "evex256-vecs.h" +# if VEC_SIZE != 32 +# error "VEC_SIZE != 32 unimplemented" +# endif + +# ifndef MEMRCHR +# define MEMRCHR __memrchr_evex +# endif + +# define PAGE_SIZE 4096 +# define VECMATCH VEC(0) + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN(MEMRCHR, 6) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) + + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up the end ptr so + the match position can be computed by subtracting lzcnt from it. */ + leaq -1(%rdi, %rdx), %rax + vpbroadcastb %esi, %VECMATCH + + /* Check if we can load 1x VEC without crossing a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + + /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + + /* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE), which + guarantees edx (len) is no greater than it. */ + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret -# define VMOVA vmovdqa64 - -# define YMMMATCH ymm16 - -# define VEC_SIZE 32 - - .section .text.evex,"ax",@progbits -ENTRY (__memrchr_evex) - /* Broadcast CHAR to YMMMATCH. */ - vpbroadcastb %esi, %YMMMATCH - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP - - /* Check the last VEC_SIZE bytes. 
*/ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx - - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 - kord %k1, %k2, %k5 - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 - - kord %k3, %k4, %k6 - kortestd %k5, %k6 - jz L(loop_4x_vec) - - /* There is a match. */ - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - kmovd %k1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0_dec): + decq %rax +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax ret - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) + /* Align rax (pointer to string). */ + andq $-VEC_SIZE, %rax - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) + /* Recompute length after aligning. */ + movq %rax, %rdx - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - ret + subq %rdi, %rdx - .p2align 4 + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) + + /* Must dec rax because L(ret_vec_x0_test) expects it. 
*/ + decq %rax cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which in turn is necessary for the hot path (len <= VEC_SIZE) to fit + in the first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpb $0, (%rsi), %VECMATCH, %k0 + kmovd %k0, %r8d + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %ecx + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ + notl %ecx + shlxl %ecx, %r8d, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_1) + subq %rcx, %rax + ret + /* Continue creating zero labels that fit in aligning bytes and get + 2-byte encoding / are in the same cache line as the condition. */ +L(zero_1): + xorl %eax, %eax + ret + .p2align 4,, 8 +L(ret_vec_x1): + /* This will naturally add 32 to position. */ + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax + ret + .p2align 4,, 8 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax + vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - ret - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret - .p2align 4 -L(zero): - xorl %eax, %eax + .p2align 4,, 8 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx - - kmovd %k1, %eax - - /* Remove the trailing bytes. 
*/ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 8 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx - - /* Check for zero length. */ - testl %edx, %edx - jz L(zero) - - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) - - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + .p2align 4,, 8 +L(ret_vec_x2): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + .p2align 4,, 8 +L(ret_vec_x3): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - /* Check the last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax + .p2align 4,, 8 +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + testl %ecx, %ecx + jnz L(ret_vec_x3) - andl %edx, %eax - testl %eax, %eax - jz L(zero) + /* Check if near end before re-aligning (otherwise might do an + unnecessary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - ret + decq %rax + andq $-(VEC_SIZE * 4), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + andq $-(VEC_SIZE * 4), %rdx .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx - - /* Check the last VEC. */ - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +L(loop_4x_vec): + /* Store 1 where not-equals and 0 where equals in k1 (used to mask later + on). */ + vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1 + + /* VEC(2/3) will have zero-byte where we found a CHAR. */ + vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2) + vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3) + vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4 + + /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where + CHAR is found and VEC(2/3) have zero-byte where CHAR is found). */ + vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z} + vptestnmb %VEC(3), %VEC(3), %k2 + + /* Any 1s and we found CHAR. */ + kortestd %k2, %k4 + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) + + /* Need to re-adjust rdx / rax for L(last_4x_vec). */ + subq $-(VEC_SIZE * 4), %rdx + movq %rdx, %rax + subl %edi, %edx +L(last_4x_vec): + + /* Used no matter what. */ + vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - kmovd %k1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jnz L(last_vec_x1) + vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - /* Check the second last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 + testl %ecx, %ecx + jnz L(ret_vec_x1) - movl %r8d, %ecx + /* Used no matter what. */ + vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - kmovd %k1, %eax + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. 
*/ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret_1) + xorl %eax, %eax +L(ret_1): + ret - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 6 +L(loop_end): + kmovd %k1, %ecx + notl %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vptestnmb %VEC(2), %VEC(2), %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + kmovd %k2, %ecx + kmovd %k4, %esi + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + addq %rcx, %rax + ret + .p2align 4,, 4 +L(ret_vec_x0_end): + addq $(VEC_SIZE), %rax +L(ret_vec_x1_end): + bsrl %ecx, %ecx + leaq (VEC_SIZE * 2)(%rax, %rcx), %rax ret -END (__memrchr_evex) + +END(MEMRCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
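Two of the tricks above are easier to see in C. The sketches below are illustrative only: the function names are invented for this note, the first one uses AVX2-style movemask where the patch uses EVEX mask registers, and the assumed build flags are -mavx2 -mlzcnt (plus -mavx512bw -mavx512vl for the second).

    #include <immintrin.h>
    #include <stdint.h>
    #include <stddef.h>

    /* The lzcnt return convention for 1 <= len <= 32: lzcnt of a zero
       mask is 32, so one compare against len rejects both "no match"
       and "match before the buffer start", replacing the old bsr + add
       + bounds branch.  Assumes the 32 bytes ending at s + len - 1 are
       readable (the page-cross check in the patch guarantees this).  */
    static void *
    last_vec_lzcnt (const void *s, int c, size_t len)
    {
      const unsigned char *end = (const unsigned char *) s + len - 1;
      __m256i v = _mm256_loadu_si256 ((const __m256i *) (end - 31));
      uint32_t mask = (uint32_t) _mm256_movemask_epi8
        (_mm256_cmpeq_epi8 (v, _mm256_set1_epi8 ((char) c)));
      uint32_t from_end = _lzcnt_u32 (mask); /* distance back from end */
      return from_end < len ? (void *) (end - from_end) : NULL;
    }

    /* The 4x loop check: one compare-not-equal, two xors, a zero-masked
       min and one test-against-zero cover four vectors, mirroring the
       vpcmpb / vpxorq / vpminub / vptestnmb block in the loop.  */
    static int
    any_match_4x (__m256i v1, __m256i v2, __m256i v3, __m256i v4,
                  __m256i match)
    {
      __mmask32 k1 = _mm256_cmpneq_epi8_mask (v1, match); /* 0 on match */
      __m256i x2 = _mm256_xor_si256 (v2, match);  /* zero byte on match */
      __m256i x3 = _mm256_xor_si256 (v3, match);
      /* Lanes where v1 matched are forced to zero by the maskz, so the
         min has a zero byte iff v1, v2 or v3 matched.  */
      __m256i m = _mm256_maskz_min_epu8 (k1, x2, x3);
      __mmask32 k2 = _mm256_testn_epi8_mask (m, m);
      __mmask32 k4 = _mm256_cmpeq_epi8_mask (v4, match);
      return (k2 | k4) != 0;                      /* kortestd k2, k4 */
    }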
* [PATCH v2 6/8] x86: Optimize memrchr-avx2.S 2022-06-03 20:04 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (3 preceding siblings ...) 2022-06-03 20:04 ` [PATCH v2 5/8] x86: Optimize memrchr-evex.S Noah Goldstein @ 2022-06-03 20:04 ` Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein ` (2 subsequent siblings) 7 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 20:04 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller user-arg lengths more. 2. optimizes target placement more carefully. 3. reuses logic more. 4. fixes up various inefficiencies in the logic. The biggest case here is the `lzcnt` logic for checking returns, which saves either a branch or multiple instructions. The total code size saving is: 306 bytes Geometric Mean of all benchmarks New / Old: 0.760 Regressions: There are some regressions. Particularly where the length (user arg length) is large but the position of the match char is near the beginning of the string (in first VEC). This case has roughly a 10-20% regression. This is because the new logic gives the hot path for immediate matches to shorter lengths (the more common input). The shorter-length cases see roughly a 15-45% speedup. Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memrchr-avx2.S | 538 ++++++++++---------- 2 files changed, 260 insertions(+), 279 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S index cea2d2a72d..5e9beeeef2 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMRCHR __memrchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index ba2ce7cb03..6915e1c373 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -21,340 +21,320 @@ # include <sysdep.h> # ifndef MEMRCHR -# define MEMRCHR __memrchr_avx2 +# define MEMRCHR __memrchr_avx2 # endif # ifndef VZEROUPPER -# define VZEROUPPER vzeroupper +# define VZEROUPPER vzeroupper # endif +// abf-off # ifndef SECTION # define SECTION(p) p##.avx # endif +// abf-on + +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + .section SECTION(.text), "ax", @progbits +ENTRY(MEMRCHR) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) -# define VEC_SIZE 32 - - .section SECTION(.text),"ax",@progbits -ENTRY (MEMRCHR) - /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 - vpbroadcastb %xmm0, %ymm0 - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up the end ptr so + the match position can be computed by subtracting lzcnt from it. */ + leaq -1(%rdx, %rdi), %rax - /* Check the last VEC_SIZE bytes. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) + vpbroadcastb %xmm0, %ymm0 - /* Align data for aligned loads in the loop. 
*/ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx + /* Check if we can load 1x VEC without crossing a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) + +L(ret_vec_x0_test): + /* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE), which + guarantees edx (len) is no greater than it. */ + lzcntl %ecx, %ecx + + /* Hoist vzeroupper (not great for RTM) to save code size. This allows + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vmovdqa (%rdi), %ymm1 - vmovdqa VEC_SIZE(%rdi), %ymm2 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 - - vpcmpeqb %ymm1, %ymm0, %ymm1 - vpcmpeqb %ymm2, %ymm0, %ymm2 - vpcmpeqb %ymm3, %ymm0, %ymm3 - vpcmpeqb %ymm4, %ymm0, %ymm4 - - vpor %ymm1, %ymm2, %ymm5 - vpor %ymm3, %ymm4, %ymm6 - vpor %ymm5, %ymm6, %ymm5 - - vpmovmskb %ymm5, %eax - testl %eax, %eax - jz L(loop_4x_vec) - - /* There is a match. */ - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpmovmskb %ymm1, %eax - bsrl %eax, %eax - addq %rdi, %rax L(return_vzeroupper): ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) - - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Align rax (string pointer). 
*/ + andq $-VEC_SIZE, %rax + + /* Recompute remaining length after aligning. */ + movq %rax, %rdx + /* Need this comparison next no matter what. */ + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 + subq %rdi, %rdx + decq %rax + vpmovmskb %ymm1, %ecx + /* Fall through for short lengths (the hotter case). */ + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which in turn is necessary for the hot path (len <= VEC_SIZE) to fit + in the first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpeqb (%rsi), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %r8d + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ + notl %r8d + shlxl %r8d, %ecx, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret + .p2align 4,, 11 +L(ret_vec_x1): + /* This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + subq %rcx, %rax + VZEROUPPER_RETURN + .p2align 4,, 10 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - VZEROUPPER_RETURN + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - .p2align 4 -L(zero): - xorl %eax, %eax - VZEROUPPER_RETURN + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) + +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - .p2align 4 -L(null): + /* First in aligning bytes. 
*/ +L(zero_2): xorl %eax, %eax ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx + .p2align 4,, 4 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - vpcmpeqb (%rdi), %ymm0, %ymm1 - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx + .p2align 4,, 11 +L(ret_vec_x2): + /* ecx must be non-zero. */ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - vpmovmskb %ymm1, %eax + .p2align 4,, 14 +L(ret_vec_x3): + /* ecx must be non-zero. */ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Check for zero length. */ - testl %edx, %edx - jz L(null) + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) + testl %ecx, %ecx + jnz L(ret_vec_x3) - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + /* Check if near end before re-aligning (otherwise might do an + unnecessary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + /* Align rax to (VEC_SIZE * 4 - 1). */ + orq $(VEC_SIZE * 4 - 1), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + orq $(VEC_SIZE * 4 - 1), %rdx - /* Check the last VEC. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + .p2align 4 +L(loop_4x_vec): + /* Need this comparison next no matter what. */ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 - movl $1, %edx - sall %cl, %edx - subl $1, %edx + vpor %ymm1, %ymm2, %ymm2 + vpor %ymm3, %ymm4, %ymm4 + vpor %ymm2, %ymm4, %ymm4 + vpmovmskb %ymm4, %esi - andl %edx, %eax - testl %eax, %eax - jz L(zero) + testl %esi, %esi + jnz L(loop_end) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - VZEROUPPER_RETURN + addq $(VEC_SIZE * -4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) - .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx + subl %edi, %edx + incl %edx - /* Check the last VEC. */ - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 +L(last_4x_vec): + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - vpmovmskb %ymm1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_end) - /* Remove the trailing bytes. */ - andl %edx, %eax + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) - testl %eax, %eax - jnz L(last_vec_x1) + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - /* Check the second last VEC. 
*/ - vpcmpeqb (%rdi), %ymm0, %ymm1 + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) + + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret0) + xorl %eax, %eax +L(ret0): + ret - movl %r8d, %ecx - vpmovmskb %ymm1, %eax + .p2align 4 +L(loop_end): + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vpmovmskb %ymm2, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + vpmovmskb %ymm3, %ecx + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. */ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + .p2align 4,, 4 +L(ret_vec_x1_end): + /* 64-bit version will automatically add 32 (VEC_SIZE). */ + lzcntq %rcx, %rcx + subq %rcx, %rax + VZEROUPPER_RETURN - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 4 +L(ret_vec_x0_end): + lzcntl %ecx, %ecx + subq %rcx, %rax VZEROUPPER_RETURN -END (MEMRCHR) + + /* 2 bytes until next cache line. */ +END(MEMRCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
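The L(loop_end) fold (salq $32 / orq / bsrq) that both the SSE2 and AVX2 versions use also reads cleanly in C. A hedged sketch -- the name is invented for this note, and the caller must guarantee at least one of the two masks is non-zero, exactly as the jnz into L(loop_end) does:

    #include <stdint.h>

    /* mask3/mask4 are the vpmovmskb results for the two lowest vectors
       of the failing loop iteration; base points at the lowest byte
       covered by mask4.  */
    static const unsigned char *
    loop_end_fold (uint32_t mask3, uint32_t mask4,
                   const unsigned char *base)
    {
      /* If mask3 is zero it cannot disturb mask4's bits; if non-zero,
         its bits sit 32 higher, so bsr picks the VEC3 match first.  */
      uint64_t both = ((uint64_t) mask3 << 32) | mask4;
      unsigned int pos = 63 - (unsigned int) __builtin_clzll (both);
      return base + pos;
    }

One bit-scan over the merged value replaces a test-and-branch per vector, which is part of how the branch count in the tail drops relative to the old code.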
* [PATCH v2 7/8] x86: Shrink code size of memchr-avx2.S 2022-06-03 20:04 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (4 preceding siblings ...) 2022-06-03 20:04 ` [PATCH v2 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein @ 2022-06-03 20:04 ` Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 2022-06-03 23:09 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library H.J. Lu 7 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 20:04 UTC (permalink / raw) To: libc-alpha This is not meant as a performance optimization. The previous code was far too liberal in aligning targets and wasted code size unnecessarily. The total code size saving is: 59 bytes There are no major changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 0.967 Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memchr-avx2.S | 109 +++++++++++---------- 2 files changed, 60 insertions(+), 50 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S index 87b076c7c4..c4d71938c5 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMCHR __memchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index 75bd7262e0..28a01280ec 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -57,7 +57,7 @@ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 5) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ # ifdef __ILP32__ @@ -87,12 +87,14 @@ ENTRY (MEMCHR) # endif testl %eax, %eax jz L(aligned_more) - tzcntl %eax, %eax + bsfl %eax, %eax addq %rdi, %rax - VZEROUPPER_RETURN +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + # ifndef USE_AS_RAWMEMCHR - .p2align 5 + .p2align 4 L(first_vec_x0): /* Check if first match was before length. */ tzcntl %eax, %eax @@ -100,58 +102,31 @@ L(first_vec_x0): /* NB: Multiply length by 4 to get byte count. */ sall $2, %edx # endif - xorl %ecx, %ecx + COND_VZEROUPPER + /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch + block. Branching here as opposed to cmovcc is not that costly. Common + usage of memchr is to check if the return was NULL (if string was + known to contain CHAR user would use rawmemchr). This branch will be + highly correlated with the user branch and can be used by most + modern branch predictors to predict the user branch. */ cmpl %eax, %edx - leaq (%rdi, %rax), %rax - cmovle %rcx, %rax - VZEROUPPER_RETURN - -L(null): - xorl %eax, %eax - ret -# endif - .p2align 4 -L(cross_page_boundary): - /* Save pointer before aligning as its original value is - necessary for computer return address if byte is found or - adjusting length if it is not and this is memchr. */ - movq %rdi, %rcx - /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr - and rdi for rawmemchr. */ - orq $(VEC_SIZE - 1), %ALGN_PTR_REG - VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax -# ifndef USE_AS_RAWMEMCHR - /* Calculate length until end of page (length checked for a - match). 
*/ - leaq 1(%ALGN_PTR_REG), %rsi - subq %RRAW_PTR_REG, %rsi -# ifdef USE_AS_WMEMCHR - /* NB: Divide bytes by 4 to get wchar_t count. */ - shrl $2, %esi -# endif -# endif - /* Remove the leading bytes. */ - sarxl %ERAW_PTR_REG, %eax, %eax -# ifndef USE_AS_RAWMEMCHR - /* Check the end of data. */ - cmpq %rsi, %rdx - jbe L(first_vec_x0) + jle L(null) + addq %rdi, %rax + ret # endif - testl %eax, %eax - jz L(cross_page_continue) - tzcntl %eax, %eax - addq %RRAW_PTR_REG, %rax -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 + .p2align 4,, 10 L(first_vec_x1): - tzcntl %eax, %eax + bsfl %eax, %eax incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - +# ifndef USE_AS_RAWMEMCHR + /* First in aligning bytes here. */ +L(null): + xorl %eax, %eax + ret +# endif .p2align 4 L(first_vec_x2): tzcntl %eax, %eax @@ -340,7 +315,7 @@ L(first_vec_x1_check): incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - .p2align 4 + .p2align 4,, 6 L(set_zero_end): xorl %eax, %eax VZEROUPPER_RETURN @@ -428,5 +403,39 @@ L(last_vec_x3): VZEROUPPER_RETURN # endif + .p2align 4 +L(cross_page_boundary): + /* Save pointer before aligning as its original value is necessary for + computing the return address if a byte is found, or for adjusting the + length if it is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for + rawmemchr. */ + andq $-VEC_SIZE, %ALGN_PTR_REG + VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Calculate length until end of page (length checked for a match). */ + leal VEC_SIZE(%ALGN_PTR_REG), %esi + subl %ERAW_PTR_REG, %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %esi +# endif +# endif + /* Remove the leading bytes. */ + sarxl %ERAW_PTR_REG, %eax, %eax +# ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ + cmpq %rsi, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(cross_page_continue) + bsfl %eax, %eax + addq %RRAW_PTR_REG, %rax + VZEROUPPER_RETURN + + END (MEMCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
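For contrast with the memrchr page-cross handling earlier in the series: memchr's cross_page_boundary works in the forward direction, so the mask is shifted right rather than left. A sketch under the same caveats (the name is this note's invention; the assembly uses sarxl with the raw pointer itself as the shift count, relying on the hardware masking the count to 5 bits, and the arithmetic shift is harmless there because only the lowest set bit is ever consumed):

    #include <immintrin.h>  /* AVX2 */
    #include <stdint.h>

    /* One in-page aligned load covering p; bit 0 of the result
       corresponds to the byte at p itself.  */
    static uint32_t
    forward_page_cross_mask (const unsigned char *p, __m256i match)
    {
      const __m256i *aligned
        = (const __m256i *) ((uintptr_t) p & ~(uintptr_t) 31);
      uint32_t raw = (uint32_t) _mm256_movemask_epi8
        (_mm256_cmpeq_epi8 (_mm256_load_si256 (aligned), match));
      return raw >> ((uintptr_t) p & 31);  /* drop leading bytes */
    }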
* [PATCH v2 8/8] x86: Shrink code size of memchr-evex.S 2022-06-03 20:04 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (5 preceding siblings ...) 2022-06-03 20:04 ` [PATCH v2 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein @ 2022-06-03 20:04 ` Noah Goldstein 2022-06-03 23:09 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library H.J. Lu 7 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 20:04 UTC (permalink / raw) To: libc-alpha This is not meant as a performance optimization. The previous code was far too liberal in aligning targets and wasted code size unnecessarily. The total code size saving is: 64 bytes There are no non-negligible changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 1.000 Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memchr-evex.S | 46 ++++++++++++++------------ 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S index cfaf02907d..0fd11b7632 100644 --- a/sysdeps/x86_64/multiarch/memchr-evex.S +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -88,7 +88,7 @@ # define PAGE_SIZE 4096 .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 6) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ test %RDX_LP, %RDX_LP @@ -131,22 +131,24 @@ L(zero): xorl %eax, %eax ret - .p2align 5 + .p2align 4 L(first_vec_x0): - /* Check if first match was before length. */ - tzcntl %eax, %eax - xorl %ecx, %ecx - cmpl %eax, %edx - leaq (%rdi, %rax, CHAR_SIZE), %rax - cmovle %rcx, %rax + /* Check if first match was before length. NB: tzcnt has false data- + dependency on destination. eax already had a data-dependency on esi + so this should have no effect here. */ + tzcntl %eax, %esi +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rsi, CHAR_SIZE), %rdi +# else + addq %rsi, %rdi +# endif + xorl %eax, %eax + cmpl %esi, %edx + cmovg %rdi, %rax ret -# else - /* NB: first_vec_x0 is 17 bytes which will leave - cross_page_boundary (which is relatively cold) close enough - to ideal alignment. So only realign L(cross_page_boundary) if - rawmemchr. */ - .p2align 4 # endif + + .p2align 4 L(cross_page_boundary): /* Save pointer before aligning as its original value is necessary for computer return address if byte is found or @@ -400,10 +402,14 @@ L(last_2x_vec): L(zero_end): ret +L(set_zero_end): + xorl %eax, %eax + ret .p2align 4 L(first_vec_x1_check): - tzcntl %eax, %eax + /* eax must be non-zero. Use bsfl to save code size. */ + bsfl %eax, %eax /* Adjust length. */ subl $-(CHAR_PER_VEC * 4), %edx /* Check if match within remaining length. */ @@ -412,9 +418,6 @@ L(first_vec_x1_check): /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. 
*/ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ret -L(set_zero_end): - xorl %eax, %eax - ret .p2align 4 L(loop_4x_vec_end): @@ -464,7 +467,7 @@ L(loop_4x_vec_end): # endif ret - .p2align 4 + .p2align 4,, 10 L(last_vec_x1_return): tzcntl %eax, %eax # if defined USE_AS_WMEMCHR || RET_OFFSET != 0 @@ -496,6 +499,7 @@ L(last_vec_x3_return): # endif # ifndef USE_AS_RAWMEMCHR + .p2align 4,, 5 L(last_4x_vec_or_less_cmpeq): VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 kmovd %k0, %eax @@ -546,7 +550,7 @@ L(last_4x_vec): # endif andl %ecx, %eax jz L(zero_end2) - tzcntl %eax, %eax + bsfl %eax, %eax leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax L(zero_end2): ret @@ -562,6 +566,6 @@ L(last_vec_x3): leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ret # endif - + /* 7 bytes from next cache line. */ END (MEMCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
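The reworked L(first_vec_x0) above (tzcnt false-dependency note plus the cmovg) boils down to a bounds-checked select. A byte-wise C sketch, illustrative only -- the name is invented here, and the wmemchr variant additionally scales the position by CHAR_SIZE:

    #include <immintrin.h>  /* _tzcnt_u32, needs BMI1 */
    #include <stdint.h>
    #include <stddef.h>

    /* tzcnt of a zero mask is 32, so "no match at all" and "match past
       len" fail the same test; the assembly materializes the select
       with xorl + cmovg instead of a branch.  */
    static void *
    first_vec_ret (const unsigned char *p, uint32_t mask, size_t len)
    {
      uint32_t pos = _tzcnt_u32 (mask);
      return pos < len ? (void *) (p + pos) : NULL;
    }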
* Re: [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library 2022-06-03 20:04 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (6 preceding siblings ...) 2022-06-03 20:04 ` [PATCH v2 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein @ 2022-06-03 23:09 ` H.J. Lu 2022-06-03 23:49 ` Noah Goldstein 7 siblings, 1 reply; 82+ messages in thread From: H.J. Lu @ 2022-06-03 23:09 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Fri, Jun 3, 2022 at 1:04 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > This patch does not touch any existing code and is only meant to be a > tool for future patches so that simple source files can more easily be > maintained to target multiple VEC classes. > > There is no difference in the objdump of libc.so before and after this > patch. > --- > sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 33 +++++++++ > sysdeps/x86_64/multiarch/avx-vecs.h | 53 ++++++++++++++ > sysdeps/x86_64/multiarch/avx2-rtm-vecs.h | 33 +++++++++ > sysdeps/x86_64/multiarch/avx2-vecs.h | 30 ++++++++ > sysdeps/x86_64/multiarch/evex256-vecs.h | 50 +++++++++++++ > sysdeps/x86_64/multiarch/evex512-vecs.h | 49 +++++++++++++ > sysdeps/x86_64/multiarch/sse2-vecs.h | 48 +++++++++++++ > sysdeps/x86_64/multiarch/vec-macros.h | 90 ++++++++++++++++++++++++ > 8 files changed, 386 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/avx2-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h > > diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > new file mode 100644 > index 0000000000..c00b83ea0e > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > @@ -0,0 +1,33 @@ > +/* Common config for AVX-RTM VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#ifndef _AVX_RTM_VECS_H > +#define _AVX_RTM_VECS_H 1 > + > +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ > + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > + > +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) > + > +#define SECTION(p) p##.avx.rtm > + > +#define USE_WITH_RTM 1 > +#include "avx-vecs.h" > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h > new file mode 100644 > index 0000000000..3b84d7e8b2 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/avx-vecs.h > @@ -0,0 +1,53 @@ > +/* Common config for AVX VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _AVX_VECS_H > +#define _AVX_VECS_H 1 > + > +#ifdef HAS_VEC > +# error "Multiple VEC configs included!" > +#endif > + > +#define HAS_VEC 1 > +#include "vec-macros.h" > + > +#ifndef USE_WITH_AVX2 > +# define USE_WITH_AVX 1 > +#endif > +/* Included by RTM version. */ > +#ifndef SECTION > +# define SECTION(p) p##.avx > +#endif Can SECTION be defined unconditionally? If a different SECTION is needed, you can undef it first, > + > +#define VEC_SIZE 32 > +/* 4-byte mov instructions with AVX2. */ > +#define MOV_SIZE 4 > +/* 1 (ret) + 3 (vzeroupper). */ > +#define RET_SIZE 4 > +#define VZEROUPPER vzeroupper > + > +#define VMOVU vmovdqu > +#define VMOVA vmovdqa > +#define VMOVNT vmovntdq > + > +/* Often need to access xmm portion. */ > +#define VEC_xmm VEC_any_xmm > +#define VEC VEC_any_ymm Can we check VEC or VEC_SIZE instead of HAS_VEC? > + > +#endif Do we need both AVX and AVX2? Will AVX2 be sufficient? > diff --git a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > new file mode 100644 > index 0000000000..a5d46e8c66 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > @@ -0,0 +1,33 @@ > +/* Common config for AVX2-RTM VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#ifndef _AVX2_RTM_VECS_H > +#define _AVX2_RTM_VECS_H 1 > + > +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ > + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > + > +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) > + > +#define SECTION(p) p##.avx.rtm > + > +#define USE_WITH_RTM 1 > +#include "avx2-vecs.h" > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/avx2-vecs.h b/sysdeps/x86_64/multiarch/avx2-vecs.h > new file mode 100644 > index 0000000000..4c029b4621 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/avx2-vecs.h > @@ -0,0 +1,30 @@ > +/* Common config for AVX2 VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _AVX2_VECS_H > +#define _AVX2_VECS_H 1 > + > +#define USE_WITH_AVX2 1 > +/* Included by RTM version. */ > +#ifndef SECTION > +# define SECTION(p) p##.avx > +#endif > +#include "avx-vecs.h" > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h > new file mode 100644 > index 0000000000..ed7a32b0ec > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h > @@ -0,0 +1,50 @@ > +/* Common config for EVEX256 VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _EVEX256_VECS_H > +#define _EVEX256_VECS_H 1 > + > +#ifdef HAS_VEC > +# error "Multiple VEC configs included!" > +#endif > + > +#define HAS_VEC 1 > +#include "vec-macros.h" > + > +#define USE_WITH_EVEX256 1 > +#ifndef SECTION > +# define SECTION(p) p##.evex > +#endif > + > +#define VEC_SIZE 32 > +/* 6-byte mov instructions with EVEX. */ > +#define MOV_SIZE 6 > +/* No vzeroupper needed. */ > +#define RET_SIZE 1 > +#define VZEROUPPER > + > +#define VMOVU vmovdqu64 > +#define VMOVA vmovdqa64 > +#define VMOVNT vmovntdq > + > +/* Often need to access xmm portion. */ > +#define VEC_xmm VEC_hi_xmm > +#define VEC VEC_hi_ymm Can we add evex-vecs.h for common macros? 
> +#endif > diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h > new file mode 100644 > index 0000000000..53597734fc > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h > @@ -0,0 +1,49 @@ > +/* Common config for EVEX512 VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _EVEX512_VECS_H > +#define _EVEX512_VECS_H 1 > + > +#ifdef HAS_VEC > +# error "Multiple VEC configs included!" > +#endif > + > +#define HAS_VEC 1 > +#include "vec-macros.h" > + > +#define USE_WITH_EVEX512 1 > +#define SECTION(p) p##.evex512 > + > +#define VEC_SIZE 64 > +/* 6-byte mov instructions with EVEX. */ > +#define MOV_SIZE 6 > +/* No vzeroupper needed. */ > +#define RET_SIZE 1 > +#define VZEROUPPER > + > +#define VMOVU vmovdqu64 > +#define VMOVA vmovdqa64 > +#define VMOVNT vmovntdq > + > +/* Often need to access xmm/ymm portion. */ > +#define VEC_xmm VEC_hi_xmm > +#define VEC_ymm VEC_hi_ymm > +#define VEC VEC_hi_zmm > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h > new file mode 100644 > index 0000000000..b645b93e3d > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/sse2-vecs.h > @@ -0,0 +1,48 @@ > +/* Common config for SSE2 VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _SSE2_VECS_H > +#define _SSE2_VECS_H 1 > + > +#ifdef HAS_VEC > +# error "Multiple VEC configs included!" > +#endif > + > +#define HAS_VEC 1 > +#include "vec-macros.h" > + > +#define USE_WITH_SSE2 1 > +#define SECTION(p) p > + > +#define VEC_SIZE 16 > +/* 3-byte mov instructions with SSE2. */ > +#define MOV_SIZE 3 > +/* No vzeroupper needed. 
*/ > +#define RET_SIZE 1 > + > +#define VMOVU movups > +#define VMOVA movaps > +#define VMOVNT movntdq > +#define VZEROUPPER > + > +#define VEC_xmm VEC_any_xmm > +#define VEC VEC_any_xmm > + > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h > new file mode 100644 > index 0000000000..4dae4503c8 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/vec-macros.h > @@ -0,0 +1,90 @@ > +/* Macro helpers for VEC_{type}({vec_num}) > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _VEC_MACROS_H > +# define _VEC_MACROS_H 1 Remove a space after #. > + > +# ifndef HAS_VEC > +# error "Never include this file directly. Always include a vector config." > +# endif Remove a space after #. > + > +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same > + VEC(N) values. */ > +#define VEC_hi_xmm0 xmm16 > +#define VEC_hi_xmm1 xmm17 > +#define VEC_hi_xmm2 xmm18 > +#define VEC_hi_xmm3 xmm19 > +#define VEC_hi_xmm4 xmm20 > +#define VEC_hi_xmm5 xmm21 > +#define VEC_hi_xmm6 xmm22 > +#define VEC_hi_xmm7 xmm23 > +#define VEC_hi_xmm8 xmm24 > +#define VEC_hi_xmm9 xmm25 > +#define VEC_hi_xmm10 xmm26 > +#define VEC_hi_xmm11 xmm27 > +#define VEC_hi_xmm12 xmm28 > +#define VEC_hi_xmm13 xmm29 > +#define VEC_hi_xmm14 xmm30 > +#define VEC_hi_xmm15 xmm31 > + > +#define VEC_hi_ymm0 ymm16 > +#define VEC_hi_ymm1 ymm17 > +#define VEC_hi_ymm2 ymm18 > +#define VEC_hi_ymm3 ymm19 > +#define VEC_hi_ymm4 ymm20 > +#define VEC_hi_ymm5 ymm21 > +#define VEC_hi_ymm6 ymm22 > +#define VEC_hi_ymm7 ymm23 > +#define VEC_hi_ymm8 ymm24 > +#define VEC_hi_ymm9 ymm25 > +#define VEC_hi_ymm10 ymm26 > +#define VEC_hi_ymm11 ymm27 > +#define VEC_hi_ymm12 ymm28 > +#define VEC_hi_ymm13 ymm29 > +#define VEC_hi_ymm14 ymm30 > +#define VEC_hi_ymm15 ymm31 > + > +#define VEC_hi_zmm0 zmm16 > +#define VEC_hi_zmm1 zmm17 > +#define VEC_hi_zmm2 zmm18 > +#define VEC_hi_zmm3 zmm19 > +#define VEC_hi_zmm4 zmm20 > +#define VEC_hi_zmm5 zmm21 > +#define VEC_hi_zmm6 zmm22 > +#define VEC_hi_zmm7 zmm23 > +#define VEC_hi_zmm8 zmm24 > +#define VEC_hi_zmm9 zmm25 > +#define VEC_hi_zmm10 zmm26 > +#define VEC_hi_zmm11 zmm27 > +#define VEC_hi_zmm12 zmm28 > +#define VEC_hi_zmm13 zmm29 > +#define VEC_hi_zmm14 zmm30 > +#define VEC_hi_zmm15 zmm31 > + > +# define PRIMITIVE_VEC(vec, num) vec##num > + > +# define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) > +# define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) > +# define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) > + > +# define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) > +# define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) > +# define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) > + > +#endif > -- > 2.34.1 > -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library 2022-06-03 23:09 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library H.J. Lu @ 2022-06-03 23:49 ` Noah Goldstein 0 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 23:49 UTC (permalink / raw) To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell On Fri, Jun 3, 2022 at 6:10 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Fri, Jun 3, 2022 at 1:04 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > This patch does not touch any existing code and is only meant to be a > > tool for future patches so that simple source files can more easily be > > maintained to target multiple VEC classes. > > > > There is no difference in the objdump of libc.so before and after this > > patch. > > --- > > sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 33 +++++++++ > > sysdeps/x86_64/multiarch/avx-vecs.h | 53 ++++++++++++++ > > sysdeps/x86_64/multiarch/avx2-rtm-vecs.h | 33 +++++++++ > > sysdeps/x86_64/multiarch/avx2-vecs.h | 30 ++++++++ > > sysdeps/x86_64/multiarch/evex256-vecs.h | 50 +++++++++++++ > > sysdeps/x86_64/multiarch/evex512-vecs.h | 49 +++++++++++++ > > sysdeps/x86_64/multiarch/sse2-vecs.h | 48 +++++++++++++ > > sysdeps/x86_64/multiarch/vec-macros.h | 90 ++++++++++++++++++++++++ > > 8 files changed, 386 insertions(+) > > create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h > > create mode 100644 sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > > create mode 100644 sysdeps/x86_64/multiarch/avx2-vecs.h > > create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h > > create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h > > create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h > > create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h > > > > diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > new file mode 100644 > > index 0000000000..c00b83ea0e > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > @@ -0,0 +1,33 @@ > > +/* Common config for AVX-RTM VECs > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. 
*/ > > + > > +#ifndef _AVX_RTM_VECS_H > > +#define _AVX_RTM_VECS_H 1 > > + > > +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > + > > +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) > > + > > +#define SECTION(p) p##.avx.rtm > > + > > +#define USE_WITH_RTM 1 > > +#include "avx-vecs.h" > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h > > new file mode 100644 > > index 0000000000..3b84d7e8b2 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/avx-vecs.h > > @@ -0,0 +1,53 @@ > > +/* Common config for AVX VECs > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _AVX_VECS_H > > +#define _AVX_VECS_H 1 > > + > > +#ifdef HAS_VEC > > +# error "Multiple VEC configs included!" > > +#endif > > + > > +#define HAS_VEC 1 > > +#include "vec-macros.h" > > + > > +#ifndef USE_WITH_AVX2 > > +# define USE_WITH_AVX 1 > > +#endif > > +/* Included by RTM version. */ > > +#ifndef SECTION > > +# define SECTION(p) p##.avx > > +#endif > > Can SECTION be defined unconditionally? If a different SECTION > is needed, you can undef it first, Fixed in V2. > > > + > > +#define VEC_SIZE 32 > > +/* 4-byte mov instructions with AVX2. */ > > +#define MOV_SIZE 4 > > +/* 1 (ret) + 3 (vzeroupper). */ > > +#define RET_SIZE 4 > > +#define VZEROUPPER vzeroupper > > + > > +#define VMOVU vmovdqu > > +#define VMOVA vmovdqa > > +#define VMOVNT vmovntdq > > + > > +/* Often need to access xmm portion. */ > > +#define VEC_xmm VEC_any_xmm > > +#define VEC VEC_any_ymm > > Can we check VEC or VEC_SIZE instead of HAS_VEC? Changed in V2. > > > + > > +#endif > > Do we need both AVX and AVX2? Will AVX2 be sufficient? Removed avx2 version in V2. > > > diff --git a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > > new file mode 100644 > > index 0000000000..a5d46e8c66 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > > @@ -0,0 +1,33 @@ > > +/* Common config for AVX2-RTM VECs > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _AVX2_RTM_VECS_H > > +#define _AVX2_RTM_VECS_H 1 > > + > > +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > + > > +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) > > + > > +#define SECTION(p) p##.avx.rtm > > + > > +#define USE_WITH_RTM 1 > > +#include "avx2-vecs.h" > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/avx2-vecs.h b/sysdeps/x86_64/multiarch/avx2-vecs.h > > new file mode 100644 > > index 0000000000..4c029b4621 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/avx2-vecs.h > > @@ -0,0 +1,30 @@ > > +/* Common config for AVX2 VECs > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _AVX2_VECS_H > > +#define _AVX2_VECS_H 1 > > + > > +#define USE_WITH_AVX2 1 > > +/* Included by RTM version. */ > > +#ifndef SECTION > > +# define SECTION(p) p##.avx > > +#endif > > +#include "avx-vecs.h" > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h > > new file mode 100644 > > index 0000000000..ed7a32b0ec > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h > > @@ -0,0 +1,50 @@ > > +/* Common config for EVEX256 VECs > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _EVEX256_VECS_H > > +#define _EVEX256_VECS_H 1 > > + > > +#ifdef HAS_VEC > > +# error "Multiple VEC configs included!" 
> > +#endif > > + > > +#define HAS_VEC 1 > > +#include "vec-macros.h" > > + > > +#define USE_WITH_EVEX256 1 > > +#ifndef SECTION > > +# define SECTION(p) p##.evex > > +#endif > > + > > +#define VEC_SIZE 32 > > +/* 6-byte mov instructions with EVEX. */ > > +#define MOV_SIZE 6 > > +/* No vzeroupper needed. */ > > +#define RET_SIZE 1 > > +#define VZEROUPPER > > + > > +#define VMOVU vmovdqu64 > > +#define VMOVA vmovdqa64 > > +#define VMOVNT vmovntdq > > + > > +/* Often need to access xmm portion. */ > > +#define VEC_xmm VEC_hi_xmm > > +#define VEC VEC_hi_ymm > > Can we add evex-vecs.h for common macros? Done in V2. > > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h > > new file mode 100644 > > index 0000000000..53597734fc > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h > > @@ -0,0 +1,49 @@ > > +/* Common config for EVEX512 VECs > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _EVEX512_VECS_H > > +#define _EVEX512_VECS_H 1 > > + > > +#ifdef HAS_VEC > > +# error "Multiple VEC configs included!" > > +#endif > > + > > +#define HAS_VEC 1 > > +#include "vec-macros.h" > > + > > +#define USE_WITH_EVEX512 1 > > +#define SECTION(p) p##.evex512 > > + > > +#define VEC_SIZE 64 > > +/* 6-byte mov instructions with EVEX. */ > > +#define MOV_SIZE 6 > > +/* No vzeroupper needed. */ > > +#define RET_SIZE 1 > > +#define VZEROUPPER > > + > > +#define VMOVU vmovdqu64 > > +#define VMOVA vmovdqa64 > > +#define VMOVNT vmovntdq > > + > > +/* Often need to access xmm/ymm portion. */ > > +#define VEC_xmm VEC_hi_xmm > > +#define VEC_ymm VEC_hi_ymm > > +#define VEC VEC_hi_zmm > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h > > new file mode 100644 > > index 0000000000..b645b93e3d > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/sse2-vecs.h > > @@ -0,0 +1,48 @@ > > +/* Common config for SSE2 VECs > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. 
> > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _SSE2_VECS_H > > +#define _SSE2_VECS_H 1 > > + > > +#ifdef HAS_VEC > > +# error "Multiple VEC configs included!" > > +#endif > > + > > +#define HAS_VEC 1 > > +#include "vec-macros.h" > > + > > +#define USE_WITH_SSE2 1 > > +#define SECTION(p) p > > + > > +#define VEC_SIZE 16 > > +/* 3-byte mov instructions with SSE2. */ > > +#define MOV_SIZE 3 > > +/* No vzeroupper needed. */ > > +#define RET_SIZE 1 > > + > > +#define VMOVU movups > > +#define VMOVA movaps > > +#define VMOVNT movntdq > > +#define VZEROUPPER > > + > > +#define VEC_xmm VEC_any_xmm > > +#define VEC VEC_any_xmm > > + > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h > > new file mode 100644 > > index 0000000000..4dae4503c8 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/vec-macros.h > > @@ -0,0 +1,90 @@ > > +/* Macro helpers for VEC_{type}({vec_num}) > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _VEC_MACROS_H > > +# define _VEC_MACROS_H 1 > > Remove a space after #. Fixed in V2. > > > + > > +# ifndef HAS_VEC > > +# error "Never include this file directly. Always include a vector config." > > +# endif > > Remove a space after #. Fixed in V2. > > > + > > +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same > > + VEC(N) values. 
*/ > > +#define VEC_hi_xmm0 xmm16 > > +#define VEC_hi_xmm1 xmm17 > > +#define VEC_hi_xmm2 xmm18 > > +#define VEC_hi_xmm3 xmm19 > > +#define VEC_hi_xmm4 xmm20 > > +#define VEC_hi_xmm5 xmm21 > > +#define VEC_hi_xmm6 xmm22 > > +#define VEC_hi_xmm7 xmm23 > > +#define VEC_hi_xmm8 xmm24 > > +#define VEC_hi_xmm9 xmm25 > > +#define VEC_hi_xmm10 xmm26 > > +#define VEC_hi_xmm11 xmm27 > > +#define VEC_hi_xmm12 xmm28 > > +#define VEC_hi_xmm13 xmm29 > > +#define VEC_hi_xmm14 xmm30 > > +#define VEC_hi_xmm15 xmm31 > > + > > +#define VEC_hi_ymm0 ymm16 > > +#define VEC_hi_ymm1 ymm17 > > +#define VEC_hi_ymm2 ymm18 > > +#define VEC_hi_ymm3 ymm19 > > +#define VEC_hi_ymm4 ymm20 > > +#define VEC_hi_ymm5 ymm21 > > +#define VEC_hi_ymm6 ymm22 > > +#define VEC_hi_ymm7 ymm23 > > +#define VEC_hi_ymm8 ymm24 > > +#define VEC_hi_ymm9 ymm25 > > +#define VEC_hi_ymm10 ymm26 > > +#define VEC_hi_ymm11 ymm27 > > +#define VEC_hi_ymm12 ymm28 > > +#define VEC_hi_ymm13 ymm29 > > +#define VEC_hi_ymm14 ymm30 > > +#define VEC_hi_ymm15 ymm31 > > + > > +#define VEC_hi_zmm0 zmm16 > > +#define VEC_hi_zmm1 zmm17 > > +#define VEC_hi_zmm2 zmm18 > > +#define VEC_hi_zmm3 zmm19 > > +#define VEC_hi_zmm4 zmm20 > > +#define VEC_hi_zmm5 zmm21 > > +#define VEC_hi_zmm6 zmm22 > > +#define VEC_hi_zmm7 zmm23 > > +#define VEC_hi_zmm8 zmm24 > > +#define VEC_hi_zmm9 zmm25 > > +#define VEC_hi_zmm10 zmm26 > > +#define VEC_hi_zmm11 zmm27 > > +#define VEC_hi_zmm12 zmm28 > > +#define VEC_hi_zmm13 zmm29 > > +#define VEC_hi_zmm14 zmm30 > > +#define VEC_hi_zmm15 zmm31 > > + > > +# define PRIMITIVE_VEC(vec, num) vec##num > > + > > +# define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) > > +# define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) > > +# define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) > > + > > +# define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) > > +# define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) > > +# define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) > > + Removed spaces here as well in V2. > > +#endif > > -- > > 2.34.1 > > > > > -- > H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH v3 1/8] x86: Create header for VEC classes in x86 strings library 2022-06-03 4:42 ` [PATCH v1 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein @ 2022-06-03 23:49 ` Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein ` (6 more replies) 2022-06-06 22:37 ` [PATCH v4 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (2 subsequent siblings) 4 siblings, 7 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 23:49 UTC (permalink / raw) To: libc-alpha This patch does not touch any existing code and is only meant to be a tool for future patches so that simple source files can more easily be maintained to target multiple VEC classes. There is no difference in the objdump of libc.so before and after this patch. --- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 34 ++++++++ sysdeps/x86_64/multiarch/avx-vecs.h | 47 +++++++++++ sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 +++++++++ sysdeps/x86_64/multiarch/evex256-vecs.h | 35 ++++++++ sysdeps/x86_64/multiarch/evex512-vecs.h | 35 ++++++++ sysdeps/x86_64/multiarch/sse2-vecs.h | 47 +++++++++++ sysdeps/x86_64/multiarch/vec-macros.h | 90 +++++++++++++++++++++ 7 files changed, 327 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h new file mode 100644 index 0000000000..3f531dd47f --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -0,0 +1,34 @@ +/* Common config for AVX-RTM VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX_RTM_VECS_H +#define _AVX_RTM_VECS_H 1 + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define USE_WITH_RTM 1 +#include "avx-vecs.h" + +#undef SECTION +#define SECTION(p) p##.avx.rtm + +#endif diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h new file mode 100644 index 0000000000..89680f5db8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-vecs.h @@ -0,0 +1,47 @@ +/* Common config for AVX VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX_VECS_H +#define _AVX_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 32 +#include "vec-macros.h" + +#define USE_WITH_AVX 1 +#define SECTION(p) p##.avx + +/* 4-byte mov instructions with AVX2. */ +#define MOV_SIZE 4 +/* 1 (ret) + 3 (vzeroupper). */ +#define RET_SIZE 4 +#define VZEROUPPER vzeroupper + +#define VMOVU vmovdqu +#define VMOVA vmovdqa +#define VMOVNT vmovntdq + +/* Often need to access xmm portion. */ +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h new file mode 100644 index 0000000000..99806ebcd7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex-vecs-common.h @@ -0,0 +1,39 @@ +/* Common config for EVEX256 and EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX_VECS_COMMON_H +#define _EVEX_VECS_COMMON_H 1 + +#include "vec-macros.h" + +/* 6-byte mov instructions with EVEX. */ +#define MOV_SIZE 6 +/* No vzeroupper needed. */ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU vmovdqu64 +#define VMOVA vmovdqa64 +#define VMOVNT vmovntdq + +#define VEC_xmm VEC_hi_xmm +#define VEC_ymm VEC_hi_ymm +#define VEC_zmm VEC_hi_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h new file mode 100644 index 0000000000..222ba46dc7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h @@ -0,0 +1,35 @@ +/* Common config for EVEX256 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX256_VECS_H +#define _EVEX256_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 32 +#include "evex-vecs-common.h" + +#define USE_WITH_EVEX256 1 +#define SECTION(p) p##.evex + +#define VEC VEC_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h new file mode 100644 index 0000000000..d1784d5368 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h @@ -0,0 +1,35 @@ +/* Common config for EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX512_VECS_H +#define _EVEX512_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 64 +#include "evex-vecs-common.h" + +#define USE_WITH_EVEX512 1 +#define SECTION(p) p##.evex512 + +#define VEC VEC_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h new file mode 100644 index 0000000000..2b77a59d56 --- /dev/null +++ b/sysdeps/x86_64/multiarch/sse2-vecs.h @@ -0,0 +1,47 @@ +/* Common config for SSE2 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _SSE2_VECS_H +#define _SSE2_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 16 +#include "vec-macros.h" + +#define USE_WITH_SSE2 1 +#define SECTION(p) p + +/* 3-byte mov instructions with SSE2. */ +#define MOV_SIZE 3 +/* No vzeroupper needed. 
*/ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU movups +#define VMOVA movaps +#define VMOVNT movntdq + +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_xmm + + +#endif diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h new file mode 100644 index 0000000000..9f3ffecede --- /dev/null +++ b/sysdeps/x86_64/multiarch/vec-macros.h @@ -0,0 +1,90 @@ +/* Macro helpers for VEC_{type}({vec_num}) + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _VEC_MACROS_H +#define _VEC_MACROS_H 1 + +#ifndef VEC_SIZE +# error "Never include this file directly. Always include a vector config." +#endif + +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same + VEC(N) values. */ +#define VEC_hi_xmm0 xmm16 +#define VEC_hi_xmm1 xmm17 +#define VEC_hi_xmm2 xmm18 +#define VEC_hi_xmm3 xmm19 +#define VEC_hi_xmm4 xmm20 +#define VEC_hi_xmm5 xmm21 +#define VEC_hi_xmm6 xmm22 +#define VEC_hi_xmm7 xmm23 +#define VEC_hi_xmm8 xmm24 +#define VEC_hi_xmm9 xmm25 +#define VEC_hi_xmm10 xmm26 +#define VEC_hi_xmm11 xmm27 +#define VEC_hi_xmm12 xmm28 +#define VEC_hi_xmm13 xmm29 +#define VEC_hi_xmm14 xmm30 +#define VEC_hi_xmm15 xmm31 + +#define VEC_hi_ymm0 ymm16 +#define VEC_hi_ymm1 ymm17 +#define VEC_hi_ymm2 ymm18 +#define VEC_hi_ymm3 ymm19 +#define VEC_hi_ymm4 ymm20 +#define VEC_hi_ymm5 ymm21 +#define VEC_hi_ymm6 ymm22 +#define VEC_hi_ymm7 ymm23 +#define VEC_hi_ymm8 ymm24 +#define VEC_hi_ymm9 ymm25 +#define VEC_hi_ymm10 ymm26 +#define VEC_hi_ymm11 ymm27 +#define VEC_hi_ymm12 ymm28 +#define VEC_hi_ymm13 ymm29 +#define VEC_hi_ymm14 ymm30 +#define VEC_hi_ymm15 ymm31 + +#define VEC_hi_zmm0 zmm16 +#define VEC_hi_zmm1 zmm17 +#define VEC_hi_zmm2 zmm18 +#define VEC_hi_zmm3 zmm19 +#define VEC_hi_zmm4 zmm20 +#define VEC_hi_zmm5 zmm21 +#define VEC_hi_zmm6 zmm22 +#define VEC_hi_zmm7 zmm23 +#define VEC_hi_zmm8 zmm24 +#define VEC_hi_zmm9 zmm25 +#define VEC_hi_zmm10 zmm26 +#define VEC_hi_zmm11 zmm27 +#define VEC_hi_zmm12 zmm28 +#define VEC_hi_zmm13 zmm29 +#define VEC_hi_zmm14 zmm30 +#define VEC_hi_zmm15 zmm31 + +#define PRIMITIVE_VEC(vec, num) vec##num + +#define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) +#define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) +#define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) + +#define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) +#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) +#define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) + +#endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
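Because the v3 headers are pure preprocessor, their effect can be sanity-checked from a plain C translation unit. A hedged sketch, assuming the sysdeps/x86_64/multiarch headers are reachable on the include path (illustrative only, not part of the patch):

#include <stdio.h>

/* Pick exactly one config; the VEC_SIZE guard in each header turns a
   double include into a compile error.  */
#include "sse2-vecs.h"	/* or "evex256-vecs.h" / "evex512-vecs.h" */

#define STR_(x) #x
#define STR(x) STR_(x)

int
main (void)
{
  /* Under sse2-vecs.h this prints "movups" and "xmm1"; under
     evex256-vecs.h it would print "vmovdqu64" and "ymm17".  */
  puts (STR (VMOVU));
  puts (STR (VEC (1)));
  printf ("VEC_SIZE=%d MOV_SIZE=%d RET_SIZE=%d\n",
	  VEC_SIZE, MOV_SIZE, RET_SIZE);
  return 0;
}

This is the sense in which a single implementation file can be "maintained to target multiple VEC classes": the body is written once against VEC(N)/VMOVU/SECTION, and the included config decides the mnemonics, registers, and section name.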
* [PATCH v3 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` 2022-06-03 23:49 ` [PATCH v3 " Noah Goldstein @ 2022-06-03 23:49 ` Noah Goldstein 2022-06-06 21:30 ` H.J. Lu 2022-06-03 23:49 ` [PATCH v3 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein ` (5 subsequent siblings) 6 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 23:49 UTC (permalink / raw) To: libc-alpha The RTM vzeroupper mitigation has no way of replacing inline vzeroupper not before a return. This code does not change any existing functionality. There is no difference in the objdump of libc.so before and after this patch. --- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 1 + sysdeps/x86_64/sysdep.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h index 3f531dd47f..6ca9f5e6ba 100644 --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -20,6 +20,7 @@ #ifndef _AVX_RTM_VECS_H #define _AVX_RTM_VECS_H 1 +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index f14d50786d..2cb31a558b 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -106,6 +106,22 @@ lose: \ vzeroupper; \ ret +/* Can be used to replace vzeroupper that is not directly before a + return. */ +#define COND_VZEROUPPER_XTEST \ + xtest; \ + jz 1f; \ + vzeroall; \ + jmp 2f; \ +1: \ + vzeroupper; \ +2: + +/* In RTM define this as COND_VZEROUPPER_XTEST. */ +#ifndef COND_VZEROUPPER +# define COND_VZEROUPPER vzeroupper +#endif + /* Zero upper vector registers and return. */ #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN # define ZERO_UPPER_VEC_REGISTERS_RETURN \ -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH v3 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` 2022-06-03 23:49 ` [PATCH v3 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein @ 2022-06-06 21:30 ` H.J. Lu 2022-06-06 22:38 ` Noah Goldstein 0 siblings, 1 reply; 82+ messages in thread From: H.J. Lu @ 2022-06-06 21:30 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Fri, Jun 3, 2022 at 4:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The RTM vzeroupper mitigation has no way of replacing inline > vzeroupper not before a return. > > This code does not change any existing functionality. > > There is no difference in the objdump of libc.so before and after this > patch. > --- > sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 1 + > sysdeps/x86_64/sysdep.h | 16 ++++++++++++++++ > 2 files changed, 17 insertions(+) > > diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > index 3f531dd47f..6ca9f5e6ba 100644 > --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > @@ -20,6 +20,7 @@ > #ifndef _AVX_RTM_VECS_H > #define _AVX_RTM_VECS_H 1 > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h > index f14d50786d..2cb31a558b 100644 > --- a/sysdeps/x86_64/sysdep.h > +++ b/sysdeps/x86_64/sysdep.h > @@ -106,6 +106,22 @@ lose: \ > vzeroupper; \ > ret > > +/* Can be used to replace vzeroupper that is not directly before a > + return. */ Please mention that it should be used to reduce the number of vzerouppers. > +#define COND_VZEROUPPER_XTEST \ > + xtest; \ > + jz 1f; \ > + vzeroall; \ > + jmp 2f; \ > +1: \ > + vzeroupper; \ > +2: > + > +/* In RTM define this as COND_VZEROUPPER_XTEST. */ > +#ifndef COND_VZEROUPPER > +# define COND_VZEROUPPER vzeroupper > +#endif > + > /* Zero upper vector registers and return. */ > #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN > # define ZERO_UPPER_VEC_REGISTERS_RETURN \ > -- > 2.34.1 > -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH v3 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` 2022-06-06 21:30 ` H.J. Lu @ 2022-06-06 22:38 ` Noah Goldstein 0 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-06 22:38 UTC (permalink / raw) To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 2:31 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Fri, Jun 3, 2022 at 4:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > The RTM vzeroupper mitigation has no way of replacing inline > > vzeroupper not before a return. > > > > This code does not change any existing functionality. > > > > There is no difference in the objdump of libc.so before and after this > > patch. > > --- > > sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 1 + > > sysdeps/x86_64/sysdep.h | 16 ++++++++++++++++ > > 2 files changed, 17 insertions(+) > > > > diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > index 3f531dd47f..6ca9f5e6ba 100644 > > --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > @@ -20,6 +20,7 @@ > > #ifndef _AVX_RTM_VECS_H > > #define _AVX_RTM_VECS_H 1 > > > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > > > diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h > > index f14d50786d..2cb31a558b 100644 > > --- a/sysdeps/x86_64/sysdep.h > > +++ b/sysdeps/x86_64/sysdep.h > > @@ -106,6 +106,22 @@ lose: \ > > vzeroupper; \ > > ret > > > > +/* Can be used to replace vzeroupper that is not directly before a > > + return. */ > > Please mention that it should be used to reduce the number of > vzerouppers. Fixed in V4 Made things more explicit in the comment and commit message. > > > +#define COND_VZEROUPPER_XTEST \ > > + xtest; \ > > + jz 1f; \ > > + vzeroall; \ > > + jmp 2f; \ > > +1: \ > > + vzeroupper; \ > > +2: > > + > > +/* In RTM define this as COND_VZEROUPPER_XTEST. */ > > +#ifndef COND_VZEROUPPER > > +# define COND_VZEROUPPER vzeroupper > > +#endif > > + > > /* Zero upper vector registers and return. */ > > #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN > > # define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > -- > > 2.34.1 > > > > > -- > H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
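To follow the control flow of COND_VZEROUPPER_XTEST above: `xtest` sets ZF when the processor is not in a transaction, so the `jz 1f` path runs `vzeroupper` during normal execution, and `vzeroall` is used inside an RTM transaction, mirroring the existing ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST mitigation. A C model of that branch structure (purely illustrative; the functions are invented stand-ins for the instructions):

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for xtest/vzeroall/vzeroupper; the real code is asm.  */
static bool in_rtm_transaction (void) { return false; }
static void vzeroall (void)   { puts ("vzeroall"); }
static void vzeroupper (void) { puts ("vzeroupper"); }

/* Models COND_VZEROUPPER_XTEST:
     xtest; jz 1f; vzeroall; jmp 2f; 1: vzeroupper; 2:  */
static void
cond_vzeroupper_xtest (void)
{
  if (in_rtm_transaction ())
    vzeroall ();	/* jz not taken: ZF clear inside a transaction.  */
  else
    vzeroupper ();	/* jz taken: ZF set outside a transaction.  */
}

int
main (void)
{
  cond_vzeroupper_xtest ();
  return 0;
}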
* [PATCH v3 3/8] Benchtests: Improve memrchr benchmarks 2022-06-03 23:49 ` [PATCH v3 " Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein @ 2022-06-03 23:49 ` Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein ` (4 subsequent siblings) 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 23:49 UTC (permalink / raw) To: libc-alpha Add a second iteration for memrchr to set `pos` starting from the end of the buffer. Previously `pos` was only set relative to the beginning of the buffer. This isn't really useful for memrchr because the beginning of the search space is (buf + len). --- benchtests/bench-memchr.c | 110 ++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 45 deletions(-) diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c index 4d7212332f..0facda2fa0 100644 --- a/benchtests/bench-memchr.c +++ b/benchtests/bench-memchr.c @@ -76,7 +76,7 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c, static void do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, - int seek_char) + int seek_char, int invert_pos) { size_t i; @@ -96,7 +96,10 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, if (pos < len) { - buf[align + pos] = seek_char; + if (invert_pos) + buf[align + len - pos] = seek_char; + else + buf[align + pos] = seek_char; buf[align + len] = -seek_char; } else @@ -109,6 +112,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, json_attr_uint (json_ctx, "pos", pos); json_attr_uint (json_ctx, "len", len); json_attr_uint (json_ctx, "seek_char", seek_char); + json_attr_uint (json_ctx, "invert_pos", invert_pos); json_array_begin (json_ctx, "timings"); @@ -123,6 +127,7 @@ int test_main (void) { size_t i; + int repeats; json_ctx_t json_ctx; test_init (); @@ -142,53 +147,68 @@ test_main (void) json_array_begin (&json_ctx, "results"); - for (i = 1; i < 8; ++i) + for (repeats = 0; repeats < 2; ++repeats) { - do_test (&json_ctx, 0, 16 << i, 2048, 23); - do_test (&json_ctx, i, 64, 256, 23); - do_test (&json_ctx, 0, 16 << i, 2048, 0); - do_test (&json_ctx, i, 64, 256, 0); - - do_test (&json_ctx, getpagesize () - 15, 64, 256, 0); + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats); + do_test (&json_ctx, i, 64, 256, 23, repeats); + do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats); + do_test (&json_ctx, i, 64, 256, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats); #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. */ - do_test (&json_ctx, 0, i, 256, 23); - do_test (&json_ctx, 0, i, 256, 0); - do_test (&json_ctx, i, i, 256, 23); - do_test (&json_ctx, i, i, 256, 0); + /* Also test the position close to the beginning for memrchr. 
*/ + do_test (&json_ctx, 0, i, 256, 23, repeats); + do_test (&json_ctx, 0, i, 256, 0, repeats); + do_test (&json_ctx, i, i, 256, 23, repeats); + do_test (&json_ctx, i, i, 256, 0, repeats); #endif - } - for (i = 1; i < 8; ++i) - { - do_test (&json_ctx, i, i << 5, 192, 23); - do_test (&json_ctx, i, i << 5, 192, 0); - do_test (&json_ctx, i, i << 5, 256, 23); - do_test (&json_ctx, i, i << 5, 256, 0); - do_test (&json_ctx, i, i << 5, 512, 23); - do_test (&json_ctx, i, i << 5, 512, 0); - - do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23); - } - for (i = 1; i < 32; ++i) - { - do_test (&json_ctx, 0, i, i + 1, 23); - do_test (&json_ctx, 0, i, i + 1, 0); - do_test (&json_ctx, i, i, i + 1, 23); - do_test (&json_ctx, i, i, i + 1, 0); - do_test (&json_ctx, 0, i, i - 1, 23); - do_test (&json_ctx, 0, i, i - 1, 0); - do_test (&json_ctx, i, i, i - 1, 23); - do_test (&json_ctx, i, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0); + } + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, i, i << 5, 192, 23, repeats); + do_test (&json_ctx, i, i << 5, 192, 0, repeats); + do_test (&json_ctx, i, i << 5, 256, 23, repeats); + do_test (&json_ctx, i, i << 5, 256, 0, repeats); + do_test (&json_ctx, i, i << 5, 512, 23, repeats); + do_test (&json_ctx, i, i << 5, 512, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats); + } + for (i = 1; i < 32; ++i) + { + do_test (&json_ctx, 0, i, i + 1, 23, repeats); + do_test (&json_ctx, 0, i, i + 1, 0, repeats); + do_test (&json_ctx, i, i, i + 1, 23, repeats); + do_test (&json_ctx, i, i, i + 1, 0, repeats); + do_test (&json_ctx, 0, i, i - 1, 23, repeats); + do_test (&json_ctx, 0, i, i - 1, 0, repeats); + do_test (&json_ctx, i, i, i - 1, 23, repeats); + do_test (&json_ctx, i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0, repeats); + #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. */ - do_test (&json_ctx, 0, 1, i + 1, 23); - do_test (&json_ctx, 0, 2, i + 1, 0); + do_test (&json_ctx, 0, 1, i + 1, 23, repeats); + do_test (&json_ctx, 0, 2, i + 1, 0, repeats); +#endif + } +#ifndef USE_AS_MEMRCHR + break; #endif } -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
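To make the new invert_pos knob concrete: with invert_pos=1 the seek_char lands pos bytes from the end of the buffer instead of pos bytes from the start, which is the interesting direction for memrchr since it searches backwards from (buf + len). A tiny standalone illustration of the indexing added to do_test (not benchmark code, just the placement arithmetic):

#include <stdio.h>

int
main (void)
{
  size_t align = 0, pos = 3, len = 16;

  /* Mirrors the two branches added to do_test above.  */
  printf ("invert_pos=0: seek_char at buf[%zu] (from the start)\n",
	  align + pos);
  printf ("invert_pos=1: seek_char at buf[%zu] (from the end)\n",
	  align + len - pos);
  return 0;
}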
* [PATCH v3 4/8] x86: Optimize memrchr-sse2.S 2022-06-03 23:49 ` [PATCH v3 " Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein @ 2022-06-03 23:49 ` Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 5/8] x86: Optimize memrchr-evex.S Noah Goldstein ` (3 subsequent siblings) 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 23:49 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller lengths more. 2. optimizes target placement more carefully. 3. reuses logic more. 4. fixes up various inefficiencies in the logic. The total code size saving is: 394 bytes Geometric Mean of all benchmarks New / Old: 0.874 Regressions: 1. The page cross case is now colder, especially re-entry from the page cross case if a match is not found in the first VEC (roughly 50%). My general opinion with this patch is this is acceptable given the "coldness" of this case (less than 4%) and the general performance improvement in the other, far more common cases. 2. There are some 5-15% regressions for medium/large user-arg lengths that have a match in the first VEC. This is because the logic was rewritten to optimize finds in the first VEC if the user-arg length is shorter (where we see roughly 20-50% performance improvements). It is not always the case this is a regression. My intuition is that some frontend quirk partially explains the data, although I haven't been able to find the root cause. Full xcheck passes on x86_64. --- sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++-------------------- 1 file changed, 292 insertions(+), 321 deletions(-) diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index d1a9f47911..b0dffd2ae2 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -18,362 +18,333 @@ <https://www.gnu.org/licenses/>. */ #include <sysdep.h> +#define VEC_SIZE 16 +#define PAGE_SIZE 4096 .text -ENTRY (__memrchr) - movd %esi, %xmm1 - - sub $16, %RDX_LP - jbe L(length_less16) - - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add %RDX_LP, %RDI_LP - pshufd $0, %xmm1, %xmm1 - - movdqu (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - -/* Check if there is a match. 
*/ - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - mov %edi, %ecx - and $15, %ecx - jz L(loop_prolog) - - add $16, %rdi - add $16, %rdx - and $-16, %rdi - sub %rcx, %rdx - - .p2align 4 -L(loop_prolog): - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches0) - - mov %edi, %ecx - and $63, %ecx - jz L(align64_loop) - - add $64, %rdi - add $64, %rdx - and $-64, %rdi - sub %rcx, %rdx - - .p2align 4 -L(align64_loop): - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm3, %xmm0 - pmaxub %xmm4, %xmm2 - pmaxub %xmm0, %xmm2 - pmovmskb %xmm2, %eax - - test %eax, %eax - jz L(align64_loop) - - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches48) - - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm2 - - pcmpeqb %xmm1, %xmm2 - pcmpeqb (%rdi), %xmm1 - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - pmovmskb %xmm1, %eax - bsr %eax, %eax - - add %rdi, %rax +ENTRY_P2ALIGN(__memrchr, 6) +#ifdef __ILP32__ + /* Clear upper bits. */ + mov %RDX_LP, %RDX_LP +#endif + movd %esi, %xmm0 + + /* Get end pointer. */ + leaq (%rdx, %rdi), %rcx + + punpcklbw %xmm0, %xmm0 + punpcklwd %xmm0, %xmm0 + pshufd $0, %xmm0, %xmm0 + + /* Check if we can load 1x VEC without crossing a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %ecx + jz L(page_cross) + + /* NB: This load happens regardless of whether rdx (len) is zero. Since + it doesn't cross a page and the standard guarantees any pointer has + at least one valid byte, this load must be safe. For the entire + history of the x86 memrchr implementation this has been possible so + no code "should" be relying on a zero-length check before this load. + The zero-length check is moved to the page cross case because it is + 1) pretty cold and 2) including it pushes the hot case len <= VEC_SIZE + into 2 cache lines. */ + movups -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is + zero. */ + bsrl %eax, %eax + jz L(ret_0) + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here + if out of bounds. */ + addl %edx, %eax + jl L(zero_0) + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base + ptr. 
*/ + addq %rdi, %rax +L(ret_0): ret - .p2align 4 -L(exit_loop): - add $64, %edx - cmp $32, %edx - jbe L(exit_loop_32) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16_1) - cmp $48, %edx - jbe L(return_null) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches0_1) - xor %eax, %eax + .p2align 4,, 5 +L(ret_vec_x0): + bsrl %eax, %eax + leaq -(VEC_SIZE)(%rcx, %rax), %rax ret - .p2align 4 -L(exit_loop_32): - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48_1) - cmp $16, %edx - jbe L(return_null) - - pcmpeqb 32(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches32_1) - xor %eax, %eax + .p2align 4,, 2 +L(zero_0): + xorl %eax, %eax ret - .p2align 4 -L(matches0): - bsr %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax - ret - .p2align 4 -L(matches32): - bsr %eax, %eax - lea 32(%rax, %rdi), %rax + .p2align 4,, 8 +L(more_1x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) + + /* Align rcx (pointer to string). */ + decq %rcx + andq $-VEC_SIZE, %rcx + + movq %rcx, %rdx + /* NB: We could consistently save 1 byte in this pattern with `movaps + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is + it adds more frontend uops (even if the moves can be eliminated) and + some percentage of the time actual backend uops. */ + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + subq %rdi, %rdx + pmovmskb %xmm1, %eax + + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +L(last_2x_vec): + subl $VEC_SIZE, %edx + jbe L(ret_vec_x0_test) + + testl %eax, %eax + jnz L(ret_vec_x0) + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_1) + addl %edx, %eax + jl L(zero_0) + addq %rdi, %rax +L(ret_1): ret - .p2align 4 -L(matches48): - bsr %eax, %eax - lea 48(%rax, %rdi), %rax + /* Don't align. Otherwise we lose the 2-byte encoding in the jump to + L(page_cross), which causes the hot path (length <= VEC_SIZE) to span + multiple cache lines. Naturally aligned % 16 to 8 bytes. */ +L(page_cross): + /* Zero length check. */ + testq %rdx, %rdx + jz L(zero_0) + + leaq -1(%rcx), %r8 + andq $-(VEC_SIZE), %r8 + + movaps (%r8), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + negl %ecx + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count + explicitly. */ + andl $(VEC_SIZE - 1), %ecx + shl %cl, %esi + movzwl %si, %eax + leaq (%rdi, %rdx), %rcx + cmpq %rdi, %r8 + ja L(more_1x_vec) + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_2) + addl %edx, %eax + jl L(zero_1) + addq %rdi, %rax +L(ret_2): ret - .p2align 4 -L(matches0_1): - bsr %eax, %eax - sub $64, %rdx - add %rax, %rdx - jl L(return_null) - add %rdi, %rax + /* Fits in aligning bytes. 
*/ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(matches16_1): - bsr %eax, %eax - sub $48, %rdx - add %rax, %rdx - jl L(return_null) - lea 16(%rdi, %rax), %rax + .p2align 4,, 5 +L(ret_vec_x1): + bsrl %eax, %eax + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(matches32_1): - bsr %eax, %eax - sub $32, %rdx - add %rax, %rdx - jl L(return_null) - lea 32(%rdi, %rax), %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) - .p2align 4 -L(matches48_1): - bsr %eax, %eax - sub $16, %rdx - add %rax, %rdx - jl L(return_null) - lea 48(%rdi, %rax), %rax - ret + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + testl %eax, %eax + jnz L(ret_vec_x1) - .p2align 4 -L(return_null): - xor %eax, %eax - ret - .p2align 4 -L(length_less16_offset0): - test %edx, %edx - jz L(return_null) + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - mov %dl, %cl - pcmpeqb (%rdi), %xmm1 + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) - mov $1, %edx - sal %cl, %edx - sub $1, %edx + addl $(VEC_SIZE), %edx + jle L(ret_vec_x2_test) - pmovmskb %xmm1, %eax +L(last_vec): + testl %eax, %eax + jnz L(ret_vec_x2) - and %edx, %eax - test %eax, %eax - jz L(return_null) + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - bsr %eax, %eax - add %rdi, %rax + subl $(VEC_SIZE), %edx + bsrl %eax, %eax + jz L(ret_3) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax +L(ret_3): ret - .p2align 4 -L(length_less16): - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add $16, %edx - - pshufd $0, %xmm1, %xmm1 - - mov %edi, %ecx - and $15, %ecx - jz L(length_less16_offset0) - - mov %cl, %dh - mov %ecx, %esi - add %dl, %dh - and $-16, %rdi - - sub $16, %dh - ja L(length_less16_part2) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - - sar %cl, %eax - mov %dl, %cl - - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax - test %eax, %eax - jz L(return_null) - - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 6 +L(ret_vec_x2_test): + bsrl %eax, %eax + jz L(zero_2) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax ret - .p2align 4 -L(length_less16_part2): - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - - mov %dh, %cl - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax +L(zero_2): + xorl %eax, %eax + ret - test %eax, %eax - jnz L(length_less16_part2_return) - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax + .p2align 4,, 5 +L(ret_vec_x2): + bsrl %eax, %eax + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - mov %esi, %ecx - sar %cl, %eax - test %eax, %eax - jz L(return_null) + .p2align 4,, 5 +L(ret_vec_x3): + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 8 +L(more_4x_vec): + testl %eax, %eax + jnz L(ret_vec_x2) + + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_x3) + + addq $-(VEC_SIZE * 4), %rcx + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) + + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end + keeping the code from spilling to the next cache line. 
*/ + addq $(VEC_SIZE * 4 - 1), %rcx + andq $-(VEC_SIZE * 4), %rcx + leaq (VEC_SIZE * 4)(%rdi), %rdx + andq $-(VEC_SIZE * 4), %rdx + + .p2align 4,, 11 +L(loop_4x_vec): + movaps (VEC_SIZE * -1)(%rcx), %xmm1 + movaps (VEC_SIZE * -2)(%rcx), %xmm2 + movaps (VEC_SIZE * -3)(%rcx), %xmm3 + movaps (VEC_SIZE * -4)(%rcx), %xmm4 + pcmpeqb %xmm0, %xmm1 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm0, %xmm4 + + por %xmm1, %xmm2 + por %xmm3, %xmm4 + por %xmm2, %xmm4 + + pmovmskb %xmm4, %esi + testl %esi, %esi + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rcx + cmpq %rdx, %rcx + jne L(loop_4x_vec) + + subl %edi, %edx + + /* Ends up being 1-byte nop. */ + .p2align 4,, 2 +L(last_4x_vec): + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + testl %eax, %eax + jnz L(ret_vec_x0) + + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_end) + + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $(VEC_SIZE * 3), %edx + ja L(last_vec) + bsrl %eax, %eax + jz L(ret_4) + addl %edx, %eax + jl L(zero_3) + addq %rdi, %rax +L(ret_4): ret - .p2align 4 -L(length_less16_part2_return): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax + /* Ends up being 1-byte nop. */ + .p2align 4,, 3 +L(loop_end): + pmovmskb %xmm1, %eax + sall $16, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm2, %eax + testl %eax, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm3, %eax + /* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If eax is non-zero + then CHAR is in VEC3 and bsrl will use that position. */ + sall $16, %eax + orl %esi, %eax + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax ret -END (__memrchr) +L(ret_vec_end): + bsrl %eax, %eax + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax + ret + /* Use in L(last_4x_vec). In the same cache line. These are just spare + aligning bytes. */ +L(zero_3): + xorl %eax, %eax + ret + /* 2 bytes from next cache line. */ +END(__memrchr) weak_alias (__memrchr, memrchr) -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
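The entry sequence of the new memrchr-sse2 leans on one observation: a VEC_SIZE load ending at the end pointer stays within one page whenever end & (PAGE_SIZE - VEC_SIZE) is non-zero, which is what the `testl $(PAGE_SIZE - VEC_SIZE), %ecx; jz L(page_cross)` pair checks, conservatively treating a zero result as a possible crossing. A small C sketch of the predicate (illustrative only, not glibc code):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 16

/* Mirrors `testl $(PAGE_SIZE - VEC_SIZE), %ecx; jz L(page_cross)`:
   if this is zero, end % PAGE_SIZE < VEC_SIZE and the 16-byte load
   at [end - VEC_SIZE, end) might straddle a page boundary.  */
static int
may_cross_page (uintptr_t end)
{
  return (end & (PAGE_SIZE - VEC_SIZE)) == 0;
}

int
main (void)
{
  printf ("%d\n", may_cross_page (0x1008)); /* 1: ends 8 bytes into a page. */
  printf ("%d\n", may_cross_page (0x1040)); /* 0: load is page-internal. */
  return 0;
}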
* [PATCH v3 5/8] x86: Optimize memrchr-evex.S 2022-06-03 23:49 ` [PATCH v3 " Noah Goldstein ` (2 preceding siblings ...) 2022-06-03 23:49 ` [PATCH v3 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein @ 2022-06-03 23:49 ` Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein ` (2 subsequent siblings) 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 23:49 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller user-arg lengths more. 2. optimizes target placement more carefully 3. reuses logic more 4. fixes up various inefficiencies in the logic. The biggest case here is the `lzcnt` logic for checking returns which saves either a branch or multiple instructions. The total code size saving is: 263 bytes Geometric Mean of all benchmarks New / Old: 0.755 Regressions: There are some regressions. Particularly where the length (user arg length) is large but the position of the match char is near the begining of the string (in first VEC). This case has roughly a 20% regression. This is because the new logic gives the hot path for immediate matches to shorter lengths (the more common input). This case has roughly a 35% speedup. Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++------------ 1 file changed, 268 insertions(+), 271 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S index 0b99709c6b..ad541c0e50 100644 --- a/sysdeps/x86_64/multiarch/memrchr-evex.S +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S @@ -19,319 +19,316 @@ #if IS_IN (libc) # include <sysdep.h> +# include "evex256-vecs.h" +# if VEC_SIZE != 32 +# error "VEC_SIZE != 32 unimplemented" +# endif + +# ifndef MEMRCHR +# define MEMRCHR __memrchr_evex +# endif + +# define PAGE_SIZE 4096 +# define VECMATCH VEC(0) + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN(MEMRCHR, 6) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) + + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up end ptr to be + subtract by lzcnt aligned. */ + leaq -1(%rdi, %rdx), %rax + vpbroadcastb %esi, %VECMATCH + + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + + /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which + will gurantee edx (len) is less than it. */ + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret -# define VMOVA vmovdqa64 - -# define YMMMATCH ymm16 - -# define VEC_SIZE 32 - - .section .text.evex,"ax",@progbits -ENTRY (__memrchr_evex) - /* Broadcast CHAR to YMMMATCH. */ - vpbroadcastb %esi, %YMMMATCH - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP - - /* Check the last VEC_SIZE bytes. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) - - /* Align data for aligned loads in the loop. 
*/ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx - - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 - kord %k1, %k2, %k5 - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 - - kord %k3, %k4, %k6 - kortestd %k5, %k6 - jz L(loop_4x_vec) - - /* There is a match. */ - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - kmovd %k1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0_dec): + decq %rax +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax ret - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) + /* Align rax (pointer to string). */ + andq $-VEC_SIZE, %rax - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) + /* Recompute length after aligning. */ + movq %rax, %rdx - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - ret + subq %rdi, %rdx - .p2align 4 + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) + + /* Must dec rax because L(ret_vec_x0_test) expects it. 
*/ + decq %rax cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax ret - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which inturn in necessray for hot path (len <= VEC_SIZE) to fit + in first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpb $0, (%rsi), %VECMATCH, %k0 + kmovd %k0, %r8d + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %ecx + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ + notl %ecx + shlxl %ecx, %r8d, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_1) + subq %rcx, %rax ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax + /* Continue creating zero labels that fit in aligning bytes and get + 2-byte encoding / are in the same cache line as condition. */ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + .p2align 4,, 8 +L(ret_vec_x1): + /* This will naturally add 32 to position. */ + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - ret + vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - .p2align 4 -L(zero): - xorl %eax, %eax + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx - - kmovd %k1, %eax - - /* Remove the trailing bytes. 
*/ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 8 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx - - /* Check for zero length. */ - testl %edx, %edx - jz L(zero) - - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) - - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + .p2align 4,, 8 +L(ret_vec_x2): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + .p2align 4,, 8 +L(ret_vec_x3): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - /* Check the last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax + .p2align 4,, 8 +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + testl %ecx, %ecx + jnz L(ret_vec_x3) - andl %edx, %eax - testl %eax, %eax - jz L(zero) + /* Check if near end before re-aligning (otherwise might do an + unnecissary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - ret + decq %rax + andq $-(VEC_SIZE * 4), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + andq $-(VEC_SIZE * 4), %rdx .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx - - /* Check the last VEC. */ - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +L(loop_4x_vec): + /* Store 1 were not-equals and 0 where equals in k1 (used to mask later + on). */ + vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1 + + /* VEC(2/3) will have zero-byte where we found a CHAR. */ + vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2) + vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3) + vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4 + + /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where + CHAR is found and VEC(2/3) have zero-byte where CHAR is found. */ + vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z} + vptestnmb %VEC(3), %VEC(3), %k2 + + /* Any 1s and we found CHAR. */ + kortestd %k2, %k4 + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) + + /* Need to re-adjust rdx / rax for L(last_4x_vec). */ + subq $-(VEC_SIZE * 4), %rdx + movq %rdx, %rax + subl %edi, %edx +L(last_4x_vec): + + /* Used no matter what. */ + vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - kmovd %k1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jnz L(last_vec_x1) + vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - /* Check the second last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 + testl %ecx, %ecx + jnz L(ret_vec_x1) - movl %r8d, %ecx + /* Used no matter what. */ + vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - kmovd %k1, %eax + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. 
*/ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret_1) + xorl %eax, %eax +L(ret_1): + ret - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 6 +L(loop_end): + kmovd %k1, %ecx + notl %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vptestnmb %VEC(2), %VEC(2), %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + kmovd %k2, %ecx + kmovd %k4, %esi + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + addq %rcx, %rax + ret + .p2align 4,, 4 +L(ret_vec_x0_end): + addq $(VEC_SIZE), %rax +L(ret_vec_x1_end): + bsrl %ecx, %ecx + leaq (VEC_SIZE * 2)(%rax, %rcx), %rax ret -END (__memrchr_evex) + +END(MEMRCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
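The lzcnt return idiom called out in the commit message relies on two things: the end pointer is biased by -1 at entry, and lzcnt of a zero mask yields 32 (VEC_SIZE), so the length comparison doubles as the no-match check. A rough C equivalent, assuming a 32-bit match mask whose bit 31 corresponds to the last byte of the buffer (names are illustrative, not from the patch):

```
#include <stddef.h>
#include <stdint.h>

/* Sketch of L(ret_vec_x0_test).  endp points at the LAST byte of the
   buffer (the -1 bias in the asm), mask covers the 32 bytes ending at
   endp (bit 31 == byte at endp), len is the remaining length.  The
   asm does this in three instructions: lzcnt, cmp/jle, sub.  */
static void *
ret_vec_test (unsigned char *endp, uint32_t mask, size_t len)
{
  /* Hardware lzcnt gives 32 for a zero input; __builtin_clz (0) is
     undefined in C, hence the explicit guard here.  A zero mask thus
     always fails the bounds check -- no separate match test.  */
  unsigned dist = mask ? __builtin_clz (mask) : 32;
  if ((size_t) dist >= len)	/* Match, if any, precedes the buffer.  */
    return NULL;
  return endp - dist;
}
```

This is the saving the commit message describes: the old code needed a `test`/`jz` on the mask plus a separate length adjustment, while the new code folds both into the one `cmp` that was needed anyway.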
* [PATCH v3 6/8] x86: Optimize memrchr-avx2.S 2022-06-03 23:49 ` [PATCH v3 " Noah Goldstein ` (3 preceding siblings ...) 2022-06-03 23:49 ` [PATCH v3 5/8] x86: Optimize memrchr-evex.S Noah Goldstein @ 2022-06-03 23:49 ` Noah Goldstein 2022-06-03 23:50 ` [PATCH v3 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein 2022-06-03 23:50 ` [PATCH v3 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 23:49 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller user-arg lengths more. 2. optimizes target placement more carefully 3. reuses logic more 4. fixes up various inefficiencies in the logic. The biggest case here is the `lzcnt` logic for checking returns which saves either a branch or multiple instructions. The total code size saving is: 306 bytes Geometric Mean of all benchmarks New / Old: 0.760 Regressions: There are some regressions. Particularly where the length (user arg length) is large but the position of the match char is near the begining of the string (in first VEC). This case has roughly a 10-20% regression. This is because the new logic gives the hot path for immediate matches to shorter lengths (the more common input). This case has roughly a 15-45% speedup. Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memrchr-avx2.S | 538 ++++++++++---------- 2 files changed, 260 insertions(+), 279 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S index cea2d2a72d..5e9beeeef2 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMRCHR __memrchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index ba2ce7cb03..6915e1c373 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -21,340 +21,320 @@ # include <sysdep.h> # ifndef MEMRCHR -# define MEMRCHR __memrchr_avx2 +# define MEMRCHR __memrchr_avx2 # endif # ifndef VZEROUPPER -# define VZEROUPPER vzeroupper +# define VZEROUPPER vzeroupper # endif +// abf-off # ifndef SECTION # define SECTION(p) p##.avx # endif +// abf-on + +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + .section SECTION(.text), "ax", @progbits +ENTRY(MEMRCHR) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) -# define VEC_SIZE 32 - - .section SECTION(.text),"ax",@progbits -ENTRY (MEMRCHR) - /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 - vpbroadcastb %xmm0, %ymm0 - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up end ptr to be + subtract by lzcnt aligned. */ + leaq -1(%rdx, %rdi), %rax - /* Check the last VEC_SIZE bytes. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) + vpbroadcastb %xmm0, %ymm0 - /* Align data for aligned loads in the loop. 
*/ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) + +L(ret_vec_x0_test): + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which + will gurantee edx (len) is less than it. */ + lzcntl %ecx, %ecx + + /* Hoist vzeroupper (not great for RTM) to save code size. This allows + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vmovdqa (%rdi), %ymm1 - vmovdqa VEC_SIZE(%rdi), %ymm2 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 - - vpcmpeqb %ymm1, %ymm0, %ymm1 - vpcmpeqb %ymm2, %ymm0, %ymm2 - vpcmpeqb %ymm3, %ymm0, %ymm3 - vpcmpeqb %ymm4, %ymm0, %ymm4 - - vpor %ymm1, %ymm2, %ymm5 - vpor %ymm3, %ymm4, %ymm6 - vpor %ymm5, %ymm6, %ymm5 - - vpmovmskb %ymm5, %eax - testl %eax, %eax - jz L(loop_4x_vec) - - /* There is a match. */ - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpmovmskb %ymm1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax L(return_vzeroupper): ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) - - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Align rax (string pointer). 
*/ + andq $-VEC_SIZE, %rax + + /* Recompute remaining length after aligning. */ + movq %rax, %rdx + /* Need this comparison next no matter what. */ + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 + subq %rdi, %rdx + decq %rax + vpmovmskb %ymm1, %ecx + /* Fall through for short (hotter than length). */ + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which inturn in necessray for hot path (len <= VEC_SIZE) to fit + in first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpeqb (%rsi), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %r8d + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ + notl %r8d + shlxl %r8d, %ecx, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret + .p2align 4,, 11 +L(ret_vec_x1): + /* This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + subq %rcx, %rax VZEROUPPER_RETURN + .p2align 4,, 10 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - VZEROUPPER_RETURN + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - .p2align 4 -L(zero): - xorl %eax, %eax - VZEROUPPER_RETURN + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) + +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - .p2align 4 -L(null): + /* First in aligning bytes. 
*/ +L(zero_2): xorl %eax, %eax ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx + .p2align 4,, 4 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - vpcmpeqb (%rdi), %ymm0, %ymm1 - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx + .p2align 4,, 11 +L(ret_vec_x2): + /* ecx must be non-zero. */ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - vpmovmskb %ymm1, %eax + .p2align 4,, 14 +L(ret_vec_x3): + /* ecx must be non-zero. */ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Check for zero length. */ - testl %edx, %edx - jz L(null) + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) + testl %ecx, %ecx + jnz L(ret_vec_x3) - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + /* Check if near end before re-aligning (otherwise might do an + unnecissary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + /* Align rax to (VEC_SIZE - 1). */ + orq $(VEC_SIZE * 4 - 1), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + orq $(VEC_SIZE * 4 - 1), %rdx - /* Check the last VEC. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + .p2align 4 +L(loop_4x_vec): + /* Need this comparison next no matter what. */ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 - movl $1, %edx - sall %cl, %edx - subl $1, %edx + vpor %ymm1, %ymm2, %ymm2 + vpor %ymm3, %ymm4, %ymm4 + vpor %ymm2, %ymm4, %ymm4 + vpmovmskb %ymm4, %esi - andl %edx, %eax - testl %eax, %eax - jz L(zero) + testl %esi, %esi + jnz L(loop_end) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - VZEROUPPER_RETURN + addq $(VEC_SIZE * -4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) - .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx + subl %edi, %edx + incl %edx - /* Check the last VEC. */ - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 +L(last_4x_vec): + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - vpmovmskb %ymm1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_end) - /* Remove the trailing bytes. */ - andl %edx, %eax + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) - testl %eax, %eax - jnz L(last_vec_x1) + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - /* Check the second last VEC. 
*/ - vpcmpeqb (%rdi), %ymm0, %ymm1 + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) + + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret0) + xorl %eax, %eax +L(ret0): + ret - movl %r8d, %ecx - vpmovmskb %ymm1, %eax + .p2align 4 +L(loop_end): + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vpmovmskb %ymm2, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + vpmovmskb %ymm3, %ecx + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. */ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + .p2align 4,, 4 +L(ret_vec_x1_end): + /* 64-bit version will automatically add 32 (VEC_SIZE). */ + lzcntq %rcx, %rcx + subq %rcx, %rax + VZEROUPPER_RETURN - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 4 +L(ret_vec_x0_end): + lzcntl %ecx, %ecx + subq %rcx, %rax VZEROUPPER_RETURN -END (MEMRCHR) + + /* 2 bytes until next cache line. */ +END(MEMRCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
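The `notl` + `shlx` fixup in L(page_cross) uses the identity -x = ~(x - 1): because shlx takes its shift count mod 32, complementing the already-biased end pointer directly yields the count that discards mask bits past the buffer end while preserving the lzcnt-from-the-end invariant the return paths depend on. A C sketch under those assumptions (names are illustrative):

```
#include <stdint.h>

/* Sketch of the page-cross mask fixup.  base = endp_m1 & -32 is the
   aligned load address; mask has bit i set if byte base + i matched.
   Bits for bytes past endp_m1 (the biased end pointer, i.e. the last
   valid byte) must be shifted out, and byte endp_m1 must end up at
   bit 31 so lzcnt still measures distance from the end.  */
static uint32_t
shift_out_tail (uint32_t mask, uintptr_t endp_m1)
{
  /* -endp == ~(endp - 1) == ~endp_m1, and shlx masks the count to 5
     bits, so the asm is just notl + shlxl.  */
  unsigned shift = (unsigned) (~endp_m1) & 31;
  return mask << shift;		/* Byte endp_m1 now sits at bit 31.  */
}
```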
* [PATCH v3 7/8] x86: Shrink code size of memchr-avx2.S 2022-06-03 23:49 ` [PATCH v3 " Noah Goldstein ` (4 preceding siblings ...) 2022-06-03 23:49 ` [PATCH v3 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein @ 2022-06-03 23:50 ` Noah Goldstein 2022-06-03 23:50 ` [PATCH v3 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 23:50 UTC (permalink / raw) To: libc-alpha This is not meant as a performance optimization. The previous code was far to liberal in aligning targets and wasted code size unnecissarily. The total code size saving is: 59 bytes There are no major changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 0.967 Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memchr-avx2.S | 109 +++++++++++---------- 2 files changed, 60 insertions(+), 50 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S index 87b076c7c4..c4d71938c5 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMCHR __memchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index 75bd7262e0..28a01280ec 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -57,7 +57,7 @@ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 5) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ # ifdef __ILP32__ @@ -87,12 +87,14 @@ ENTRY (MEMCHR) # endif testl %eax, %eax jz L(aligned_more) - tzcntl %eax, %eax + bsfl %eax, %eax addq %rdi, %rax - VZEROUPPER_RETURN +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + # ifndef USE_AS_RAWMEMCHR - .p2align 5 + .p2align 4 L(first_vec_x0): /* Check if first match was before length. */ tzcntl %eax, %eax @@ -100,58 +102,31 @@ L(first_vec_x0): /* NB: Multiply length by 4 to get byte count. */ sall $2, %edx # endif - xorl %ecx, %ecx + COND_VZEROUPPER + /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch + block. branch here as opposed to cmovcc is not that costly. Common + usage of memchr is to check if the return was NULL (if string was + known to contain CHAR user would use rawmemchr). This branch will be + highly correlated with the user branch and can be used by most + modern branch predictors to predict the user branch. */ cmpl %eax, %edx - leaq (%rdi, %rax), %rax - cmovle %rcx, %rax - VZEROUPPER_RETURN - -L(null): - xorl %eax, %eax - ret -# endif - .p2align 4 -L(cross_page_boundary): - /* Save pointer before aligning as its original value is - necessary for computer return address if byte is found or - adjusting length if it is not and this is memchr. */ - movq %rdi, %rcx - /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr - and rdi for rawmemchr. */ - orq $(VEC_SIZE - 1), %ALGN_PTR_REG - VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax -# ifndef USE_AS_RAWMEMCHR - /* Calculate length until end of page (length checked for a - match). */ - leaq 1(%ALGN_PTR_REG), %rsi - subq %RRAW_PTR_REG, %rsi -# ifdef USE_AS_WMEMCHR - /* NB: Divide bytes by 4 to get wchar_t count. */ - shrl $2, %esi -# endif -# endif - /* Remove the leading bytes. 
*/ - sarxl %ERAW_PTR_REG, %eax, %eax -# ifndef USE_AS_RAWMEMCHR - /* Check the end of data. */ - cmpq %rsi, %rdx - jbe L(first_vec_x0) + jle L(null) + addq %rdi, %rax + ret # endif - testl %eax, %eax - jz L(cross_page_continue) - tzcntl %eax, %eax - addq %RRAW_PTR_REG, %rax -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 + .p2align 4,, 10 L(first_vec_x1): - tzcntl %eax, %eax + bsfl %eax, %eax incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - +# ifndef USE_AS_RAWMEMCHR + /* First in aligning bytes here. */ +L(null): + xorl %eax, %eax + ret +# endif .p2align 4 L(first_vec_x2): tzcntl %eax, %eax @@ -340,7 +315,7 @@ L(first_vec_x1_check): incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - .p2align 4 + .p2align 4,, 6 L(set_zero_end): xorl %eax, %eax VZEROUPPER_RETURN @@ -428,5 +403,39 @@ L(last_vec_x3): VZEROUPPER_RETURN # endif + .p2align 4 +L(cross_page_boundary): + /* Save pointer before aligning as its original value is necessary for + computer return address if byte is found or adjusting length if it + is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for + rawmemchr. */ + andq $-VEC_SIZE, %ALGN_PTR_REG + VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Calculate length until end of page (length checked for a match). */ + leal VEC_SIZE(%ALGN_PTR_REG), %esi + subl %ERAW_PTR_REG, %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %esi +# endif +# endif + /* Remove the leading bytes. */ + sarxl %ERAW_PTR_REG, %eax, %eax +# ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ + cmpq %rsi, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(cross_page_continue) + bsfl %eax, %eax + addq %RRAW_PTR_REG, %rax + VZEROUPPER_RETURN + + END (MEMCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
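The cmov-to-branch change in L(first_vec_x0) trades a branchless select for a branch that mirrors the caller's own NULL test. A hedged C sketch of the two shapes — the real selection happens in asm, and a compiler is of course free to pick either form for code like this:

```
#include <stddef.h>

/* 'pos' is the tzcnt of the match mask within the first vector,
   'len' the user length; names are illustrative.  */

/* Old shape: branchless select (xor + cmovle in the asm).  It pays
   the extra zeroing and select cost on every call, hit or miss.  */
static char *
select_cmov (char *buf, unsigned pos, unsigned len)
{
  char *hit = buf + pos;
  char *res = NULL;
  res = (pos < len) ? hit : res;	/* cmovle in the old code.  */
  return res;
}

/* New shape: a plain branch.  It mirrors the caller's own
   'if (memchr (...) != NULL)' test, so per the patch comment the
   predictor state for the caller's branch covers this one too.  */
static char *
select_branch (char *buf, unsigned pos, unsigned len)
{
  if (pos >= len)
    return NULL;
  return buf + pos;
}
```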
* [PATCH v3 8/8] x86: Shrink code size of memchr-evex.S 2022-06-03 23:49 ` [PATCH v3 " Noah Goldstein ` (5 preceding siblings ...) 2022-06-03 23:50 ` [PATCH v3 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein @ 2022-06-03 23:50 ` Noah Goldstein 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 23:50 UTC (permalink / raw) To: libc-alpha This is not meant as a performance optimization. The previous code was far to liberal in aligning targets and wasted code size unnecissarily. The total code size saving is: 64 bytes There are no non-negligible changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 1.000 Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memchr-evex.S | 46 ++++++++++++++------------ 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S index cfaf02907d..0fd11b7632 100644 --- a/sysdeps/x86_64/multiarch/memchr-evex.S +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -88,7 +88,7 @@ # define PAGE_SIZE 4096 .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 6) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ test %RDX_LP, %RDX_LP @@ -131,22 +131,24 @@ L(zero): xorl %eax, %eax ret - .p2align 5 + .p2align 4 L(first_vec_x0): - /* Check if first match was before length. */ - tzcntl %eax, %eax - xorl %ecx, %ecx - cmpl %eax, %edx - leaq (%rdi, %rax, CHAR_SIZE), %rax - cmovle %rcx, %rax + /* Check if first match was before length. NB: tzcnt has false data- + dependency on destination. eax already had a data-dependency on esi + so this should have no affect here. */ + tzcntl %eax, %esi +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rsi, CHAR_SIZE), %rdi +# else + addq %rsi, %rdi +# endif + xorl %eax, %eax + cmpl %esi, %edx + cmovg %rdi, %rax ret -# else - /* NB: first_vec_x0 is 17 bytes which will leave - cross_page_boundary (which is relatively cold) close enough - to ideal alignment. So only realign L(cross_page_boundary) if - rawmemchr. */ - .p2align 4 # endif + + .p2align 4 L(cross_page_boundary): /* Save pointer before aligning as its original value is necessary for computer return address if byte is found or @@ -400,10 +402,14 @@ L(last_2x_vec): L(zero_end): ret +L(set_zero_end): + xorl %eax, %eax + ret .p2align 4 L(first_vec_x1_check): - tzcntl %eax, %eax + /* eax must be non-zero. Use bsfl to save code size. */ + bsfl %eax, %eax /* Adjust length. */ subl $-(CHAR_PER_VEC * 4), %edx /* Check if match within remaining length. */ @@ -412,9 +418,6 @@ L(first_vec_x1_check): /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ret -L(set_zero_end): - xorl %eax, %eax - ret .p2align 4 L(loop_4x_vec_end): @@ -464,7 +467,7 @@ L(loop_4x_vec_end): # endif ret - .p2align 4 + .p2align 4,, 10 L(last_vec_x1_return): tzcntl %eax, %eax # if defined USE_AS_WMEMCHR || RET_OFFSET != 0 @@ -496,6 +499,7 @@ L(last_vec_x3_return): # endif # ifndef USE_AS_RAWMEMCHR + .p2align 4,, 5 L(last_4x_vec_or_less_cmpeq): VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 kmovd %k0, %eax @@ -546,7 +550,7 @@ L(last_4x_vec): # endif andl %ecx, %eax jz L(zero_end2) - tzcntl %eax, %eax + bsfl %eax, %eax leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax L(zero_end2): ret @@ -562,6 +566,6 @@ L(last_vec_x3): leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ret # endif - + /* 7 bytes from next cache line. */ END (MEMCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
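The tzcnt-to-bsf swaps in this and the previous patch lean on two facts: bsf and tzcnt return the same result for non-zero input, and tzcnt is encoded as `rep bsf` (one F3 prefix byte longer), hence the code-size saving. The false output dependency of tzcnt on some microarchitectures is why L(first_vec_x0) above writes into esi, which already carries a dependency. A small C note under the non-zero precondition (illustrative only):

```
#include <stdint.h>

/* bsf and tzcnt agree whenever the source is non-zero, which is
   guaranteed on the converted paths -- the mask was tested (or is
   known non-empty) just before the bit scan.  They differ only for a
   zero source: bsf leaves the destination undefined (unmodified in
   practice), tzcnt writes the operand width.  */
static int
trailing_zeros_nonzero (uint32_t mask)
{
  /* Compiles to bsf or tzcnt; only called with mask != 0, matching
     the asm's precondition.  */
  return __builtin_ctz (mask);
}
```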
* [PATCH v4 1/8] x86: Create header for VEC classes in x86 strings library 2022-06-03 4:42 ` [PATCH v1 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 " Noah Goldstein @ 2022-06-06 22:37 ` Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein ` (6 more replies) 2022-06-07 4:05 ` [PATCH v5 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-07 4:11 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 4 siblings, 7 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-06 22:37 UTC (permalink / raw) To: libc-alpha This patch does not touch any existing code and is only meant to be a tool for future patches so that simple source files can more easily be maintained to target multiple VEC classes. There is no difference in the objdump of libc.so before and after this patch. --- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 34 ++++++++ sysdeps/x86_64/multiarch/avx-vecs.h | 47 +++++++++++ sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 +++++++++ sysdeps/x86_64/multiarch/evex256-vecs.h | 35 ++++++++ sysdeps/x86_64/multiarch/evex512-vecs.h | 35 ++++++++ sysdeps/x86_64/multiarch/sse2-vecs.h | 47 +++++++++++ sysdeps/x86_64/multiarch/vec-macros.h | 90 +++++++++++++++++++++ 7 files changed, 327 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h new file mode 100644 index 0000000000..3f531dd47f --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -0,0 +1,34 @@ +/* Common config for AVX-RTM VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#ifndef _AVX_RTM_VECS_H +#define _AVX_RTM_VECS_H 1 + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define USE_WITH_RTM 1 +#include "avx-vecs.h" + +#undef SECTION +#define SECTION(p) p##.avx.rtm + +#endif diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h new file mode 100644 index 0000000000..89680f5db8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-vecs.h @@ -0,0 +1,47 @@ +/* Common config for AVX VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX_VECS_H +#define _AVX_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 32 +#include "vec-macros.h" + +#define USE_WITH_AVX 1 +#define SECTION(p) p##.avx + +/* 4-byte mov instructions with AVX2. */ +#define MOV_SIZE 4 +/* 1 (ret) + 3 (vzeroupper). */ +#define RET_SIZE 4 +#define VZEROUPPER vzeroupper + +#define VMOVU vmovdqu +#define VMOVA vmovdqa +#define VMOVNT vmovntdq + +/* Often need to access xmm portion. */ +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h new file mode 100644 index 0000000000..99806ebcd7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex-vecs-common.h @@ -0,0 +1,39 @@ +/* Common config for EVEX256 and EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX_VECS_COMMON_H +#define _EVEX_VECS_COMMON_H 1 + +#include "vec-macros.h" + +/* 6-byte mov instructions with EVEX. */ +#define MOV_SIZE 6 +/* No vzeroupper needed. 
*/ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU vmovdqu64 +#define VMOVA vmovdqa64 +#define VMOVNT vmovntdq + +#define VEC_xmm VEC_hi_xmm +#define VEC_ymm VEC_hi_ymm +#define VEC_zmm VEC_hi_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h new file mode 100644 index 0000000000..222ba46dc7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h @@ -0,0 +1,35 @@ +/* Common config for EVEX256 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX256_VECS_H +#define _EVEX256_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 32 +#include "evex-vecs-common.h" + +#define USE_WITH_EVEX256 1 +#define SECTION(p) p##.evex + +#define VEC VEC_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h new file mode 100644 index 0000000000..d1784d5368 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h @@ -0,0 +1,35 @@ +/* Common config for EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX512_VECS_H +#define _EVEX512_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 64 +#include "evex-vecs-common.h" + +#define USE_WITH_EVEX512 1 +#define SECTION(p) p##.evex512 + +#define VEC VEC_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h new file mode 100644 index 0000000000..2b77a59d56 --- /dev/null +++ b/sysdeps/x86_64/multiarch/sse2-vecs.h @@ -0,0 +1,47 @@ +/* Common config for SSE2 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _SSE2_VECS_H +#define _SSE2_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 16 +#include "vec-macros.h" + +#define USE_WITH_SSE2 1 +#define SECTION(p) p + +/* 3-byte mov instructions with SSE2. */ +#define MOV_SIZE 3 +/* No vzeroupper needed. */ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU movups +#define VMOVA movaps +#define VMOVNT movntdq + +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_xmm + + +#endif diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h new file mode 100644 index 0000000000..9f3ffecede --- /dev/null +++ b/sysdeps/x86_64/multiarch/vec-macros.h @@ -0,0 +1,90 @@ +/* Macro helpers for VEC_{type}({vec_num}) + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _VEC_MACROS_H +#define _VEC_MACROS_H 1 + +#ifndef VEC_SIZE +# error "Never include this file directly. Always include a vector config." +#endif + +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same + VEC(N) values. 
*/ +#define VEC_hi_xmm0 xmm16 +#define VEC_hi_xmm1 xmm17 +#define VEC_hi_xmm2 xmm18 +#define VEC_hi_xmm3 xmm19 +#define VEC_hi_xmm4 xmm20 +#define VEC_hi_xmm5 xmm21 +#define VEC_hi_xmm6 xmm22 +#define VEC_hi_xmm7 xmm23 +#define VEC_hi_xmm8 xmm24 +#define VEC_hi_xmm9 xmm25 +#define VEC_hi_xmm10 xmm26 +#define VEC_hi_xmm11 xmm27 +#define VEC_hi_xmm12 xmm28 +#define VEC_hi_xmm13 xmm29 +#define VEC_hi_xmm14 xmm30 +#define VEC_hi_xmm15 xmm31 + +#define VEC_hi_ymm0 ymm16 +#define VEC_hi_ymm1 ymm17 +#define VEC_hi_ymm2 ymm18 +#define VEC_hi_ymm3 ymm19 +#define VEC_hi_ymm4 ymm20 +#define VEC_hi_ymm5 ymm21 +#define VEC_hi_ymm6 ymm22 +#define VEC_hi_ymm7 ymm23 +#define VEC_hi_ymm8 ymm24 +#define VEC_hi_ymm9 ymm25 +#define VEC_hi_ymm10 ymm26 +#define VEC_hi_ymm11 ymm27 +#define VEC_hi_ymm12 ymm28 +#define VEC_hi_ymm13 ymm29 +#define VEC_hi_ymm14 ymm30 +#define VEC_hi_ymm15 ymm31 + +#define VEC_hi_zmm0 zmm16 +#define VEC_hi_zmm1 zmm17 +#define VEC_hi_zmm2 zmm18 +#define VEC_hi_zmm3 zmm19 +#define VEC_hi_zmm4 zmm20 +#define VEC_hi_zmm5 zmm21 +#define VEC_hi_zmm6 zmm22 +#define VEC_hi_zmm7 zmm23 +#define VEC_hi_zmm8 zmm24 +#define VEC_hi_zmm9 zmm25 +#define VEC_hi_zmm10 zmm26 +#define VEC_hi_zmm11 zmm27 +#define VEC_hi_zmm12 zmm28 +#define VEC_hi_zmm13 zmm29 +#define VEC_hi_zmm14 zmm30 +#define VEC_hi_zmm15 zmm31 + +#define PRIMITIVE_VEC(vec, num) vec##num + +#define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) +#define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) +#define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) + +#define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) +#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) +#define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) + +#endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
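How the VEC(N) indirection in vec-macros.h pays off can be seen by expanding it for two configs: under evex256-vecs.h, VEC(1) lands on an EVEX-only register (ymm17), which is what lets RET_SIZE stay 1 with no vzeroupper, while under avx-vecs.h the same VEC(1) lands on the legacy ymm1. A self-contained C demonstration of the expansion — the stringify helpers and main are not part of the patch, and only the macros needed for the example are reproduced:

```
#include <stdio.h>

/* Subset of vec-macros.h / the per-config headers.  */
#define PRIMITIVE_VEC(vec, num) vec##num
#define VEC_hi_ymm1 ymm17
#define VEC_hi_ymm(i) PRIMITIVE_VEC (VEC_hi_ymm, i)
#define VEC_any_ymm(i) PRIMITIVE_VEC (ymm, i)

#define STR_(x) #x
#define STR(x) STR_ (x)

int
main (void)
{
  /* evex256-vecs.h: #define VEC VEC_ymm, i.e. VEC_hi_ymm.  */
  printf ("evex256 VEC(1) -> %s\n", STR (VEC_hi_ymm (1)));	/* ymm17 */
  /* avx-vecs.h: #define VEC VEC_any_ymm.  */
  printf ("avx     VEC(1) -> %s\n", STR (VEC_any_ymm (1)));	/* ymm1 */
  return 0;
}
```

So an implementation written once against VEC(N), VMOVU, and SECTION compiles unchanged into the sse2, avx, avx-rtm, evex256, and evex512 variants, which is exactly the maintenance goal stated in the commit message.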
* [PATCH v4 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` 2022-06-06 22:37 ` [PATCH v4 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein @ 2022-06-06 22:37 ` Noah Goldstein 2022-06-07 2:45 ` H.J. Lu 2022-06-06 22:37 ` [PATCH v4 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein ` (5 subsequent siblings) 6 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-06 22:37 UTC (permalink / raw) To: libc-alpha The RTM vzeroupper mitigation has no way of replacing inline vzeroupper not before a return. This can be useful when hoisting a vzeroupper to save code size for example: ``` L(foo): cmpl %eax, %edx jz L(bar) tzcntl %eax, %eax addq %rdi, %rax VZEROUPPER_RETURN L(bar): xorl %eax, %eax VZEROUPPER_RETURN ``` Can become: ``` L(foo): COND_VZEROUPPER cmpl %eax, %edx jz L(bar) tzcntl %eax, %eax addq %rdi, %rax ret L(bar): xorl %eax, %eax ret ``` This code does not change any existing functionality. There is no difference in the objdump of libc.so before and after this patch. --- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 1 + sysdeps/x86_64/sysdep.h | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h index 3f531dd47f..6ca9f5e6ba 100644 --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -20,6 +20,7 @@ #ifndef _AVX_RTM_VECS_H #define _AVX_RTM_VECS_H 1 +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index f14d50786d..4f512d5566 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -106,6 +106,24 @@ lose: \ vzeroupper; \ ret +/* Can be used to replace vzeroupper that is not directly before a + return. This is useful when hoisting a vzeroupper from multiple + return paths to decrease the total number of vzerouppers and code + size. */ +#define COND_VZEROUPPER_XTEST \ + xtest; \ + jz 1f; \ + vzeroall; \ + jmp 2f; \ +1: \ + vzeroupper; \ +2: + +/* In RTM define this as COND_VZEROUPPER_XTEST. */ +#ifndef COND_VZEROUPPER +# define COND_VZEROUPPER vzeroupper +#endif + /* Zero upper vector registers and return. */ #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN # define ZERO_UPPER_VEC_REGISTERS_RETURN \ -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH v4 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` 2022-06-06 22:37 ` [PATCH v4 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein @ 2022-06-07 2:45 ` H.J. Lu 2022-07-14 2:12 ` Sunil Pandey 0 siblings, 1 reply; 82+ messages in thread From: H.J. Lu @ 2022-06-07 2:45 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 3:37 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The RTM vzeroupper mitigation has no way of replacing inline > vzeroupper not before a return. > > This can be useful when hoisting a vzeroupper to save code size > for example: > > ``` > L(foo): > cmpl %eax, %edx > jz L(bar) > tzcntl %eax, %eax > addq %rdi, %rax > VZEROUPPER_RETURN > > L(bar): > xorl %eax, %eax > VZEROUPPER_RETURN > ``` > > Can become: > > ``` > L(foo): > COND_VZEROUPPER > cmpl %eax, %edx > jz L(bar) > tzcntl %eax, %eax > addq %rdi, %rax > ret > > L(bar): > xorl %eax, %eax > ret > ``` > > This code does not change any existing functionality. > > There is no difference in the objdump of libc.so before and after this > patch. > --- > sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 1 + > sysdeps/x86_64/sysdep.h | 18 ++++++++++++++++++ > 2 files changed, 19 insertions(+) > > diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > index 3f531dd47f..6ca9f5e6ba 100644 > --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > @@ -20,6 +20,7 @@ > #ifndef _AVX_RTM_VECS_H > #define _AVX_RTM_VECS_H 1 > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h > index f14d50786d..4f512d5566 100644 > --- a/sysdeps/x86_64/sysdep.h > +++ b/sysdeps/x86_64/sysdep.h > @@ -106,6 +106,24 @@ lose: \ > vzeroupper; \ > ret > > +/* Can be used to replace vzeroupper that is not directly before a > + return. This is useful when hoisting a vzeroupper from multiple > + return paths to decrease the total number of vzerouppers and code > + size. */ > +#define COND_VZEROUPPER_XTEST \ > + xtest; \ > + jz 1f; \ > + vzeroall; \ > + jmp 2f; \ > +1: \ > + vzeroupper; \ > +2: > + > +/* In RTM define this as COND_VZEROUPPER_XTEST. */ > +#ifndef COND_VZEROUPPER > +# define COND_VZEROUPPER vzeroupper > +#endif > + > /* Zero upper vector registers and return. */ > #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN > # define ZERO_UPPER_VEC_REGISTERS_RETURN \ > -- > 2.34.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH v4 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` 2022-06-07 2:45 ` H.J. Lu @ 2022-07-14 2:12 ` Sunil Pandey 0 siblings, 0 replies; 82+ messages in thread From: Sunil Pandey @ 2022-07-14 2:12 UTC (permalink / raw) To: H.J. Lu; +Cc: Noah Goldstein, GNU C Library On Mon, Jun 6, 2022 at 7:46 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Mon, Jun 6, 2022 at 3:37 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > The RTM vzeroupper mitigation has no way of replacing inline > > vzeroupper not before a return. > > > > This can be useful when hoisting a vzeroupper to save code size > > for example: > > > > ``` > > L(foo): > > cmpl %eax, %edx > > jz L(bar) > > tzcntl %eax, %eax > > addq %rdi, %rax > > VZEROUPPER_RETURN > > > > L(bar): > > xorl %eax, %eax > > VZEROUPPER_RETURN > > ``` > > > > Can become: > > > > ``` > > L(foo): > > COND_VZEROUPPER > > cmpl %eax, %edx > > jz L(bar) > > tzcntl %eax, %eax > > addq %rdi, %rax > > ret > > > > L(bar): > > xorl %eax, %eax > > ret > > ``` > > > > This code does not change any existing functionality. > > > > There is no difference in the objdump of libc.so before and after this > > patch. > > --- > > sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 1 + > > sysdeps/x86_64/sysdep.h | 18 ++++++++++++++++++ > > 2 files changed, 19 insertions(+) > > > > diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > index 3f531dd47f..6ca9f5e6ba 100644 > > --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > @@ -20,6 +20,7 @@ > > #ifndef _AVX_RTM_VECS_H > > #define _AVX_RTM_VECS_H 1 > > > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > > > diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h > > index f14d50786d..4f512d5566 100644 > > --- a/sysdeps/x86_64/sysdep.h > > +++ b/sysdeps/x86_64/sysdep.h > > @@ -106,6 +106,24 @@ lose: \ > > vzeroupper; \ > > ret > > > > +/* Can be used to replace vzeroupper that is not directly before a > > + return. This is useful when hoisting a vzeroupper from multiple > > + return paths to decrease the total number of vzerouppers and code > > + size. */ > > +#define COND_VZEROUPPER_XTEST \ > > + xtest; \ > > + jz 1f; \ > > + vzeroall; \ > > + jmp 2f; \ > > +1: \ > > + vzeroupper; \ > > +2: > > + > > +/* In RTM define this as COND_VZEROUPPER_XTEST. */ > > +#ifndef COND_VZEROUPPER > > +# define COND_VZEROUPPER vzeroupper > > +#endif > > + > > /* Zero upper vector registers and return. */ > > #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN > > # define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > -- > > 2.34.1 > > > > LGTM. > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > Thanks. > > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH v4 3/8] Benchtests: Improve memrchr benchmarks 2022-06-06 22:37 ` [PATCH v4 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein @ 2022-06-06 22:37 ` Noah Goldstein 2022-06-07 2:44 ` H.J. Lu 2022-06-06 22:37 ` [PATCH v4 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein ` (4 subsequent siblings) 6 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-06 22:37 UTC (permalink / raw) To: libc-alpha Add a second iteration for memrchr to set `pos` starting from the end of the buffer. Previously `pos` was only set relative to the begining of the buffer. This isn't really useful for memchr because the begining of the search space is (buf + len). --- benchtests/bench-memchr.c | 110 ++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 45 deletions(-) diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c index 4d7212332f..0facda2fa0 100644 --- a/benchtests/bench-memchr.c +++ b/benchtests/bench-memchr.c @@ -76,7 +76,7 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c, static void do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, - int seek_char) + int seek_char, int invert_pos) { size_t i; @@ -96,7 +96,10 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, if (pos < len) { - buf[align + pos] = seek_char; + if (invert_pos) + buf[align + len - pos] = seek_char; + else + buf[align + pos] = seek_char; buf[align + len] = -seek_char; } else @@ -109,6 +112,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, json_attr_uint (json_ctx, "pos", pos); json_attr_uint (json_ctx, "len", len); json_attr_uint (json_ctx, "seek_char", seek_char); + json_attr_uint (json_ctx, "invert_pos", invert_pos); json_array_begin (json_ctx, "timings"); @@ -123,6 +127,7 @@ int test_main (void) { size_t i; + int repeats; json_ctx_t json_ctx; test_init (); @@ -142,53 +147,68 @@ test_main (void) json_array_begin (&json_ctx, "results"); - for (i = 1; i < 8; ++i) + for (repeats = 0; repeats < 2; ++repeats) { - do_test (&json_ctx, 0, 16 << i, 2048, 23); - do_test (&json_ctx, i, 64, 256, 23); - do_test (&json_ctx, 0, 16 << i, 2048, 0); - do_test (&json_ctx, i, 64, 256, 0); - - do_test (&json_ctx, getpagesize () - 15, 64, 256, 0); + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats); + do_test (&json_ctx, i, 64, 256, 23, repeats); + do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats); + do_test (&json_ctx, i, 64, 256, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats); #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. */ - do_test (&json_ctx, 0, i, 256, 23); - do_test (&json_ctx, 0, i, 256, 0); - do_test (&json_ctx, i, i, 256, 23); - do_test (&json_ctx, i, i, 256, 0); + /* Also test the position close to the beginning for memrchr. 
*/ + do_test (&json_ctx, 0, i, 256, 23, repeats); + do_test (&json_ctx, 0, i, 256, 0, repeats); + do_test (&json_ctx, i, i, 256, 23, repeats); + do_test (&json_ctx, i, i, 256, 0, repeats); #endif - } - for (i = 1; i < 8; ++i) - { - do_test (&json_ctx, i, i << 5, 192, 23); - do_test (&json_ctx, i, i << 5, 192, 0); - do_test (&json_ctx, i, i << 5, 256, 23); - do_test (&json_ctx, i, i << 5, 256, 0); - do_test (&json_ctx, i, i << 5, 512, 23); - do_test (&json_ctx, i, i << 5, 512, 0); - - do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23); - } - for (i = 1; i < 32; ++i) - { - do_test (&json_ctx, 0, i, i + 1, 23); - do_test (&json_ctx, 0, i, i + 1, 0); - do_test (&json_ctx, i, i, i + 1, 23); - do_test (&json_ctx, i, i, i + 1, 0); - do_test (&json_ctx, 0, i, i - 1, 23); - do_test (&json_ctx, 0, i, i - 1, 0); - do_test (&json_ctx, i, i, i - 1, 23); - do_test (&json_ctx, i, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0); + } + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, i, i << 5, 192, 23, repeats); + do_test (&json_ctx, i, i << 5, 192, 0, repeats); + do_test (&json_ctx, i, i << 5, 256, 23, repeats); + do_test (&json_ctx, i, i << 5, 256, 0, repeats); + do_test (&json_ctx, i, i << 5, 512, 23, repeats); + do_test (&json_ctx, i, i << 5, 512, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats); + } + for (i = 1; i < 32; ++i) + { + do_test (&json_ctx, 0, i, i + 1, 23, repeats); + do_test (&json_ctx, 0, i, i + 1, 0, repeats); + do_test (&json_ctx, i, i, i + 1, 23, repeats); + do_test (&json_ctx, i, i, i + 1, 0, repeats); + do_test (&json_ctx, 0, i, i - 1, 23, repeats); + do_test (&json_ctx, 0, i, i - 1, 0, repeats); + do_test (&json_ctx, i, i, i - 1, 23, repeats); + do_test (&json_ctx, i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0, repeats); + #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. */ - do_test (&json_ctx, 0, 1, i + 1, 23); - do_test (&json_ctx, 0, 2, i + 1, 0); + do_test (&json_ctx, 0, 1, i + 1, 23, repeats); + do_test (&json_ctx, 0, 2, i + 1, 0, repeats); +#endif + } +#ifndef USE_AS_MEMRCHR + break; #endif } -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
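To make the new `invert_pos` parameter concrete, here is a small standalone program (not part of the patch; it assumes the GNU `memrchr` extension) showing the end-relative placement the second benchmark iteration now exercises:

```c
#define _GNU_SOURCE		/* memrchr is a GNU extension.  */
#include <string.h>
#include <stdio.h>

int
main (void)
{
  char buf[256] = { 0 };
  size_t len = sizeof buf, pos = 5;

  /* invert_pos placement: the match sits `pos` bytes from the END of
     the buffer, which is where memrchr begins scanning.  */
  buf[len - pos] = 'x';

  char *hit = memrchr (buf, 'x', len);
  printf ("match %zu bytes before the end\n", len - (size_t) (hit - buf));
  return 0;
}
```

With the old placement (`buf[align + pos]`) the same pos = 5, len = 256 case puts the match near the start of the buffer, so memrchr scans almost all 256 bytes from the end before hitting it. The two placements therefore time very different parts of the implementation, which is the point of adding the second iteration.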
* Re: [PATCH v4 3/8] Benchtests: Improve memrchr benchmarks 2022-06-06 22:37 ` [PATCH v4 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein @ 2022-06-07 2:44 ` H.J. Lu 2022-06-07 4:10 ` Noah Goldstein 0 siblings, 1 reply; 82+ messages in thread From: H.J. Lu @ 2022-06-07 2:44 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 3:37 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Add a second iteration for memrchr to set `pos` starting from the end > of the buffer. > > Previously `pos` was only set relative to the begining of the beginning > buffer. This isn't really useful for memchr because the beginning memrchr > of the search space is (buf + len). > --- > benchtests/bench-memchr.c | 110 ++++++++++++++++++++++---------------- > 1 file changed, 65 insertions(+), 45 deletions(-) > > diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c > index 4d7212332f..0facda2fa0 100644 > --- a/benchtests/bench-memchr.c > +++ b/benchtests/bench-memchr.c > @@ -76,7 +76,7 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c, > > static void > do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, > - int seek_char) > + int seek_char, int invert_pos) > { > size_t i; > > @@ -96,7 +96,10 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, > > if (pos < len) > { > - buf[align + pos] = seek_char; > + if (invert_pos) > + buf[align + len - pos] = seek_char; > + else > + buf[align + pos] = seek_char; > buf[align + len] = -seek_char; > } > else > @@ -109,6 +112,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, > json_attr_uint (json_ctx, "pos", pos); > json_attr_uint (json_ctx, "len", len); > json_attr_uint (json_ctx, "seek_char", seek_char); > + json_attr_uint (json_ctx, "invert_pos", invert_pos); > > json_array_begin (json_ctx, "timings"); > > @@ -123,6 +127,7 @@ int > test_main (void) > { > size_t i; > + int repeats; > json_ctx_t json_ctx; > test_init (); > > @@ -142,53 +147,68 @@ test_main (void) > > json_array_begin (&json_ctx, "results"); > > - for (i = 1; i < 8; ++i) > + for (repeats = 0; repeats < 2; ++repeats) > { > - do_test (&json_ctx, 0, 16 << i, 2048, 23); > - do_test (&json_ctx, i, 64, 256, 23); > - do_test (&json_ctx, 0, 16 << i, 2048, 0); > - do_test (&json_ctx, i, 64, 256, 0); > - > - do_test (&json_ctx, getpagesize () - 15, 64, 256, 0); > + for (i = 1; i < 8; ++i) > + { > + do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats); > + do_test (&json_ctx, i, 64, 256, 23, repeats); > + do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats); > + do_test (&json_ctx, i, 64, 256, 0, repeats); > + > + do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats); > #ifdef USE_AS_MEMRCHR > - /* Also test the position close to the beginning for memrchr. */ > - do_test (&json_ctx, 0, i, 256, 23); > - do_test (&json_ctx, 0, i, 256, 0); > - do_test (&json_ctx, i, i, 256, 23); > - do_test (&json_ctx, i, i, 256, 0); > + /* Also test the position close to the beginning for memrchr. 
*/ > + do_test (&json_ctx, 0, i, 256, 23, repeats); > + do_test (&json_ctx, 0, i, 256, 0, repeats); > + do_test (&json_ctx, i, i, 256, 23, repeats); > + do_test (&json_ctx, i, i, 256, 0, repeats); > #endif > - } > - for (i = 1; i < 8; ++i) > - { > - do_test (&json_ctx, i, i << 5, 192, 23); > - do_test (&json_ctx, i, i << 5, 192, 0); > - do_test (&json_ctx, i, i << 5, 256, 23); > - do_test (&json_ctx, i, i << 5, 256, 0); > - do_test (&json_ctx, i, i << 5, 512, 23); > - do_test (&json_ctx, i, i << 5, 512, 0); > - > - do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23); > - } > - for (i = 1; i < 32; ++i) > - { > - do_test (&json_ctx, 0, i, i + 1, 23); > - do_test (&json_ctx, 0, i, i + 1, 0); > - do_test (&json_ctx, i, i, i + 1, 23); > - do_test (&json_ctx, i, i, i + 1, 0); > - do_test (&json_ctx, 0, i, i - 1, 23); > - do_test (&json_ctx, 0, i, i - 1, 0); > - do_test (&json_ctx, i, i, i - 1, 23); > - do_test (&json_ctx, i, i, i - 1, 0); > - > - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23); > - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0); > - > - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23); > - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0); > + } > + for (i = 1; i < 8; ++i) > + { > + do_test (&json_ctx, i, i << 5, 192, 23, repeats); > + do_test (&json_ctx, i, i << 5, 192, 0, repeats); > + do_test (&json_ctx, i, i << 5, 256, 23, repeats); > + do_test (&json_ctx, i, i << 5, 256, 0, repeats); > + do_test (&json_ctx, i, i << 5, 512, 23, repeats); > + do_test (&json_ctx, i, i << 5, 512, 0, repeats); > + > + do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats); > + } > + for (i = 1; i < 32; ++i) > + { > + do_test (&json_ctx, 0, i, i + 1, 23, repeats); > + do_test (&json_ctx, 0, i, i + 1, 0, repeats); > + do_test (&json_ctx, i, i, i + 1, 23, repeats); > + do_test (&json_ctx, i, i, i + 1, 0, repeats); > + do_test (&json_ctx, 0, i, i - 1, 23, repeats); > + do_test (&json_ctx, 0, i, i - 1, 0, repeats); > + do_test (&json_ctx, i, i, i - 1, 23, repeats); > + do_test (&json_ctx, i, i, i - 1, 0, repeats); > + > + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 23, repeats); > + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 0, repeats); > + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 23, repeats); > + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 0, repeats); > + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 23, repeats); > + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 0, repeats); > + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 23, repeats); > + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 0, repeats); > + > + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23, repeats); > + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0, repeats); > + > + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23, repeats); > + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0, repeats); > + > #ifdef USE_AS_MEMRCHR > - /* Also test the position close to the beginning for memrchr. */ > - do_test (&json_ctx, 0, 1, i + 1, 23); > - do_test (&json_ctx, 0, 2, i + 1, 0); > + do_test (&json_ctx, 0, 1, i + 1, 23, repeats); > + do_test (&json_ctx, 0, 2, i + 1, 0, repeats); > +#endif > + } > +#ifndef USE_AS_MEMRCHR > + break; > #endif > } > > -- > 2.34.1 > -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH v4 3/8] Benchtests: Improve memrchr benchmarks 2022-06-07 2:44 ` H.J. Lu @ 2022-06-07 4:10 ` Noah Goldstein 0 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:10 UTC (permalink / raw) To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 7:44 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Mon, Jun 6, 2022 at 3:37 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > Add a second iteration for memrchr to set `pos` starting from the end > > of the buffer. > > > > Previously `pos` was only set relative to the begining of the > > beginning > > buffer. This isn't really useful for memchr because the beginning > memrchr Fixed in V5. > > of the search space is (buf + len). > > --- > > benchtests/bench-memchr.c | 110 ++++++++++++++++++++++---------------- > > 1 file changed, 65 insertions(+), 45 deletions(-) > > > > diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c > > index 4d7212332f..0facda2fa0 100644 > > --- a/benchtests/bench-memchr.c > > +++ b/benchtests/bench-memchr.c > > @@ -76,7 +76,7 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c, > > > > static void > > do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, > > - int seek_char) > > + int seek_char, int invert_pos) > > { > > size_t i; > > > > @@ -96,7 +96,10 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, > > > > if (pos < len) > > { > > - buf[align + pos] = seek_char; > > + if (invert_pos) > > + buf[align + len - pos] = seek_char; > > + else > > + buf[align + pos] = seek_char; > > buf[align + len] = -seek_char; > > } > > else > > @@ -109,6 +112,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, > > json_attr_uint (json_ctx, "pos", pos); > > json_attr_uint (json_ctx, "len", len); > > json_attr_uint (json_ctx, "seek_char", seek_char); > > + json_attr_uint (json_ctx, "invert_pos", invert_pos); > > > > json_array_begin (json_ctx, "timings"); > > > > @@ -123,6 +127,7 @@ int > > test_main (void) > > { > > size_t i; > > + int repeats; > > json_ctx_t json_ctx; > > test_init (); > > > > @@ -142,53 +147,68 @@ test_main (void) > > > > json_array_begin (&json_ctx, "results"); > > > > - for (i = 1; i < 8; ++i) > > + for (repeats = 0; repeats < 2; ++repeats) > > { > > - do_test (&json_ctx, 0, 16 << i, 2048, 23); > > - do_test (&json_ctx, i, 64, 256, 23); > > - do_test (&json_ctx, 0, 16 << i, 2048, 0); > > - do_test (&json_ctx, i, 64, 256, 0); > > - > > - do_test (&json_ctx, getpagesize () - 15, 64, 256, 0); > > + for (i = 1; i < 8; ++i) > > + { > > + do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats); > > + do_test (&json_ctx, i, 64, 256, 23, repeats); > > + do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats); > > + do_test (&json_ctx, i, 64, 256, 0, repeats); > > + > > + do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats); > > #ifdef USE_AS_MEMRCHR > > - /* Also test the position close to the beginning for memrchr. */ > > - do_test (&json_ctx, 0, i, 256, 23); > > - do_test (&json_ctx, 0, i, 256, 0); > > - do_test (&json_ctx, i, i, 256, 23); > > - do_test (&json_ctx, i, i, 256, 0); > > + /* Also test the position close to the beginning for memrchr. 
*/ > > + do_test (&json_ctx, 0, i, 256, 23, repeats); > > + do_test (&json_ctx, 0, i, 256, 0, repeats); > > + do_test (&json_ctx, i, i, 256, 23, repeats); > > + do_test (&json_ctx, i, i, 256, 0, repeats); > > #endif > > - } > > - for (i = 1; i < 8; ++i) > > - { > > - do_test (&json_ctx, i, i << 5, 192, 23); > > - do_test (&json_ctx, i, i << 5, 192, 0); > > - do_test (&json_ctx, i, i << 5, 256, 23); > > - do_test (&json_ctx, i, i << 5, 256, 0); > > - do_test (&json_ctx, i, i << 5, 512, 23); > > - do_test (&json_ctx, i, i << 5, 512, 0); > > - > > - do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23); > > - } > > - for (i = 1; i < 32; ++i) > > - { > > - do_test (&json_ctx, 0, i, i + 1, 23); > > - do_test (&json_ctx, 0, i, i + 1, 0); > > - do_test (&json_ctx, i, i, i + 1, 23); > > - do_test (&json_ctx, i, i, i + 1, 0); > > - do_test (&json_ctx, 0, i, i - 1, 23); > > - do_test (&json_ctx, 0, i, i - 1, 0); > > - do_test (&json_ctx, i, i, i - 1, 23); > > - do_test (&json_ctx, i, i, i - 1, 0); > > - > > - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23); > > - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0); > > - > > - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23); > > - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0); > > + } > > + for (i = 1; i < 8; ++i) > > + { > > + do_test (&json_ctx, i, i << 5, 192, 23, repeats); > > + do_test (&json_ctx, i, i << 5, 192, 0, repeats); > > + do_test (&json_ctx, i, i << 5, 256, 23, repeats); > > + do_test (&json_ctx, i, i << 5, 256, 0, repeats); > > + do_test (&json_ctx, i, i << 5, 512, 23, repeats); > > + do_test (&json_ctx, i, i << 5, 512, 0, repeats); > > + > > + do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats); > > + } > > + for (i = 1; i < 32; ++i) > > + { > > + do_test (&json_ctx, 0, i, i + 1, 23, repeats); > > + do_test (&json_ctx, 0, i, i + 1, 0, repeats); > > + do_test (&json_ctx, i, i, i + 1, 23, repeats); > > + do_test (&json_ctx, i, i, i + 1, 0, repeats); > > + do_test (&json_ctx, 0, i, i - 1, 23, repeats); > > + do_test (&json_ctx, 0, i, i - 1, 0, repeats); > > + do_test (&json_ctx, i, i, i - 1, 23, repeats); > > + do_test (&json_ctx, i, i, i - 1, 0, repeats); > > + > > + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 23, repeats); > > + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 0, repeats); > > + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 23, repeats); > > + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 0, repeats); > > + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 23, repeats); > > + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 0, repeats); > > + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 23, repeats); > > + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 0, repeats); > > + > > + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23, repeats); > > + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0, repeats); > > + > > + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23, repeats); > > + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0, repeats); > > + > > #ifdef USE_AS_MEMRCHR > > - /* Also test the position close to the beginning for memrchr. */ > > - do_test (&json_ctx, 0, 1, i + 1, 23); > > - do_test (&json_ctx, 0, 2, i + 1, 0); > > + do_test (&json_ctx, 0, 1, i + 1, 23, repeats); > > + do_test (&json_ctx, 0, 2, i + 1, 0, repeats); > > +#endif > > + } > > +#ifndef USE_AS_MEMRCHR > > + break; > > #endif > > } > > > > -- > > 2.34.1 > > > > > -- > H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH v4 4/8] x86: Optimize memrchr-sse2.S 2022-06-06 22:37 ` [PATCH v4 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein @ 2022-06-06 22:37 ` Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 5/8] x86: Optimize memrchr-evex.S Noah Goldstein ` (3 subsequent siblings) 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-06 22:37 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller lengths more. 2. optimizes target placement more carefully. 3. reuses logic more. 4. fixes up various inefficiencies in the logic. The total code size saving is: 394 bytes Geometric Mean of all benchmarks New / Old: 0.874 Regressions: 1. The page cross case is now colder, especially re-entry from the page cross case if a match is not found in the first VEC (roughly 50%). My general opinion with this patch is this is acceptable given the "coldness" of this case (less than 4%) and generally performance improvement in the other far more common cases. 2. There are some regressions 5-15% for medium/large user-arg lengths that have a match in the first VEC. This is because the logic was rewritten to optimize finds in the first VEC if the user-arg length is shorter (where we see roughly 20-50% performance improvements). It is not always the case this is a regression. My intuition is some frontend quirk is partially explaining the data although I haven't been able to find the root cause. Full xcheck passes on x86_64. --- sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++-------------------- 1 file changed, 292 insertions(+), 321 deletions(-) diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index d1a9f47911..b0dffd2ae2 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -18,362 +18,333 @@ <https://www.gnu.org/licenses/>. */ #include <sysdep.h> +#define VEC_SIZE 16 +#define PAGE_SIZE 4096 .text -ENTRY (__memrchr) - movd %esi, %xmm1 - - sub $16, %RDX_LP - jbe L(length_less16) - - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add %RDX_LP, %RDI_LP - pshufd $0, %xmm1, %xmm1 - - movdqu (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - -/* Check if there is a match. 
*/ - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - mov %edi, %ecx - and $15, %ecx - jz L(loop_prolog) - - add $16, %rdi - add $16, %rdx - and $-16, %rdi - sub %rcx, %rdx - - .p2align 4 -L(loop_prolog): - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches0) - - mov %edi, %ecx - and $63, %ecx - jz L(align64_loop) - - add $64, %rdi - add $64, %rdx - and $-64, %rdi - sub %rcx, %rdx - - .p2align 4 -L(align64_loop): - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm3, %xmm0 - pmaxub %xmm4, %xmm2 - pmaxub %xmm0, %xmm2 - pmovmskb %xmm2, %eax - - test %eax, %eax - jz L(align64_loop) - - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches48) - - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm2 - - pcmpeqb %xmm1, %xmm2 - pcmpeqb (%rdi), %xmm1 - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - pmovmskb %xmm1, %eax - bsr %eax, %eax - - add %rdi, %rax +ENTRY_P2ALIGN(__memrchr, 6) +#ifdef __ILP32__ + /* Clear upper bits. */ + mov %RDX_LP, %RDX_LP +#endif + movd %esi, %xmm0 + + /* Get end pointer. */ + leaq (%rdx, %rdi), %rcx + + punpcklbw %xmm0, %xmm0 + punpcklwd %xmm0, %xmm0 + pshufd $0, %xmm0, %xmm0 + + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %ecx + jz L(page_cross) + + /* NB: This load happens regardless of whether rdx (len) is zero. Since + it doesn't cross a page and the standard gurantees any pointer have + at least one-valid byte this load must be safe. For the entire + history of the x86 memrchr implementation this has been possible so + no code "should" be relying on a zero-length check before this load. + The zero-length check is moved to the page cross case because it is + 1) pretty cold and including it pushes the hot case len <= VEC_SIZE + into 2-cache lines. */ + movups -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is + zero. */ + bsrl %eax, %eax + jz L(ret_0) + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here + if out of bounds. */ + addl %edx, %eax + jl L(zero_0) + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base + ptr. 
*/ + addq %rdi, %rax +L(ret_0): ret - .p2align 4 -L(exit_loop): - add $64, %edx - cmp $32, %edx - jbe L(exit_loop_32) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16_1) - cmp $48, %edx - jbe L(return_null) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches0_1) - xor %eax, %eax + .p2align 4,, 5 +L(ret_vec_x0): + bsrl %eax, %eax + leaq -(VEC_SIZE)(%rcx, %rax), %rax ret - .p2align 4 -L(exit_loop_32): - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48_1) - cmp $16, %edx - jbe L(return_null) - - pcmpeqb 32(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches32_1) - xor %eax, %eax + .p2align 4,, 2 +L(zero_0): + xorl %eax, %eax ret - .p2align 4 -L(matches0): - bsr %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax - ret - .p2align 4 -L(matches32): - bsr %eax, %eax - lea 32(%rax, %rdi), %rax + .p2align 4,, 8 +L(more_1x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) + + /* Align rcx (pointer to string). */ + decq %rcx + andq $-VEC_SIZE, %rcx + + movq %rcx, %rdx + /* NB: We could consistenyl save 1-byte in this pattern with `movaps + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is + it adds more frontend uops (even if the moves can be eliminated) and + some percentage of the time actual backend uops. */ + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + subq %rdi, %rdx + pmovmskb %xmm1, %eax + + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +L(last_2x_vec): + subl $VEC_SIZE, %edx + jbe L(ret_vec_x0_test) + + testl %eax, %eax + jnz L(ret_vec_x0) + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_1) + addl %edx, %eax + jl L(zero_0) + addq %rdi, %rax +L(ret_1): ret - .p2align 4 -L(matches48): - bsr %eax, %eax - lea 48(%rax, %rdi), %rax + /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross) + causes the hot pause (length <= VEC_SIZE) to span multiple cache + lines. Naturally aligned % 16 to 8-bytes. */ +L(page_cross): + /* Zero length check. */ + testq %rdx, %rdx + jz L(zero_0) + + leaq -1(%rcx), %r8 + andq $-(VEC_SIZE), %r8 + + movaps (%r8), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + negl %ecx + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count + explicitly. */ + andl $(VEC_SIZE - 1), %ecx + shl %cl, %esi + movzwl %si, %eax + leaq (%rdi, %rdx), %rcx + cmpq %rdi, %r8 + ja L(more_1x_vec) + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_2) + addl %edx, %eax + jl L(zero_1) + addq %rdi, %rax +L(ret_2): ret - .p2align 4 -L(matches0_1): - bsr %eax, %eax - sub $64, %rdx - add %rax, %rdx - jl L(return_null) - add %rdi, %rax + /* Fits in aliging bytes. 
*/ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(matches16_1): - bsr %eax, %eax - sub $48, %rdx - add %rax, %rdx - jl L(return_null) - lea 16(%rdi, %rax), %rax + .p2align 4,, 5 +L(ret_vec_x1): + bsrl %eax, %eax + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(matches32_1): - bsr %eax, %eax - sub $32, %rdx - add %rax, %rdx - jl L(return_null) - lea 32(%rdi, %rax), %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) - .p2align 4 -L(matches48_1): - bsr %eax, %eax - sub $16, %rdx - add %rax, %rdx - jl L(return_null) - lea 48(%rdi, %rax), %rax - ret + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + testl %eax, %eax + jnz L(ret_vec_x1) - .p2align 4 -L(return_null): - xor %eax, %eax - ret - .p2align 4 -L(length_less16_offset0): - test %edx, %edx - jz L(return_null) + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - mov %dl, %cl - pcmpeqb (%rdi), %xmm1 + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) - mov $1, %edx - sal %cl, %edx - sub $1, %edx + addl $(VEC_SIZE), %edx + jle L(ret_vec_x2_test) - pmovmskb %xmm1, %eax +L(last_vec): + testl %eax, %eax + jnz L(ret_vec_x2) - and %edx, %eax - test %eax, %eax - jz L(return_null) + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - bsr %eax, %eax - add %rdi, %rax + subl $(VEC_SIZE), %edx + bsrl %eax, %eax + jz L(ret_3) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax +L(ret_3): ret - .p2align 4 -L(length_less16): - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add $16, %edx - - pshufd $0, %xmm1, %xmm1 - - mov %edi, %ecx - and $15, %ecx - jz L(length_less16_offset0) - - mov %cl, %dh - mov %ecx, %esi - add %dl, %dh - and $-16, %rdi - - sub $16, %dh - ja L(length_less16_part2) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - - sar %cl, %eax - mov %dl, %cl - - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax - test %eax, %eax - jz L(return_null) - - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 6 +L(ret_vec_x2_test): + bsrl %eax, %eax + jz L(zero_2) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax ret - .p2align 4 -L(length_less16_part2): - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - - mov %dh, %cl - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax +L(zero_2): + xorl %eax, %eax + ret - test %eax, %eax - jnz L(length_less16_part2_return) - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax + .p2align 4,, 5 +L(ret_vec_x2): + bsrl %eax, %eax + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - mov %esi, %ecx - sar %cl, %eax - test %eax, %eax - jz L(return_null) + .p2align 4,, 5 +L(ret_vec_x3): + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 8 +L(more_4x_vec): + testl %eax, %eax + jnz L(ret_vec_x2) + + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_x3) + + addq $-(VEC_SIZE * 4), %rcx + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) + + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end + keeping the code from spilling to the next cache line. 
*/ + addq $(VEC_SIZE * 4 - 1), %rcx + andq $-(VEC_SIZE * 4), %rcx + leaq (VEC_SIZE * 4)(%rdi), %rdx + andq $-(VEC_SIZE * 4), %rdx + + .p2align 4,, 11 +L(loop_4x_vec): + movaps (VEC_SIZE * -1)(%rcx), %xmm1 + movaps (VEC_SIZE * -2)(%rcx), %xmm2 + movaps (VEC_SIZE * -3)(%rcx), %xmm3 + movaps (VEC_SIZE * -4)(%rcx), %xmm4 + pcmpeqb %xmm0, %xmm1 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm0, %xmm4 + + por %xmm1, %xmm2 + por %xmm3, %xmm4 + por %xmm2, %xmm4 + + pmovmskb %xmm4, %esi + testl %esi, %esi + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rcx + cmpq %rdx, %rcx + jne L(loop_4x_vec) + + subl %edi, %edx + + /* Ends up being 1-byte nop. */ + .p2align 4,, 2 +L(last_4x_vec): + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + testl %eax, %eax + jnz L(ret_vec_x0) + + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_end) + + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $(VEC_SIZE * 3), %edx + ja L(last_vec) + bsrl %eax, %eax + jz L(ret_4) + addl %edx, %eax + jl L(zero_3) + addq %rdi, %rax +L(ret_4): ret - .p2align 4 -L(length_less16_part2_return): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax + /* Ends up being 1-byte nop. */ + .p2align 4,, 3 +L(loop_end): + pmovmskb %xmm1, %eax + sall $16, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm2, %eax + testl %eax, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm3, %eax + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + sall $16, %eax + orl %esi, %eax + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax ret -END (__memrchr) +L(ret_vec_end): + bsrl %eax, %eax + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax + ret + /* Use in L(last_4x_vec). In the same cache line. This is just a spare + aligning bytes. */ +L(zero_3): + xorl %eax, %eax + ret + /* 2-bytes from next cache line. */ +END(__memrchr) weak_alias (__memrchr, memrchr) -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
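The rewrite above is built around one SSE2 idiom that is easier to see in intrinsics form. The following helper is illustrative only (it is not from the patch): it shows the compare/mask/highest-bit sequence behind L(ret_vec_x0) and friends.

```c
#include <emmintrin.h>
#include <stddef.h>

/* Illustrative helper: compare 16 bytes at once, compress the result
   to a 16-bit mask with pmovmskb, then take the HIGHEST set bit -- the
   asm's bsr -- to locate the last matching byte in the chunk.  */
static const char *
last_match_in_16 (const char *p, char c)
{
  __m128i chunk = _mm_loadu_si128 ((const __m128i *) p);
  __m128i eq = _mm_cmpeq_epi8 (chunk, _mm_set1_epi8 (c));
  unsigned int mask = _mm_movemask_epi8 (eq);	/* bit i set iff p[i] == c  */

  if (mask == 0)
    return NULL;
  /* 31 - clz (mask) is the index of the highest set bit, i.e. bsr.  */
  return p + (31 - __builtin_clz (mask));
}
```

Because the new code keeps an end pointer in %rcx and addresses every vector as -(VEC_SIZE * n)(%rcx), a hit in any vector resolves with a single bsr + lea instead of the old per-block matchesNN/matchesNN_1 tails, which is where much of the 394-byte size saving shows up.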
* [PATCH v4 5/8] x86: Optimize memrchr-evex.S 2022-06-06 22:37 ` [PATCH v4 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (2 preceding siblings ...) 2022-06-06 22:37 ` [PATCH v4 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein @ 2022-06-06 22:37 ` Noah Goldstein 2022-06-07 2:41 ` H.J. Lu 2022-06-06 22:37 ` [PATCH v4 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein ` (2 subsequent siblings) 6 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-06 22:37 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller user-arg lengths more. 2. optimizes target placement more carefully 3. reuses logic more 4. fixes up various inefficiencies in the logic. The biggest case here is the `lzcnt` logic for checking returns which saves either a branch or multiple instructions. The total code size saving is: 263 bytes Geometric Mean of all benchmarks New / Old: 0.755 Regressions: There are some regressions. Particularly where the length (user arg length) is large but the position of the match char is near the begining of the string (in first VEC). This case has roughly a 20% regression. This is because the new logic gives the hot path for immediate matches to shorter lengths (the more common input). This case has roughly a 35% speedup. Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++------------ 1 file changed, 268 insertions(+), 271 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S index 0b99709c6b..ad541c0e50 100644 --- a/sysdeps/x86_64/multiarch/memrchr-evex.S +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S @@ -19,319 +19,316 @@ #if IS_IN (libc) # include <sysdep.h> +# include "evex256-vecs.h" +# if VEC_SIZE != 32 +# error "VEC_SIZE != 32 unimplemented" +# endif + +# ifndef MEMRCHR +# define MEMRCHR __memrchr_evex +# endif + +# define PAGE_SIZE 4096 +# define VECMATCH VEC(0) + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN(MEMRCHR, 6) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) + + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up end ptr to be + subtract by lzcnt aligned. */ + leaq -1(%rdi, %rdx), %rax + vpbroadcastb %esi, %VECMATCH + + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + + /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which + will gurantee edx (len) is less than it. */ + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret -# define VMOVA vmovdqa64 - -# define YMMMATCH ymm16 - -# define VEC_SIZE 32 - - .section .text.evex,"ax",@progbits -ENTRY (__memrchr_evex) - /* Broadcast CHAR to YMMMATCH. */ - vpbroadcastb %esi, %YMMMATCH - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP - - /* Check the last VEC_SIZE bytes. 
*/ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx - - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 - kord %k1, %k2, %k5 - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 - - kord %k3, %k4, %k6 - kortestd %k5, %k6 - jz L(loop_4x_vec) - - /* There is a match. */ - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - kmovd %k1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0_dec): + decq %rax +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax ret - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) + /* Align rax (pointer to string). */ + andq $-VEC_SIZE, %rax - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) + /* Recompute length after aligning. */ + movq %rax, %rdx - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - ret + subq %rdi, %rdx - .p2align 4 + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) + + /* Must dec rax because L(ret_vec_x0_test) expects it. 
*/ + decq %rax cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax ret - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which inturn in necessray for hot path (len <= VEC_SIZE) to fit + in first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpb $0, (%rsi), %VECMATCH, %k0 + kmovd %k0, %r8d + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %ecx + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ + notl %ecx + shlxl %ecx, %r8d, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_1) + subq %rcx, %rax ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax + /* Continue creating zero labels that fit in aligning bytes and get + 2-byte encoding / are in the same cache line as condition. */ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + .p2align 4,, 8 +L(ret_vec_x1): + /* This will naturally add 32 to position. */ + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - ret + vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - .p2align 4 -L(zero): - xorl %eax, %eax + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx - - kmovd %k1, %eax - - /* Remove the trailing bytes. 
*/ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 8 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx - - /* Check for zero length. */ - testl %edx, %edx - jz L(zero) - - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) - - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + .p2align 4,, 8 +L(ret_vec_x2): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + .p2align 4,, 8 +L(ret_vec_x3): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - /* Check the last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax + .p2align 4,, 8 +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + testl %ecx, %ecx + jnz L(ret_vec_x3) - andl %edx, %eax - testl %eax, %eax - jz L(zero) + /* Check if near end before re-aligning (otherwise might do an + unnecissary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - ret + decq %rax + andq $-(VEC_SIZE * 4), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + andq $-(VEC_SIZE * 4), %rdx .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx - - /* Check the last VEC. */ - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +L(loop_4x_vec): + /* Store 1 were not-equals and 0 where equals in k1 (used to mask later + on). */ + vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1 + + /* VEC(2/3) will have zero-byte where we found a CHAR. */ + vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2) + vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3) + vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4 + + /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where + CHAR is found and VEC(2/3) have zero-byte where CHAR is found. */ + vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z} + vptestnmb %VEC(3), %VEC(3), %k2 + + /* Any 1s and we found CHAR. */ + kortestd %k2, %k4 + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) + + /* Need to re-adjust rdx / rax for L(last_4x_vec). */ + subq $-(VEC_SIZE * 4), %rdx + movq %rdx, %rax + subl %edi, %edx +L(last_4x_vec): + + /* Used no matter what. */ + vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - kmovd %k1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jnz L(last_vec_x1) + vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - /* Check the second last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 + testl %ecx, %ecx + jnz L(ret_vec_x1) - movl %r8d, %ecx + /* Used no matter what. */ + vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - kmovd %k1, %eax + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. 
*/ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret_1) + xorl %eax, %eax +L(ret_1): + ret - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 6 +L(loop_end): + kmovd %k1, %ecx + notl %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vptestnmb %VEC(2), %VEC(2), %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + kmovd %k2, %ecx + kmovd %k4, %esi + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + addq %rcx, %rax + ret + .p2align 4,, 4 +L(ret_vec_x0_end): + addq $(VEC_SIZE), %rax +L(ret_vec_x1_end): + bsrl %ecx, %ecx + leaq (VEC_SIZE * 2)(%rax, %rcx), %rax ret -END (__memrchr_evex) + +END(MEMRCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
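The lzcnt trick the commit message credits for "saving either a branch or multiple instructions" is easiest to check in scalar form. Below is a model of L(ret_vec_x0_test) with illustrative names (none of this is in the patch); the pointer/mask conventions follow the asm above.

```c
#include <stddef.h>

/* Scalar model of L(ret_vec_x0_test).  `end` is base + len - 1,
   exactly how the asm sets up %rax, and `mask` covers the 32 bytes
   ending at `end` (bit 31 corresponds to *end).  lzcnt of an all-zero
   mask is 32, so a single signed compare against the remaining length
   rejects both "no match at all" and "match that lies before the
   start of the buffer".  */
static const char *
last_match_bounded (const char *end, unsigned int mask, long int len)
{
  int lz = mask ? __builtin_clz (mask) : 32;	/* lzcnt semantics  */

  if (len <= lz)
    return NULL;	/* the asm's jle L(zero_0)  */
  return end - lz;	/* the asm's subq %rcx, %rax  */
}
```

The old code needed a bsrl, a separate test for the zero mask, and a per-exit-path length adjustment; folding all three into one lzcnt plus one compare is what the commit message means by that item.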
* Re: [PATCH v4 5/8] x86: Optimize memrchr-evex.S 2022-06-06 22:37 ` [PATCH v4 5/8] x86: Optimize memrchr-evex.S Noah Goldstein @ 2022-06-07 2:41 ` H.J. Lu 2022-06-07 4:09 ` Noah Goldstein 0 siblings, 1 reply; 82+ messages in thread From: H.J. Lu @ 2022-06-07 2:41 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 3:37 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The new code: > 1. prioritizes smaller user-arg lengths more. > 2. optimizes target placement more carefully > 3. reuses logic more > 4. fixes up various inefficiencies in the logic. The biggest > case here is the `lzcnt` logic for checking returns which > saves either a branch or multiple instructions. > > The total code size saving is: 263 bytes > Geometric Mean of all benchmarks New / Old: 0.755 > > Regressions: > There are some regressions. Particularly where the length (user arg > length) is large but the position of the match char is near the > begining of the string (in first VEC). This case has roughly a beginning > 20% regression. > > This is because the new logic gives the hot path for immediate matches > to shorter lengths (the more common input). This case has roughly > a 35% speedup. > > Full xcheck passes on x86_64. > --- > sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++------------ > 1 file changed, 268 insertions(+), 271 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S > index 0b99709c6b..ad541c0e50 100644 > --- a/sysdeps/x86_64/multiarch/memrchr-evex.S > +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S > @@ -19,319 +19,316 @@ > #if IS_IN (libc) > > # include <sysdep.h> > +# include "evex256-vecs.h" > +# if VEC_SIZE != 32 > +# error "VEC_SIZE != 32 unimplemented" > +# endif > + > +# ifndef MEMRCHR > +# define MEMRCHR __memrchr_evex > +# endif > + > +# define PAGE_SIZE 4096 > +# define VECMATCH VEC(0) > + > + .section SECTION(.text), "ax", @progbits > +ENTRY_P2ALIGN(MEMRCHR, 6) > +# ifdef __ILP32__ > + /* Clear upper bits. */ > + and %RDX_LP, %RDX_LP > +# else > + test %RDX_LP, %RDX_LP > +# endif > + jz L(zero_0) > + > + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a > + correct page cross check and 2) it correctly sets up end ptr to be > + subtract by lzcnt aligned. */ > + leaq -1(%rdi, %rdx), %rax > + vpbroadcastb %esi, %VECMATCH > + > + /* Check if we can load 1x VEC without cross a page. */ > + testl $(PAGE_SIZE - VEC_SIZE), %eax > + jz L(page_cross) > + > + /* Don't use rax for pointer here because EVEX has better encoding with > + offset % VEC_SIZE == 0. */ > + vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 > + kmovd %k0, %ecx > + > + /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ > + cmpq $VEC_SIZE, %rdx > + ja L(more_1x_vec) > +L(ret_vec_x0_test): > + > + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which > + will gurantee edx (len) is less than it. */ guarantee > + lzcntl %ecx, %ecx > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > > -# define VMOVA vmovdqa64 > - > -# define YMMMATCH ymm16 > - > -# define VEC_SIZE 32 > - > - .section .text.evex,"ax",@progbits > -ENTRY (__memrchr_evex) > - /* Broadcast CHAR to YMMMATCH. */ > - vpbroadcastb %esi, %YMMMATCH > - > - sub $VEC_SIZE, %RDX_LP > - jbe L(last_vec_or_less) > - > - add %RDX_LP, %RDI_LP > - > - /* Check the last VEC_SIZE bytes. 
*/ > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > - jnz L(last_vec_x0) > - > - subq $(VEC_SIZE * 4), %rdi > - movl %edi, %ecx > - andl $(VEC_SIZE - 1), %ecx > - jz L(aligned_more) > - > - /* Align data for aligned loads in the loop. */ > - addq $VEC_SIZE, %rdi > - addq $VEC_SIZE, %rdx > - andq $-VEC_SIZE, %rdi > - subq %rcx, %rdx > - > - .p2align 4 > -L(aligned_more): > - subq $(VEC_SIZE * 4), %rdx > - jbe L(last_4x_vec_or_less) > - > - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time > - since data is only aligned to VEC_SIZE. */ > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 > - kmovd %k2, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 > - kmovd %k3, %eax > - testl %eax, %eax > - jnz L(last_vec_x1) > - > - vpcmpb $0, (%rdi), %YMMMATCH, %k4 > - kmovd %k4, %eax > - testl %eax, %eax > - jnz L(last_vec_x0) > - > - /* Align data to 4 * VEC_SIZE for loop with fewer branches. > - There are some overlaps with above if data isn't aligned > - to 4 * VEC_SIZE. */ > - movl %edi, %ecx > - andl $(VEC_SIZE * 4 - 1), %ecx > - jz L(loop_4x_vec) > - > - addq $(VEC_SIZE * 4), %rdi > - addq $(VEC_SIZE * 4), %rdx > - andq $-(VEC_SIZE * 4), %rdi > - subq %rcx, %rdx > + /* Fits in aligning bytes of first cache line. */ > +L(zero_0): > + xorl %eax, %eax > + ret > > - .p2align 4 > -L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > - subq $(VEC_SIZE * 4), %rdi > - subq $(VEC_SIZE * 4), %rdx > - jbe L(last_4x_vec_or_less) > - > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 > - kord %k1, %k2, %k5 > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 > - > - kord %k3, %k4, %k6 > - kortestd %k5, %k6 > - jz L(loop_4x_vec) > - > - /* There is a match. */ > - kmovd %k4, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - kmovd %k3, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - kmovd %k2, %eax > - testl %eax, %eax > - jnz L(last_vec_x1) > - > - kmovd %k1, %eax > - bsrl %eax, %eax > - addq %rdi, %rax > + .p2align 4,, 9 > +L(ret_vec_x0_dec): > + decq %rax > +L(ret_vec_x0): > + lzcntl %ecx, %ecx > + subq %rcx, %rax > ret > > - .p2align 4 > -L(last_4x_vec_or_less): > - addl $(VEC_SIZE * 4), %edx > - cmpl $(VEC_SIZE * 2), %edx > - jbe L(last_2x_vec) > + .p2align 4,, 10 > +L(more_1x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > + /* Align rax (pointer to string). */ > + andq $-VEC_SIZE, %rax > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 > - kmovd %k2, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > + /* Recompute length after aligning. */ > + movq %rax, %rdx > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 > - kmovd %k3, %eax > - testl %eax, %eax > - jnz L(last_vec_x1_check) > - cmpl $(VEC_SIZE * 3), %edx > - jbe L(zero) > + /* Need no matter what. 
*/ > + vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > > - vpcmpb $0, (%rdi), %YMMMATCH, %k4 > - kmovd %k4, %eax > - testl %eax, %eax > - jz L(zero) > - bsrl %eax, %eax > - subq $(VEC_SIZE * 4), %rdx > - addq %rax, %rdx > - jl L(zero) > - addq %rdi, %rax > - ret > + subq %rdi, %rdx > > - .p2align 4 > + cmpq $(VEC_SIZE * 2), %rdx > + ja L(more_2x_vec) > L(last_2x_vec): > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3_check) > + > + /* Must dec rax because L(ret_vec_x0_test) expects it. */ > + decq %rax > cmpl $VEC_SIZE, %edx > - jbe L(zero) > - > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > - jz L(zero) > - bsrl %eax, %eax > - subq $(VEC_SIZE * 2), %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $(VEC_SIZE * 2), %eax > - addq %rdi, %rax > + jbe L(ret_vec_x0_test) > + > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > + > + /* Don't use rax for pointer here because EVEX has better encoding with > + offset % VEC_SIZE == 0. */ > + vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0 > + kmovd %k0, %ecx > + /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ > + lzcntq %rcx, %rcx > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > ret > > - .p2align 4 > -L(last_vec_x0): > - bsrl %eax, %eax > - addq %rdi, %rax > + /* Inexpensive place to put this regarding code size / target alignments > + / ICache NLP. Necessary for 2-byte encoding of jump to page cross > + case which inturn in necessray for hot path (len <= VEC_SIZE) to fit ^^^^^^^^^^^^^^^^^^^ Typo? > + in first cache line. */ > +L(page_cross): > + movq %rax, %rsi > + andq $-VEC_SIZE, %rsi > + vpcmpb $0, (%rsi), %VECMATCH, %k0 > + kmovd %k0, %r8d > + /* Shift out negative alignment (because we are starting from endptr and > + working backwards). */ > + movl %eax, %ecx > + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ > + notl %ecx > + shlxl %ecx, %r8d, %ecx > + cmpq %rdi, %rsi > + ja L(more_1x_vec) > + lzcntl %ecx, %ecx > + cmpl %ecx, %edx > + jle L(zero_1) > + subq %rcx, %rax > ret > > - .p2align 4 > -L(last_vec_x1): > - bsrl %eax, %eax > - addl $VEC_SIZE, %eax > - addq %rdi, %rax > + /* Continue creating zero labels that fit in aligning bytes and get > + 2-byte encoding / are in the same cache line as condition. */ > +L(zero_1): > + xorl %eax, %eax > ret > > - .p2align 4 > -L(last_vec_x2): > - bsrl %eax, %eax > - addl $(VEC_SIZE * 2), %eax > - addq %rdi, %rax > + .p2align 4,, 8 > +L(ret_vec_x1): > + /* This will naturally add 32 to position. */ > + bsrl %ecx, %ecx > + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax > ret > > - .p2align 4 > -L(last_vec_x3): > - bsrl %eax, %eax > - addl $(VEC_SIZE * 3), %eax > - addq %rdi, %rax > - ret > + .p2align 4,, 8 > +L(more_2x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x0_dec) > > - .p2align 4 > -L(last_vec_x1_check): > - bsrl %eax, %eax > - subq $(VEC_SIZE * 3), %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $VEC_SIZE, %eax > - addq %rdi, %rax > - ret > + vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1) > > - .p2align 4 > -L(last_vec_x3_check): > - bsrl %eax, %eax > - subq $VEC_SIZE, %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $(VEC_SIZE * 3), %eax > - addq %rdi, %rax > - ret > + /* Need no matter what. 
*/ > + vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > > - .p2align 4 > -L(zero): > - xorl %eax, %eax > + subq $(VEC_SIZE * 4), %rdx > + ja L(more_4x_vec) > + > + cmpl $(VEC_SIZE * -1), %edx > + jle L(ret_vec_x2_test) > +L(last_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > + > + > + /* Need no matter what. */ > + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 3 + 1), %rax > + subq %rcx, %rax > + cmpq %rax, %rdi > + ja L(zero_1) > ret > > - .p2align 4 > -L(last_vec_or_less_aligned): > - movl %edx, %ecx > - > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > - > - movl $1, %edx > - /* Support rdx << 32. */ > - salq %cl, %rdx > - subq $1, %rdx > - > - kmovd %k1, %eax > - > - /* Remove the trailing bytes. */ > - andl %edx, %eax > - testl %eax, %eax > - jz L(zero) > - > - bsrl %eax, %eax > - addq %rdi, %rax > + .p2align 4,, 8 > +L(ret_vec_x2_test): > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 2 + 1), %rax > + subq %rcx, %rax > + cmpq %rax, %rdi > + ja L(zero_1) > ret > > - .p2align 4 > -L(last_vec_or_less): > - addl $VEC_SIZE, %edx > - > - /* Check for zero length. */ > - testl %edx, %edx > - jz L(zero) > - > - movl %edi, %ecx > - andl $(VEC_SIZE - 1), %ecx > - jz L(last_vec_or_less_aligned) > - > - movl %ecx, %esi > - movl %ecx, %r8d > - addl %edx, %esi > - andq $-VEC_SIZE, %rdi > + .p2align 4,, 8 > +L(ret_vec_x2): > + bsrl %ecx, %ecx > + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax > + ret > > - subl $VEC_SIZE, %esi > - ja L(last_vec_2x_aligned) > + .p2align 4,, 8 > +L(ret_vec_x3): > + bsrl %ecx, %ecx > + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > + ret > > - /* Check the last VEC. */ > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > - kmovd %k1, %eax > + .p2align 4,, 8 > +L(more_4x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > > - /* Remove the leading and trailing bytes. */ > - sarl %cl, %eax > - movl %edx, %ecx > + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > > - movl $1, %edx > - sall %cl, %edx > - subl $1, %edx > + testl %ecx, %ecx > + jnz L(ret_vec_x3) > > - andl %edx, %eax > - testl %eax, %eax > - jz L(zero) > + /* Check if near end before re-aligning (otherwise might do an > + unnecissary loop iteration). */ unnecessary > + addq $-(VEC_SIZE * 4), %rax > + cmpq $(VEC_SIZE * 4), %rdx > + jbe L(last_4x_vec) > > - bsrl %eax, %eax > - addq %rdi, %rax > - addq %r8, %rax > - ret > + decq %rax > + andq $-(VEC_SIZE * 4), %rax > + movq %rdi, %rdx > + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because > + lengths that overflow can be valid and break the comparison. */ > + andq $-(VEC_SIZE * 4), %rdx > > .p2align 4 > -L(last_vec_2x_aligned): > - movl %esi, %ecx > - > - /* Check the last VEC. */ > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 > +L(loop_4x_vec): > + /* Store 1 were not-equals and 0 where equals in k1 (used to mask later > + on). */ > + vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1 > + > + /* VEC(2/3) will have zero-byte where we found a CHAR. */ > + vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2) > + vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3) > + vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4 > + > + /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where > + CHAR is found and VEC(2/3) have zero-byte where CHAR is found. */ > + vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z} > + vptestnmb %VEC(3), %VEC(3), %k2 > + > + /* Any 1s and we found CHAR. 
*/ > + kortestd %k2, %k4 > + jnz L(loop_end) > + > + addq $-(VEC_SIZE * 4), %rax > + cmpq %rdx, %rax > + jne L(loop_4x_vec) > + > + /* Need to re-adjust rdx / rax for L(last_4x_vec). */ > + subq $-(VEC_SIZE * 4), %rdx > + movq %rdx, %rax > + subl %edi, %edx > +L(last_4x_vec): > + > + /* Used no matter what. */ > + vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > > - movl $1, %edx > - sall %cl, %edx > - subl $1, %edx > + cmpl $(VEC_SIZE * 2), %edx > + jbe L(last_2x_vec) > > - kmovd %k1, %eax > + testl %ecx, %ecx > + jnz L(ret_vec_x0_dec) > > - /* Remove the trailing bytes. */ > - andl %edx, %eax > > - testl %eax, %eax > - jnz L(last_vec_x1) > + vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > > - /* Check the second last VEC. */ > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > + testl %ecx, %ecx > + jnz L(ret_vec_x1) > > - movl %r8d, %ecx > + /* Used no matter what. */ > + vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > > - kmovd %k1, %eax > + cmpl $(VEC_SIZE * 3), %edx > + ja L(last_vec) > > - /* Remove the leading bytes. Must use unsigned right shift for > - bsrl below. */ > - shrl %cl, %eax > - testl %eax, %eax > - jz L(zero) > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 2 + 1), %rax > + subq %rcx, %rax > + cmpq %rax, %rdi > + jbe L(ret_1) > + xorl %eax, %eax > +L(ret_1): > + ret > > - bsrl %eax, %eax > - addq %rdi, %rax > - addq %r8, %rax > + .p2align 4,, 6 > +L(loop_end): > + kmovd %k1, %ecx > + notl %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x0_end) > + > + vptestnmb %VEC(2), %VEC(2), %k0 > + kmovd %k0, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1_end) > + > + kmovd %k2, %ecx > + kmovd %k4, %esi > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > + then it won't affect the result in esi (VEC4). If ecx is non-zero > + then CHAR in VEC3 and bsrq will use that position. */ > + salq $32, %rcx > + orq %rsi, %rcx > + bsrq %rcx, %rcx > + addq %rcx, %rax > + ret > + .p2align 4,, 4 > +L(ret_vec_x0_end): > + addq $(VEC_SIZE), %rax > +L(ret_vec_x1_end): > + bsrl %ecx, %ecx > + leaq (VEC_SIZE * 2)(%rax, %rcx), %rax > ret > -END (__memrchr_evex) > + > +END(MEMRCHR) > #endif > -- > 2.34.1 > -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
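The lzcnt return path reviewed above is the core of the rewrite, so a rough C model may help readers who do not live in assembly. This is an illustrative sketch, not the glibc code: the mask layout (bit 31 corresponding to the last byte of the buffer, as produced by the vpcmpb/kmovd pair on the load ending at s + len - 1) and the function name are assumptions for exposition, and __builtin_clz stands in for lzcntl.

    #include <stddef.h>
    #include <stdint.h>

    /* Model of L(ret_vec_x0_test) for len <= 32.  Bit i of `mask` is set
       when byte (s + len - 32 + i) equals the search char, so bit 31 is
       the last byte of the buffer.  */
    static const char *
    ret_vec_x0_test (const char *s, size_t len, uint32_t mask)
    {
      /* Hardware lzcnt returns 32 for mask == 0, which automatically
         fails the length check below -- no separate "no match" branch.
         __builtin_clz is undefined at 0, hence the ternary here.  */
      unsigned int lz = mask ? (unsigned int) __builtin_clz (mask) : 32;
      /* A match before the start of the buffer (from the under-reading
         32-byte load) also shows up as lz >= len and is rejected.  */
      if (len <= lz)
        return NULL;
      return s + len - 1 - lz;
    }

Compared with the old bsrl path, the match position comes from a single subtract off the end pointer, and the no-match and out-of-range cases share one compare; that is the "saves either a branch or multiple instructions" point from the commit message.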
* Re: [PATCH v4 5/8] x86: Optimize memrchr-evex.S 2022-06-07 2:41 ` H.J. Lu @ 2022-06-07 4:09 ` Noah Goldstein 2022-06-07 4:12 ` Noah Goldstein 0 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:09 UTC (permalink / raw) To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 7:41 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Mon, Jun 6, 2022 at 3:37 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > The new code: > > 1. prioritizes smaller user-arg lengths more. > > 2. optimizes target placement more carefully > > 3. reuses logic more > > 4. fixes up various inefficiencies in the logic. The biggest > > case here is the `lzcnt` logic for checking returns which > > saves either a branch or multiple instructions. > > > > The total code size saving is: 263 bytes > > Geometric Mean of all benchmarks New / Old: 0.755 > > > > Regressions: > > There are some regressions. Particularly where the length (user arg > > length) is large but the position of the match char is near the > > begining of the string (in first VEC). This case has roughly a > > beginning Fixed in V5. > > > 20% regression. > > > > This is because the new logic gives the hot path for immediate matches > > to shorter lengths (the more common input). This case has roughly > > a 35% speedup. > > > > Full xcheck passes on x86_64. > > --- > > sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++------------ > > 1 file changed, 268 insertions(+), 271 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S > > index 0b99709c6b..ad541c0e50 100644 > > --- a/sysdeps/x86_64/multiarch/memrchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S > > @@ -19,319 +19,316 @@ > > #if IS_IN (libc) > > > > # include <sysdep.h> > > +# include "evex256-vecs.h" > > +# if VEC_SIZE != 32 > > +# error "VEC_SIZE != 32 unimplemented" > > +# endif > > + > > +# ifndef MEMRCHR > > +# define MEMRCHR __memrchr_evex > > +# endif > > + > > +# define PAGE_SIZE 4096 > > +# define VECMATCH VEC(0) > > + > > + .section SECTION(.text), "ax", @progbits > > +ENTRY_P2ALIGN(MEMRCHR, 6) > > +# ifdef __ILP32__ > > + /* Clear upper bits. */ > > + and %RDX_LP, %RDX_LP > > +# else > > + test %RDX_LP, %RDX_LP > > +# endif > > + jz L(zero_0) > > + > > + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a > > + correct page cross check and 2) it correctly sets up end ptr to be > > + subtract by lzcnt aligned. */ > > + leaq -1(%rdi, %rdx), %rax > > + vpbroadcastb %esi, %VECMATCH > > + > > + /* Check if we can load 1x VEC without cross a page. */ > > + testl $(PAGE_SIZE - VEC_SIZE), %eax > > + jz L(page_cross) > > + > > + /* Don't use rax for pointer here because EVEX has better encoding with > > + offset % VEC_SIZE == 0. */ > > + vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > + > > + /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ > > + cmpq $VEC_SIZE, %rdx > > + ja L(more_1x_vec) > > +L(ret_vec_x0_test): > > + > > + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which > > + will gurantee edx (len) is less than it. */ > guarantee Fixed in V5. > > + lzcntl %ecx, %ecx > > + cmpl %ecx, %edx > > + jle L(zero_0) > > + subq %rcx, %rax > > + ret > > > > -# define VMOVA vmovdqa64 > > - > > -# define YMMMATCH ymm16 > > - > > -# define VEC_SIZE 32 > > - > > - .section .text.evex,"ax",@progbits > > -ENTRY (__memrchr_evex) > > - /* Broadcast CHAR to YMMMATCH. 
*/ > > - vpbroadcastb %esi, %YMMMATCH > > - > > - sub $VEC_SIZE, %RDX_LP > > - jbe L(last_vec_or_less) > > - > > - add %RDX_LP, %RDI_LP > > - > > - /* Check the last VEC_SIZE bytes. */ > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > - kmovd %k1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x0) > > - > > - subq $(VEC_SIZE * 4), %rdi > > - movl %edi, %ecx > > - andl $(VEC_SIZE - 1), %ecx > > - jz L(aligned_more) > > - > > - /* Align data for aligned loads in the loop. */ > > - addq $VEC_SIZE, %rdi > > - addq $VEC_SIZE, %rdx > > - andq $-VEC_SIZE, %rdi > > - subq %rcx, %rdx > > - > > - .p2align 4 > > -L(aligned_more): > > - subq $(VEC_SIZE * 4), %rdx > > - jbe L(last_4x_vec_or_less) > > - > > - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time > > - since data is only aligned to VEC_SIZE. */ > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > > - kmovd %k1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3) > > - > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 > > - kmovd %k2, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x2) > > - > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 > > - kmovd %k3, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x1) > > - > > - vpcmpb $0, (%rdi), %YMMMATCH, %k4 > > - kmovd %k4, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x0) > > - > > - /* Align data to 4 * VEC_SIZE for loop with fewer branches. > > - There are some overlaps with above if data isn't aligned > > - to 4 * VEC_SIZE. */ > > - movl %edi, %ecx > > - andl $(VEC_SIZE * 4 - 1), %ecx > > - jz L(loop_4x_vec) > > - > > - addq $(VEC_SIZE * 4), %rdi > > - addq $(VEC_SIZE * 4), %rdx > > - andq $-(VEC_SIZE * 4), %rdi > > - subq %rcx, %rdx > > + /* Fits in aligning bytes of first cache line. */ > > +L(zero_0): > > + xorl %eax, %eax > > + ret > > > > - .p2align 4 > > -L(loop_4x_vec): > > - /* Compare 4 * VEC at a time forward. */ > > - subq $(VEC_SIZE * 4), %rdi > > - subq $(VEC_SIZE * 4), %rdx > > - jbe L(last_4x_vec_or_less) > > - > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 > > - kord %k1, %k2, %k5 > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 > > - > > - kord %k3, %k4, %k6 > > - kortestd %k5, %k6 > > - jz L(loop_4x_vec) > > - > > - /* There is a match. */ > > - kmovd %k4, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3) > > - > > - kmovd %k3, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x2) > > - > > - kmovd %k2, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x1) > > - > > - kmovd %k1, %eax > > - bsrl %eax, %eax > > - addq %rdi, %rax > > + .p2align 4,, 9 > > +L(ret_vec_x0_dec): > > + decq %rax > > +L(ret_vec_x0): > > + lzcntl %ecx, %ecx > > + subq %rcx, %rax > > ret > > > > - .p2align 4 > > -L(last_4x_vec_or_less): > > - addl $(VEC_SIZE * 4), %edx > > - cmpl $(VEC_SIZE * 2), %edx > > - jbe L(last_2x_vec) > > + .p2align 4,, 10 > > +L(more_1x_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0) > > > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > > - kmovd %k1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3) > > + /* Align rax (pointer to string). */ > > + andq $-VEC_SIZE, %rax > > > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 > > - kmovd %k2, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x2) > > + /* Recompute length after aligning. 
*/ > > + movq %rax, %rdx > > > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 > > - kmovd %k3, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x1_check) > > - cmpl $(VEC_SIZE * 3), %edx > > - jbe L(zero) > > + /* Need no matter what. */ > > + vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > > > - vpcmpb $0, (%rdi), %YMMMATCH, %k4 > > - kmovd %k4, %eax > > - testl %eax, %eax > > - jz L(zero) > > - bsrl %eax, %eax > > - subq $(VEC_SIZE * 4), %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addq %rdi, %rax > > - ret > > + subq %rdi, %rdx > > > > - .p2align 4 > > + cmpq $(VEC_SIZE * 2), %rdx > > + ja L(more_2x_vec) > > L(last_2x_vec): > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > > - kmovd %k1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3_check) > > + > > + /* Must dec rax because L(ret_vec_x0_test) expects it. */ > > + decq %rax > > cmpl $VEC_SIZE, %edx > > - jbe L(zero) > > - > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 > > - kmovd %k1, %eax > > - testl %eax, %eax > > - jz L(zero) > > - bsrl %eax, %eax > > - subq $(VEC_SIZE * 2), %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addl $(VEC_SIZE * 2), %eax > > - addq %rdi, %rax > > + jbe L(ret_vec_x0_test) > > + > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0) > > + > > + /* Don't use rax for pointer here because EVEX has better encoding with > > + offset % VEC_SIZE == 0. */ > > + vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > + /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ > > + lzcntq %rcx, %rcx > > + cmpl %ecx, %edx > > + jle L(zero_0) > > + subq %rcx, %rax > > ret > > > > - .p2align 4 > > -L(last_vec_x0): > > - bsrl %eax, %eax > > - addq %rdi, %rax > > + /* Inexpensive place to put this regarding code size / target alignments > > + / ICache NLP. Necessary for 2-byte encoding of jump to page cross > > + case which inturn in necessray for hot path (len <= VEC_SIZE) to fit > ^^^^^^^^^^^^^^^^^^^ Typo? Missed this in V5. Will fix in V6 (will wait for other feedback). > > + in first cache line. */ > > +L(page_cross): > > + movq %rax, %rsi > > + andq $-VEC_SIZE, %rsi > > + vpcmpb $0, (%rsi), %VECMATCH, %k0 > > + kmovd %k0, %r8d > > + /* Shift out negative alignment (because we are starting from endptr and > > + working backwards). */ > > + movl %eax, %ecx > > + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ > > + notl %ecx > > + shlxl %ecx, %r8d, %ecx > > + cmpq %rdi, %rsi > > + ja L(more_1x_vec) > > + lzcntl %ecx, %ecx > > + cmpl %ecx, %edx > > + jle L(zero_1) > > + subq %rcx, %rax > > ret > > > > - .p2align 4 > > -L(last_vec_x1): > > - bsrl %eax, %eax > > - addl $VEC_SIZE, %eax > > - addq %rdi, %rax > > + /* Continue creating zero labels that fit in aligning bytes and get > > + 2-byte encoding / are in the same cache line as condition. */ > > +L(zero_1): > > + xorl %eax, %eax > > ret > > > > - .p2align 4 > > -L(last_vec_x2): > > - bsrl %eax, %eax > > - addl $(VEC_SIZE * 2), %eax > > - addq %rdi, %rax > > + .p2align 4,, 8 > > +L(ret_vec_x1): > > + /* This will naturally add 32 to position. 
*/ > > + bsrl %ecx, %ecx > > + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax > > ret > > > > - .p2align 4 > > -L(last_vec_x3): > > - bsrl %eax, %eax > > - addl $(VEC_SIZE * 3), %eax > > - addq %rdi, %rax > > - ret > > + .p2align 4,, 8 > > +L(more_2x_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0_dec) > > > > - .p2align 4 > > -L(last_vec_x1_check): > > - bsrl %eax, %eax > > - subq $(VEC_SIZE * 3), %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addl $VEC_SIZE, %eax > > - addq %rdi, %rax > > - ret > > + vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x1) > > > > - .p2align 4 > > -L(last_vec_x3_check): > > - bsrl %eax, %eax > > - subq $VEC_SIZE, %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addl $(VEC_SIZE * 3), %eax > > - addq %rdi, %rax > > - ret > > + /* Need no matter what. */ > > + vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > > > - .p2align 4 > > -L(zero): > > - xorl %eax, %eax > > + subq $(VEC_SIZE * 4), %rdx > > + ja L(more_4x_vec) > > + > > + cmpl $(VEC_SIZE * -1), %edx > > + jle L(ret_vec_x2_test) > > +L(last_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x2) > > + > > + > > + /* Need no matter what. */ > > + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > + lzcntl %ecx, %ecx > > + subq $(VEC_SIZE * 3 + 1), %rax > > + subq %rcx, %rax > > + cmpq %rax, %rdi > > + ja L(zero_1) > > ret > > > > - .p2align 4 > > -L(last_vec_or_less_aligned): > > - movl %edx, %ecx > > - > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > - > > - movl $1, %edx > > - /* Support rdx << 32. */ > > - salq %cl, %rdx > > - subq $1, %rdx > > - > > - kmovd %k1, %eax > > - > > - /* Remove the trailing bytes. */ > > - andl %edx, %eax > > - testl %eax, %eax > > - jz L(zero) > > - > > - bsrl %eax, %eax > > - addq %rdi, %rax > > + .p2align 4,, 8 > > +L(ret_vec_x2_test): > > + lzcntl %ecx, %ecx > > + subq $(VEC_SIZE * 2 + 1), %rax > > + subq %rcx, %rax > > + cmpq %rax, %rdi > > + ja L(zero_1) > > ret > > > > - .p2align 4 > > -L(last_vec_or_less): > > - addl $VEC_SIZE, %edx > > - > > - /* Check for zero length. */ > > - testl %edx, %edx > > - jz L(zero) > > - > > - movl %edi, %ecx > > - andl $(VEC_SIZE - 1), %ecx > > - jz L(last_vec_or_less_aligned) > > - > > - movl %ecx, %esi > > - movl %ecx, %r8d > > - addl %edx, %esi > > - andq $-VEC_SIZE, %rdi > > + .p2align 4,, 8 > > +L(ret_vec_x2): > > + bsrl %ecx, %ecx > > + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax > > + ret > > > > - subl $VEC_SIZE, %esi > > - ja L(last_vec_2x_aligned) > > + .p2align 4,, 8 > > +L(ret_vec_x3): > > + bsrl %ecx, %ecx > > + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > > + ret > > > > - /* Check the last VEC. */ > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > - kmovd %k1, %eax > > + .p2align 4,, 8 > > +L(more_4x_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x2) > > > > - /* Remove the leading and trailing bytes. */ > > - sarl %cl, %eax > > - movl %edx, %ecx > > + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > > > - movl $1, %edx > > - sall %cl, %edx > > - subl $1, %edx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x3) > > > > - andl %edx, %eax > > - testl %eax, %eax > > - jz L(zero) > > + /* Check if near end before re-aligning (otherwise might do an > > + unnecissary loop iteration). 
*/ > unnecessary > > + addq $-(VEC_SIZE * 4), %rax > > + cmpq $(VEC_SIZE * 4), %rdx > > + jbe L(last_4x_vec) > > > > - bsrl %eax, %eax > > - addq %rdi, %rax > > - addq %r8, %rax > > - ret > > + decq %rax > > + andq $-(VEC_SIZE * 4), %rax > > + movq %rdi, %rdx > > + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because > > + lengths that overflow can be valid and break the comparison. */ > > + andq $-(VEC_SIZE * 4), %rdx > > > > .p2align 4 > > -L(last_vec_2x_aligned): > > - movl %esi, %ecx > > - > > - /* Check the last VEC. */ > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 > > +L(loop_4x_vec): > > + /* Store 1 were not-equals and 0 where equals in k1 (used to mask later > > + on). */ > > + vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1 > > + > > + /* VEC(2/3) will have zero-byte where we found a CHAR. */ > > + vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2) > > + vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3) > > + vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4 > > + > > + /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where > > + CHAR is found and VEC(2/3) have zero-byte where CHAR is found. */ > > + vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z} > > + vptestnmb %VEC(3), %VEC(3), %k2 > > + > > + /* Any 1s and we found CHAR. */ > > + kortestd %k2, %k4 > > + jnz L(loop_end) > > + > > + addq $-(VEC_SIZE * 4), %rax > > + cmpq %rdx, %rax > > + jne L(loop_4x_vec) > > + > > + /* Need to re-adjust rdx / rax for L(last_4x_vec). */ > > + subq $-(VEC_SIZE * 4), %rdx > > + movq %rdx, %rax > > + subl %edi, %edx > > +L(last_4x_vec): > > + > > + /* Used no matter what. */ > > + vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > > > - movl $1, %edx > > - sall %cl, %edx > > - subl $1, %edx > > + cmpl $(VEC_SIZE * 2), %edx > > + jbe L(last_2x_vec) > > > > - kmovd %k1, %eax > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0_dec) > > > > - /* Remove the trailing bytes. */ > > - andl %edx, %eax > > > > - testl %eax, %eax > > - jnz L(last_vec_x1) > > + vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > > > - /* Check the second last VEC. */ > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > + testl %ecx, %ecx > > + jnz L(ret_vec_x1) > > > > - movl %r8d, %ecx > > + /* Used no matter what. */ > > + vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > > > - kmovd %k1, %eax > > + cmpl $(VEC_SIZE * 3), %edx > > + ja L(last_vec) > > > > - /* Remove the leading bytes. Must use unsigned right shift for > > - bsrl below. */ > > - shrl %cl, %eax > > - testl %eax, %eax > > - jz L(zero) > > + lzcntl %ecx, %ecx > > + subq $(VEC_SIZE * 2 + 1), %rax > > + subq %rcx, %rax > > + cmpq %rax, %rdi > > + jbe L(ret_1) > > + xorl %eax, %eax > > +L(ret_1): > > + ret > > > > - bsrl %eax, %eax > > - addq %rdi, %rax > > - addq %r8, %rax > > + .p2align 4,, 6 > > +L(loop_end): > > + kmovd %k1, %ecx > > + notl %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0_end) > > + > > + vptestnmb %VEC(2), %VEC(2), %k0 > > + kmovd %k0, %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x1_end) > > + > > + kmovd %k2, %ecx > > + kmovd %k4, %esi > > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > > + then it won't affect the result in esi (VEC4). If ecx is non-zero > > + then CHAR in VEC3 and bsrq will use that position. 
*/ > > + salq $32, %rcx > > + orq %rsi, %rcx > > + bsrq %rcx, %rcx > > + addq %rcx, %rax > > + ret > > + .p2align 4,, 4 > > +L(ret_vec_x0_end): > > + addq $(VEC_SIZE), %rax > > +L(ret_vec_x1_end): > > + bsrl %ecx, %ecx > > + leaq (VEC_SIZE * 2)(%rax, %rcx), %rax > > ret > > -END (__memrchr_evex) > > + > > +END(MEMRCHR) > > #endif > > -- > > 2.34.1 > > > > > -- > H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
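The page-cross path that the flagged typo sits in is subtle enough to deserve a model as well. A hedged C sketch follows; vec_mask is a hypothetical scalar stand-in for vpcmpb + kmovd, not a glibc helper, and the constants mirror the macros in the patch.

    #include <stddef.h>
    #include <stdint.h>

    #define VEC_SIZE 32
    #define PAGE_SIZE 4096

    /* Scalar stand-in for vpcmpb + kmovd: bit i set iff base[i] == c.  */
    static uint32_t
    vec_mask (const unsigned char *base, int c)
    {
      uint32_t m = 0;
      for (int i = 0; i < VEC_SIZE; i++)
        m |= (uint32_t) (base[i] == (unsigned char) c) << i;
      return m;
    }

    /* Model of the entry check plus the shlx realignment.  `last` is
       s + len - 1, the byte the backwards search starts from.  */
    static uint32_t
    page_cross_mask (const unsigned char *last, int c)
    {
      /* Conservative cross test: fires whenever `last` lies in the first
         VEC_SIZE bytes of a page, i.e. whenever an unaligned VEC-sized
         load ending at `last` could start on the preceding page.  */
      if (((uintptr_t) last & (PAGE_SIZE - VEC_SIZE)) != 0)
        /* Safe: the whole load stays on `last`s page.  */
        return vec_mask (last - (VEC_SIZE - 1), c);

      /* Otherwise load the aligned vector containing `last` and shift
         the mask so the bit for `last` lands in bit 31; bytes past the
         end fall off the top.  The count is (-end) mod 32, and since
         `last` already is end - 1, the identity -x = ~(x - 1) makes the
         count just ~last (shlx masks it to 5 bits anyway).  */
      const unsigned char *base
        = (const unsigned char *) ((uintptr_t) last & -(uintptr_t) VEC_SIZE);
      uint32_t m = vec_mask (base, c);
      return m << ((~(uintptr_t) last) & (VEC_SIZE - 1));
    }

After the shift, lzcnt on the result still measures distance back from `last`, so the same return sequence serves both the straight-line and page-cross cases; the real code additionally falls back into L(more_1x_vec) when the aligned vector does not reach the start of the buffer, and matches before the start are caught by the same length compare as in the model above.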
* Re: [PATCH v4 5/8] x86: Optimize memrchr-evex.S 2022-06-07 4:09 ` Noah Goldstein @ 2022-06-07 4:12 ` Noah Goldstein 0 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:12 UTC (permalink / raw) To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 9:09 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Mon, Jun 6, 2022 at 7:41 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Mon, Jun 6, 2022 at 3:37 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > The new code: > > > 1. prioritizes smaller user-arg lengths more. > > > 2. optimizes target placement more carefully > > > 3. reuses logic more > > > 4. fixes up various inefficiencies in the logic. The biggest > > > case here is the `lzcnt` logic for checking returns which > > > saves either a branch or multiple instructions. > > > > > > The total code size saving is: 263 bytes > > > Geometric Mean of all benchmarks New / Old: 0.755 > > > > > > Regressions: > > > There are some regressions. Particularly where the length (user arg > > > length) is large but the position of the match char is near the > > > begining of the string (in first VEC). This case has roughly a > > > > beginning > > Fixed in V5. > > > > > 20% regression. > > > > > > This is because the new logic gives the hot path for immediate matches > > > to shorter lengths (the more common input). This case has roughly > > > a 35% speedup. > > > > > > Full xcheck passes on x86_64. > > > --- > > > sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++------------ > > > 1 file changed, 268 insertions(+), 271 deletions(-) > > > > > > diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S > > > index 0b99709c6b..ad541c0e50 100644 > > > --- a/sysdeps/x86_64/multiarch/memrchr-evex.S > > > +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S > > > @@ -19,319 +19,316 @@ > > > #if IS_IN (libc) > > > > > > # include <sysdep.h> > > > +# include "evex256-vecs.h" > > > +# if VEC_SIZE != 32 > > > +# error "VEC_SIZE != 32 unimplemented" > > > +# endif > > > + > > > +# ifndef MEMRCHR > > > +# define MEMRCHR __memrchr_evex > > > +# endif > > > + > > > +# define PAGE_SIZE 4096 > > > +# define VECMATCH VEC(0) > > > + > > > + .section SECTION(.text), "ax", @progbits > > > +ENTRY_P2ALIGN(MEMRCHR, 6) > > > +# ifdef __ILP32__ > > > + /* Clear upper bits. */ > > > + and %RDX_LP, %RDX_LP > > > +# else > > > + test %RDX_LP, %RDX_LP > > > +# endif > > > + jz L(zero_0) > > > + > > > + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a > > > + correct page cross check and 2) it correctly sets up end ptr to be > > > + subtract by lzcnt aligned. */ > > > + leaq -1(%rdi, %rdx), %rax > > > + vpbroadcastb %esi, %VECMATCH > > > + > > > + /* Check if we can load 1x VEC without cross a page. */ > > > + testl $(PAGE_SIZE - VEC_SIZE), %eax > > > + jz L(page_cross) > > > + > > > + /* Don't use rax for pointer here because EVEX has better encoding with > > > + offset % VEC_SIZE == 0. */ > > > + vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 > > > + kmovd %k0, %ecx > > > + > > > + /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ > > > + cmpq $VEC_SIZE, %rdx > > > + ja L(more_1x_vec) > > > +L(ret_vec_x0_test): > > > + > > > + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which > > > + will gurantee edx (len) is less than it. */ > > guarantee > > Fixed in V5. 
> > > > + lzcntl %ecx, %ecx > > > + cmpl %ecx, %edx > > > + jle L(zero_0) > > > + subq %rcx, %rax > > > + ret > > > > > > -# define VMOVA vmovdqa64 > > > - > > > -# define YMMMATCH ymm16 > > > - > > > -# define VEC_SIZE 32 > > > - > > > - .section .text.evex,"ax",@progbits > > > -ENTRY (__memrchr_evex) > > > - /* Broadcast CHAR to YMMMATCH. */ > > > - vpbroadcastb %esi, %YMMMATCH > > > - > > > - sub $VEC_SIZE, %RDX_LP > > > - jbe L(last_vec_or_less) > > > - > > > - add %RDX_LP, %RDI_LP > > > - > > > - /* Check the last VEC_SIZE bytes. */ > > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > > - kmovd %k1, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x0) > > > - > > > - subq $(VEC_SIZE * 4), %rdi > > > - movl %edi, %ecx > > > - andl $(VEC_SIZE - 1), %ecx > > > - jz L(aligned_more) > > > - > > > - /* Align data for aligned loads in the loop. */ > > > - addq $VEC_SIZE, %rdi > > > - addq $VEC_SIZE, %rdx > > > - andq $-VEC_SIZE, %rdi > > > - subq %rcx, %rdx > > > - > > > - .p2align 4 > > > -L(aligned_more): > > > - subq $(VEC_SIZE * 4), %rdx > > > - jbe L(last_4x_vec_or_less) > > > - > > > - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time > > > - since data is only aligned to VEC_SIZE. */ > > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > > > - kmovd %k1, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x3) > > > - > > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 > > > - kmovd %k2, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x2) > > > - > > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 > > > - kmovd %k3, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x1) > > > - > > > - vpcmpb $0, (%rdi), %YMMMATCH, %k4 > > > - kmovd %k4, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x0) > > > - > > > - /* Align data to 4 * VEC_SIZE for loop with fewer branches. > > > - There are some overlaps with above if data isn't aligned > > > - to 4 * VEC_SIZE. */ > > > - movl %edi, %ecx > > > - andl $(VEC_SIZE * 4 - 1), %ecx > > > - jz L(loop_4x_vec) > > > - > > > - addq $(VEC_SIZE * 4), %rdi > > > - addq $(VEC_SIZE * 4), %rdx > > > - andq $-(VEC_SIZE * 4), %rdi > > > - subq %rcx, %rdx > > > + /* Fits in aligning bytes of first cache line. */ > > > +L(zero_0): > > > + xorl %eax, %eax > > > + ret > > > > > > - .p2align 4 > > > -L(loop_4x_vec): > > > - /* Compare 4 * VEC at a time forward. */ > > > - subq $(VEC_SIZE * 4), %rdi > > > - subq $(VEC_SIZE * 4), %rdx > > > - jbe L(last_4x_vec_or_less) > > > - > > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 > > > - kord %k1, %k2, %k5 > > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 > > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 > > > - > > > - kord %k3, %k4, %k6 > > > - kortestd %k5, %k6 > > > - jz L(loop_4x_vec) > > > - > > > - /* There is a match. 
*/ > > > - kmovd %k4, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x3) > > > - > > > - kmovd %k3, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x2) > > > - > > > - kmovd %k2, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x1) > > > - > > > - kmovd %k1, %eax > > > - bsrl %eax, %eax > > > - addq %rdi, %rax > > > + .p2align 4,, 9 > > > +L(ret_vec_x0_dec): > > > + decq %rax > > > +L(ret_vec_x0): > > > + lzcntl %ecx, %ecx > > > + subq %rcx, %rax > > > ret > > > > > > - .p2align 4 > > > -L(last_4x_vec_or_less): > > > - addl $(VEC_SIZE * 4), %edx > > > - cmpl $(VEC_SIZE * 2), %edx > > > - jbe L(last_2x_vec) > > > + .p2align 4,, 10 > > > +L(more_1x_vec): > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x0) > > > > > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > > > - kmovd %k1, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x3) > > > + /* Align rax (pointer to string). */ > > > + andq $-VEC_SIZE, %rax > > > > > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 > > > - kmovd %k2, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x2) > > > + /* Recompute length after aligning. */ > > > + movq %rax, %rdx > > > > > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 > > > - kmovd %k3, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x1_check) > > > - cmpl $(VEC_SIZE * 3), %edx > > > - jbe L(zero) > > > + /* Need no matter what. */ > > > + vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0 > > > + kmovd %k0, %ecx > > > > > > - vpcmpb $0, (%rdi), %YMMMATCH, %k4 > > > - kmovd %k4, %eax > > > - testl %eax, %eax > > > - jz L(zero) > > > - bsrl %eax, %eax > > > - subq $(VEC_SIZE * 4), %rdx > > > - addq %rax, %rdx > > > - jl L(zero) > > > - addq %rdi, %rax > > > - ret > > > + subq %rdi, %rdx > > > > > > - .p2align 4 > > > + cmpq $(VEC_SIZE * 2), %rdx > > > + ja L(more_2x_vec) > > > L(last_2x_vec): > > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > > > - kmovd %k1, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x3_check) > > > + > > > + /* Must dec rax because L(ret_vec_x0_test) expects it. */ > > > + decq %rax > > > cmpl $VEC_SIZE, %edx > > > - jbe L(zero) > > > - > > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 > > > - kmovd %k1, %eax > > > - testl %eax, %eax > > > - jz L(zero) > > > - bsrl %eax, %eax > > > - subq $(VEC_SIZE * 2), %rdx > > > - addq %rax, %rdx > > > - jl L(zero) > > > - addl $(VEC_SIZE * 2), %eax > > > - addq %rdi, %rax > > > + jbe L(ret_vec_x0_test) > > > + > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x0) > > > + > > > + /* Don't use rax for pointer here because EVEX has better encoding with > > > + offset % VEC_SIZE == 0. */ > > > + vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0 > > > + kmovd %k0, %ecx > > > + /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ > > > + lzcntq %rcx, %rcx > > > + cmpl %ecx, %edx > > > + jle L(zero_0) > > > + subq %rcx, %rax > > > ret > > > > > > - .p2align 4 > > > -L(last_vec_x0): > > > - bsrl %eax, %eax > > > - addq %rdi, %rax > > > + /* Inexpensive place to put this regarding code size / target alignments > > > + / ICache NLP. Necessary for 2-byte encoding of jump to page cross > > > + case which inturn in necessray for hot path (len <= VEC_SIZE) to fit > > ^^^^^^^^^^^^^^^^^^^ Typo? > > Missed this in V5. Will fix in V6 (will wait for other feedback). Fixed in v6 (in avx2 version as well). > > > + in first cache line. 
*/ > > > +L(page_cross): > > > + movq %rax, %rsi > > > + andq $-VEC_SIZE, %rsi > > > + vpcmpb $0, (%rsi), %VECMATCH, %k0 > > > + kmovd %k0, %r8d > > > + /* Shift out negative alignment (because we are starting from endptr and > > > + working backwards). */ > > > + movl %eax, %ecx > > > + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ > > > + notl %ecx > > > + shlxl %ecx, %r8d, %ecx > > > + cmpq %rdi, %rsi > > > + ja L(more_1x_vec) > > > + lzcntl %ecx, %ecx > > > + cmpl %ecx, %edx > > > + jle L(zero_1) > > > + subq %rcx, %rax > > > ret > > > > > > - .p2align 4 > > > -L(last_vec_x1): > > > - bsrl %eax, %eax > > > - addl $VEC_SIZE, %eax > > > - addq %rdi, %rax > > > + /* Continue creating zero labels that fit in aligning bytes and get > > > + 2-byte encoding / are in the same cache line as condition. */ > > > +L(zero_1): > > > + xorl %eax, %eax > > > ret > > > > > > - .p2align 4 > > > -L(last_vec_x2): > > > - bsrl %eax, %eax > > > - addl $(VEC_SIZE * 2), %eax > > > - addq %rdi, %rax > > > + .p2align 4,, 8 > > > +L(ret_vec_x1): > > > + /* This will naturally add 32 to position. */ > > > + bsrl %ecx, %ecx > > > + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax > > > ret > > > > > > - .p2align 4 > > > -L(last_vec_x3): > > > - bsrl %eax, %eax > > > - addl $(VEC_SIZE * 3), %eax > > > - addq %rdi, %rax > > > - ret > > > + .p2align 4,, 8 > > > +L(more_2x_vec): > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x0_dec) > > > > > > - .p2align 4 > > > -L(last_vec_x1_check): > > > - bsrl %eax, %eax > > > - subq $(VEC_SIZE * 3), %rdx > > > - addq %rax, %rdx > > > - jl L(zero) > > > - addl $VEC_SIZE, %eax > > > - addq %rdi, %rax > > > - ret > > > + vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0 > > > + kmovd %k0, %ecx > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x1) > > > > > > - .p2align 4 > > > -L(last_vec_x3_check): > > > - bsrl %eax, %eax > > > - subq $VEC_SIZE, %rdx > > > - addq %rax, %rdx > > > - jl L(zero) > > > - addl $(VEC_SIZE * 3), %eax > > > - addq %rdi, %rax > > > - ret > > > + /* Need no matter what. */ > > > + vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0 > > > + kmovd %k0, %ecx > > > > > > - .p2align 4 > > > -L(zero): > > > - xorl %eax, %eax > > > + subq $(VEC_SIZE * 4), %rdx > > > + ja L(more_4x_vec) > > > + > > > + cmpl $(VEC_SIZE * -1), %edx > > > + jle L(ret_vec_x2_test) > > > +L(last_vec): > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x2) > > > + > > > + > > > + /* Need no matter what. */ > > > + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 > > > + kmovd %k0, %ecx > > > + lzcntl %ecx, %ecx > > > + subq $(VEC_SIZE * 3 + 1), %rax > > > + subq %rcx, %rax > > > + cmpq %rax, %rdi > > > + ja L(zero_1) > > > ret > > > > > > - .p2align 4 > > > -L(last_vec_or_less_aligned): > > > - movl %edx, %ecx > > > - > > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > > - > > > - movl $1, %edx > > > - /* Support rdx << 32. */ > > > - salq %cl, %rdx > > > - subq $1, %rdx > > > - > > > - kmovd %k1, %eax > > > - > > > - /* Remove the trailing bytes. */ > > > - andl %edx, %eax > > > - testl %eax, %eax > > > - jz L(zero) > > > - > > > - bsrl %eax, %eax > > > - addq %rdi, %rax > > > + .p2align 4,, 8 > > > +L(ret_vec_x2_test): > > > + lzcntl %ecx, %ecx > > > + subq $(VEC_SIZE * 2 + 1), %rax > > > + subq %rcx, %rax > > > + cmpq %rax, %rdi > > > + ja L(zero_1) > > > ret > > > > > > - .p2align 4 > > > -L(last_vec_or_less): > > > - addl $VEC_SIZE, %edx > > > - > > > - /* Check for zero length. 
*/ > > > - testl %edx, %edx > > > - jz L(zero) > > > - > > > - movl %edi, %ecx > > > - andl $(VEC_SIZE - 1), %ecx > > > - jz L(last_vec_or_less_aligned) > > > - > > > - movl %ecx, %esi > > > - movl %ecx, %r8d > > > - addl %edx, %esi > > > - andq $-VEC_SIZE, %rdi > > > + .p2align 4,, 8 > > > +L(ret_vec_x2): > > > + bsrl %ecx, %ecx > > > + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax > > > + ret > > > > > > - subl $VEC_SIZE, %esi > > > - ja L(last_vec_2x_aligned) > > > + .p2align 4,, 8 > > > +L(ret_vec_x3): > > > + bsrl %ecx, %ecx > > > + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > > > + ret > > > > > > - /* Check the last VEC. */ > > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > > - kmovd %k1, %eax > > > + .p2align 4,, 8 > > > +L(more_4x_vec): > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x2) > > > > > > - /* Remove the leading and trailing bytes. */ > > > - sarl %cl, %eax > > > - movl %edx, %ecx > > > + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 > > > + kmovd %k0, %ecx > > > > > > - movl $1, %edx > > > - sall %cl, %edx > > > - subl $1, %edx > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x3) > > > > > > - andl %edx, %eax > > > - testl %eax, %eax > > > - jz L(zero) > > > + /* Check if near end before re-aligning (otherwise might do an > > > + unnecissary loop iteration). */ > > unnecessary > > > + addq $-(VEC_SIZE * 4), %rax > > > + cmpq $(VEC_SIZE * 4), %rdx > > > + jbe L(last_4x_vec) > > > > > > - bsrl %eax, %eax > > > - addq %rdi, %rax > > > - addq %r8, %rax > > > - ret > > > + decq %rax > > > + andq $-(VEC_SIZE * 4), %rax > > > + movq %rdi, %rdx > > > + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because > > > + lengths that overflow can be valid and break the comparison. */ > > > + andq $-(VEC_SIZE * 4), %rdx > > > > > > .p2align 4 > > > -L(last_vec_2x_aligned): > > > - movl %esi, %ecx > > > - > > > - /* Check the last VEC. */ > > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 > > > +L(loop_4x_vec): > > > + /* Store 1 were not-equals and 0 where equals in k1 (used to mask later > > > + on). */ > > > + vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1 > > > + > > > + /* VEC(2/3) will have zero-byte where we found a CHAR. */ > > > + vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2) > > > + vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3) > > > + vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4 > > > + > > > + /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where > > > + CHAR is found and VEC(2/3) have zero-byte where CHAR is found. */ > > > + vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z} > > > + vptestnmb %VEC(3), %VEC(3), %k2 > > > + > > > + /* Any 1s and we found CHAR. */ > > > + kortestd %k2, %k4 > > > + jnz L(loop_end) > > > + > > > + addq $-(VEC_SIZE * 4), %rax > > > + cmpq %rdx, %rax > > > + jne L(loop_4x_vec) > > > + > > > + /* Need to re-adjust rdx / rax for L(last_4x_vec). */ > > > + subq $-(VEC_SIZE * 4), %rdx > > > + movq %rdx, %rax > > > + subl %edi, %edx > > > +L(last_4x_vec): > > > + > > > + /* Used no matter what. */ > > > + vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0 > > > + kmovd %k0, %ecx > > > > > > - movl $1, %edx > > > - sall %cl, %edx > > > - subl $1, %edx > > > + cmpl $(VEC_SIZE * 2), %edx > > > + jbe L(last_2x_vec) > > > > > > - kmovd %k1, %eax > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x0_dec) > > > > > > - /* Remove the trailing bytes. 
*/ > > > - andl %edx, %eax > > > > > > - testl %eax, %eax > > > - jnz L(last_vec_x1) > > > + vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0 > > > + kmovd %k0, %ecx > > > > > > - /* Check the second last VEC. */ > > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x1) > > > > > > - movl %r8d, %ecx > > > + /* Used no matter what. */ > > > + vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 > > > + kmovd %k0, %ecx > > > > > > - kmovd %k1, %eax > > > + cmpl $(VEC_SIZE * 3), %edx > > > + ja L(last_vec) > > > > > > - /* Remove the leading bytes. Must use unsigned right shift for > > > - bsrl below. */ > > > - shrl %cl, %eax > > > - testl %eax, %eax > > > - jz L(zero) > > > + lzcntl %ecx, %ecx > > > + subq $(VEC_SIZE * 2 + 1), %rax > > > + subq %rcx, %rax > > > + cmpq %rax, %rdi > > > + jbe L(ret_1) > > > + xorl %eax, %eax > > > +L(ret_1): > > > + ret > > > > > > - bsrl %eax, %eax > > > - addq %rdi, %rax > > > - addq %r8, %rax > > > + .p2align 4,, 6 > > > +L(loop_end): > > > + kmovd %k1, %ecx > > > + notl %ecx > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x0_end) > > > + > > > + vptestnmb %VEC(2), %VEC(2), %k0 > > > + kmovd %k0, %ecx > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x1_end) > > > + > > > + kmovd %k2, %ecx > > > + kmovd %k4, %esi > > > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > > > + then it won't affect the result in esi (VEC4). If ecx is non-zero > > > + then CHAR in VEC3 and bsrq will use that position. */ > > > + salq $32, %rcx > > > + orq %rsi, %rcx > > > + bsrq %rcx, %rcx > > > + addq %rcx, %rax > > > + ret > > > + .p2align 4,, 4 > > > +L(ret_vec_x0_end): > > > + addq $(VEC_SIZE), %rax > > > +L(ret_vec_x1_end): > > > + bsrl %ecx, %ecx > > > + leaq (VEC_SIZE * 2)(%rax, %rcx), %rax > > > ret > > > -END (__memrchr_evex) > > > + > > > +END(MEMRCHR) > > > #endif > > > -- > > > 2.34.1 > > > > > > > > > -- > > H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
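The mask-combining trick in L(loop_end), quoted just above, also reads more naturally in C. A sketch under the assumption that mask_hi is the match mask for the vector 32 bytes above the block base and mask_lo for the vector at the base itself (the roles of the assembly's k2 and k4 after kmovd; the offset bookkeeping around them is simplified away here):

    #include <stddef.h>
    #include <stdint.h>

    /* Model of the salq/orq/bsrq tail of L(loop_end): merge two 32-bit
       masks into one 64-bit word so a single bit-scan picks the highest
       match across both vectors, instead of test + branch + a second
       scan.  Caller guarantees a match exists (kortestd was non-zero),
       so the bit-scan input is non-zero and well defined.  */
    static size_t
    highest_match_offset (uint32_t mask_hi, uint32_t mask_lo)
    {
      uint64_t both = ((uint64_t) mask_hi << 32) | mask_lo;
      /* bsrq equivalent: index of the highest set bit.  Because the high
         mask occupies bits 32..63, the bit index is directly the byte
         offset from the lower vector's base -- no extra arithmetic.  */
      return 63 - (size_t) __builtin_clzll (both);
    }

The assembly then finishes with a single addq of this offset onto the block base. Note the property the patch comment calls out: if mask_hi is zero it cannot disturb the result derived from mask_lo, so no separate emptiness test is needed.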
* [PATCH v4 6/8] x86: Optimize memrchr-avx2.S 2022-06-06 22:37 ` [PATCH v4 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (3 preceding siblings ...) 2022-06-06 22:37 ` [PATCH v4 5/8] x86: Optimize memrchr-evex.S Noah Goldstein @ 2022-06-06 22:37 ` Noah Goldstein 2022-06-07 2:35 ` H.J. Lu 2022-06-06 22:37 ` [PATCH v4 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 6 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-06 22:37 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller user-arg lengths more. 2. optimizes target placement more carefully 3. reuses logic more 4. fixes up various inefficiencies in the logic. The biggest case here is the `lzcnt` logic for checking returns which saves either a branch or multiple instructions. The total code size saving is: 306 bytes Geometric Mean of all benchmarks New / Old: 0.760 Regressions: There are some regressions. Particularly where the length (user arg length) is large but the position of the match char is near the begining of the string (in first VEC). This case has roughly a 10-20% regression. This is because the new logic gives the hot path for immediate matches to shorter lengths (the more common input). This case has roughly a 15-45% speedup. Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memrchr-avx2.S | 538 ++++++++++---------- 2 files changed, 260 insertions(+), 279 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S index cea2d2a72d..5e9beeeef2 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMRCHR __memrchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index ba2ce7cb03..6915e1c373 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -21,340 +21,320 @@ # include <sysdep.h> # ifndef MEMRCHR -# define MEMRCHR __memrchr_avx2 +# define MEMRCHR __memrchr_avx2 # endif # ifndef VZEROUPPER -# define VZEROUPPER vzeroupper +# define VZEROUPPER vzeroupper # endif +// abf-off # ifndef SECTION # define SECTION(p) p##.avx # endif +// abf-on + +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + .section SECTION(.text), "ax", @progbits +ENTRY(MEMRCHR) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) -# define VEC_SIZE 32 - - .section SECTION(.text),"ax",@progbits -ENTRY (MEMRCHR) - /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 - vpbroadcastb %xmm0, %ymm0 - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up end ptr to be + subtract by lzcnt aligned. */ + leaq -1(%rdx, %rdi), %rax - /* Check the last VEC_SIZE bytes. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) + vpbroadcastb %xmm0, %ymm0 - /* Align data for aligned loads in the loop. 
*/ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) + +L(ret_vec_x0_test): + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which + will gurantee edx (len) is less than it. */ + lzcntl %ecx, %ecx + + /* Hoist vzeroupper (not great for RTM) to save code size. This allows + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vmovdqa (%rdi), %ymm1 - vmovdqa VEC_SIZE(%rdi), %ymm2 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 - - vpcmpeqb %ymm1, %ymm0, %ymm1 - vpcmpeqb %ymm2, %ymm0, %ymm2 - vpcmpeqb %ymm3, %ymm0, %ymm3 - vpcmpeqb %ymm4, %ymm0, %ymm4 - - vpor %ymm1, %ymm2, %ymm5 - vpor %ymm3, %ymm4, %ymm6 - vpor %ymm5, %ymm6, %ymm5 - - vpmovmskb %ymm5, %eax - testl %eax, %eax - jz L(loop_4x_vec) - - /* There is a match. */ - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpmovmskb %ymm1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax L(return_vzeroupper): ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) - - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Align rax (string pointer). 
*/ + andq $-VEC_SIZE, %rax + + /* Recompute remaining length after aligning. */ + movq %rax, %rdx + /* Need this comparison next no matter what. */ + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 + subq %rdi, %rdx + decq %rax + vpmovmskb %ymm1, %ecx + /* Fall through for short (hotter than length). */ + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which inturn in necessray for hot path (len <= VEC_SIZE) to fit + in first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpeqb (%rsi), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %r8d + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ + notl %r8d + shlxl %r8d, %ecx, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret + .p2align 4,, 11 +L(ret_vec_x1): + /* This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + subq %rcx, %rax VZEROUPPER_RETURN + .p2align 4,, 10 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - VZEROUPPER_RETURN + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - .p2align 4 -L(zero): - xorl %eax, %eax - VZEROUPPER_RETURN + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) + +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - .p2align 4 -L(null): + /* First in aligning bytes. 
*/ +L(zero_2): xorl %eax, %eax ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx + .p2align 4,, 4 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - vpcmpeqb (%rdi), %ymm0, %ymm1 - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx + .p2align 4,, 11 +L(ret_vec_x2): + /* ecx must be non-zero. */ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - vpmovmskb %ymm1, %eax + .p2align 4,, 14 +L(ret_vec_x3): + /* ecx must be non-zero. */ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Check for zero length. */ - testl %edx, %edx - jz L(null) + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) + testl %ecx, %ecx + jnz L(ret_vec_x3) - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + /* Check if near end before re-aligning (otherwise might do an + unnecissary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + /* Align rax to (VEC_SIZE - 1). */ + orq $(VEC_SIZE * 4 - 1), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + orq $(VEC_SIZE * 4 - 1), %rdx - /* Check the last VEC. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + .p2align 4 +L(loop_4x_vec): + /* Need this comparison next no matter what. */ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 - movl $1, %edx - sall %cl, %edx - subl $1, %edx + vpor %ymm1, %ymm2, %ymm2 + vpor %ymm3, %ymm4, %ymm4 + vpor %ymm2, %ymm4, %ymm4 + vpmovmskb %ymm4, %esi - andl %edx, %eax - testl %eax, %eax - jz L(zero) + testl %esi, %esi + jnz L(loop_end) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - VZEROUPPER_RETURN + addq $(VEC_SIZE * -4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) - .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx + subl %edi, %edx + incl %edx - /* Check the last VEC. */ - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 +L(last_4x_vec): + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - vpmovmskb %ymm1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_end) - /* Remove the trailing bytes. */ - andl %edx, %eax + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) - testl %eax, %eax - jnz L(last_vec_x1) + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - /* Check the second last VEC. 
*/ - vpcmpeqb (%rdi), %ymm0, %ymm1 + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) + + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret0) + xorl %eax, %eax +L(ret0): + ret - movl %r8d, %ecx - vpmovmskb %ymm1, %eax + .p2align 4 +L(loop_end): + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vpmovmskb %ymm2, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + vpmovmskb %ymm3, %ecx + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. */ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + .p2align 4,, 4 +L(ret_vec_x1_end): + /* 64-bit version will automatically add 32 (VEC_SIZE). */ + lzcntq %rcx, %rcx + subq %rcx, %rax + VZEROUPPER_RETURN - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 4 +L(ret_vec_x0_end): + lzcntl %ecx, %ecx + subq %rcx, %rax VZEROUPPER_RETURN -END (MEMRCHR) + + /* 2 bytes until next cache line. */ +END(MEMRCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
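To make the `lzcnt` return idiom used throughout this rewrite concrete, here is a minimal C sketch (illustrative only; the helper name and standalone framing are mine, not glibc's). With a register holding the address of the last byte covered by a 32-byte compare window, one subtract of the mask's leading-zero count replaces the old per-label `bsrl`/`addl`/`addq` sequences:

```c
#include <stdint.h>

/* last_byte points at the final byte of a 32-byte window; mask is the
   vpmovmskb result for that window, so bit 31 corresponds to
   *last_byte.  */
static inline unsigned char *
ret_vec_sketch (unsigned char *last_byte, uint32_t mask)
{
  /* lzcnt defines lzcnt (0) == 32, while __builtin_clz (0) is
     undefined, so model that case explicitly.  Callers compare the
     count against the remaining length and return NULL when it is out
     of range, which is also what makes the mask == 0 case safe.  */
  unsigned lz = mask ? (unsigned) __builtin_clz (mask) : 32;
  return last_byte - lz;
}
```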
* Re: [PATCH v4 6/8] x86: Optimize memrchr-avx2.S 2022-06-06 22:37 ` [PATCH v4 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein @ 2022-06-07 2:35 ` H.J. Lu 2022-06-07 4:06 ` Noah Goldstein 0 siblings, 1 reply; 82+ messages in thread From: H.J. Lu @ 2022-06-07 2:35 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 3:37 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The new code: > 1. prioritizes smaller user-arg lengths more. > 2. optimizes target placement more carefully > 3. reuses logic more > 4. fixes up various inefficiencies in the logic. The biggest > case here is the `lzcnt` logic for checking returns which > saves either a branch or multiple instructions. > > The total code size saving is: 306 bytes > Geometric Mean of all benchmarks New / Old: 0.760 > > Regressions: > There are some regressions. Particularly where the length (user arg > length) is large but the position of the match char is near the > begining of the string (in first VEC). This case has roughly a > 10-20% regression. > > This is because the new logic gives the hot path for immediate matches > to shorter lengths (the more common input). This case has roughly > a 15-45% speedup. > > Full xcheck passes on x86_64. > --- > sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + > sysdeps/x86_64/multiarch/memrchr-avx2.S | 538 ++++++++++---------- > 2 files changed, 260 insertions(+), 279 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > index cea2d2a72d..5e9beeeef2 100644 > --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > @@ -2,6 +2,7 @@ > # define MEMRCHR __memrchr_avx2_rtm > #endif > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S > index ba2ce7cb03..6915e1c373 100644 > --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S > @@ -21,340 +21,320 @@ > # include <sysdep.h> > > # ifndef MEMRCHR > -# define MEMRCHR __memrchr_avx2 > +# define MEMRCHR __memrchr_avx2 > # endif > > # ifndef VZEROUPPER > -# define VZEROUPPER vzeroupper > +# define VZEROUPPER vzeroupper > # endif > > +// abf-off > # ifndef SECTION > # define SECTION(p) p##.avx > # endif > +// abf-on What are the above changes for? > +# define VEC_SIZE 32 > +# define PAGE_SIZE 4096 > + .section SECTION(.text), "ax", @progbits > +ENTRY(MEMRCHR) > +# ifdef __ILP32__ > + /* Clear upper bits. */ > + and %RDX_LP, %RDX_LP > +# else > + test %RDX_LP, %RDX_LP > +# endif > + jz L(zero_0) > > -# define VEC_SIZE 32 > - > - .section SECTION(.text),"ax",@progbits > -ENTRY (MEMRCHR) > - /* Broadcast CHAR to YMM0. */ > vmovd %esi, %xmm0 > - vpbroadcastb %xmm0, %ymm0 > - > - sub $VEC_SIZE, %RDX_LP > - jbe L(last_vec_or_less) > - > - add %RDX_LP, %RDI_LP > + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a > + correct page cross check and 2) it correctly sets up end ptr to be > + subtract by lzcnt aligned. */ > + leaq -1(%rdx, %rdi), %rax > > - /* Check the last VEC_SIZE bytes. 
*/ > - vpcmpeqb (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x0) > - > - subq $(VEC_SIZE * 4), %rdi > - movl %edi, %ecx > - andl $(VEC_SIZE - 1), %ecx > - jz L(aligned_more) > + vpbroadcastb %xmm0, %ymm0 > > - /* Align data for aligned loads in the loop. */ > - addq $VEC_SIZE, %rdi > - addq $VEC_SIZE, %rdx > - andq $-VEC_SIZE, %rdi > - subq %rcx, %rdx > + /* Check if we can load 1x VEC without cross a page. */ > + testl $(PAGE_SIZE - VEC_SIZE), %eax > + jz L(page_cross) > + > + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + cmpq $VEC_SIZE, %rdx > + ja L(more_1x_vec) > + > +L(ret_vec_x0_test): > + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which > + will gurantee edx (len) is less than it. */ > + lzcntl %ecx, %ecx > + > + /* Hoist vzeroupper (not great for RTM) to save code size. This allows > + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ > + COND_VZEROUPPER > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > > - .p2align 4 > -L(aligned_more): > - subq $(VEC_SIZE * 4), %rdx > - jbe L(last_4x_vec_or_less) > - > - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time > - since data is only aligned to VEC_SIZE. */ > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > - vpmovmskb %ymm2, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > - vpmovmskb %ymm3, %eax > - testl %eax, %eax > - jnz L(last_vec_x1) > - > - vpcmpeqb (%rdi), %ymm0, %ymm4 > - vpmovmskb %ymm4, %eax > - testl %eax, %eax > - jnz L(last_vec_x0) > - > - /* Align data to 4 * VEC_SIZE for loop with fewer branches. > - There are some overlaps with above if data isn't aligned > - to 4 * VEC_SIZE. */ > - movl %edi, %ecx > - andl $(VEC_SIZE * 4 - 1), %ecx > - jz L(loop_4x_vec) > - > - addq $(VEC_SIZE * 4), %rdi > - addq $(VEC_SIZE * 4), %rdx > - andq $-(VEC_SIZE * 4), %rdi > - subq %rcx, %rdx > + /* Fits in aligning bytes of first cache line. */ > +L(zero_0): > + xorl %eax, %eax > + ret > > - .p2align 4 > -L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > - subq $(VEC_SIZE * 4), %rdi > - subq $(VEC_SIZE * 4), %rdx > - jbe L(last_4x_vec_or_less) > - > - vmovdqa (%rdi), %ymm1 > - vmovdqa VEC_SIZE(%rdi), %ymm2 > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > - > - vpcmpeqb %ymm1, %ymm0, %ymm1 > - vpcmpeqb %ymm2, %ymm0, %ymm2 > - vpcmpeqb %ymm3, %ymm0, %ymm3 > - vpcmpeqb %ymm4, %ymm0, %ymm4 > - > - vpor %ymm1, %ymm2, %ymm5 > - vpor %ymm3, %ymm4, %ymm6 > - vpor %ymm5, %ymm6, %ymm5 > - > - vpmovmskb %ymm5, %eax > - testl %eax, %eax > - jz L(loop_4x_vec) > - > - /* There is a match. 
*/ > - vpmovmskb %ymm4, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpmovmskb %ymm3, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpmovmskb %ymm2, %eax > - testl %eax, %eax > - jnz L(last_vec_x1) > - > - vpmovmskb %ymm1, %eax > - bsrl %eax, %eax > - addq %rdi, %rax > + .p2align 4,, 9 > +L(ret_vec_x0): > + lzcntl %ecx, %ecx > + subq %rcx, %rax > L(return_vzeroupper): > ZERO_UPPER_VEC_REGISTERS_RETURN > > - .p2align 4 > -L(last_4x_vec_or_less): > - addl $(VEC_SIZE * 4), %edx > - cmpl $(VEC_SIZE * 2), %edx > - jbe L(last_2x_vec) > - > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > - vpmovmskb %ymm2, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > - vpmovmskb %ymm3, %eax > - testl %eax, %eax > - jnz L(last_vec_x1_check) > - cmpl $(VEC_SIZE * 3), %edx > - jbe L(zero) > - > - vpcmpeqb (%rdi), %ymm0, %ymm4 > - vpmovmskb %ymm4, %eax > - testl %eax, %eax > - jz L(zero) > - bsrl %eax, %eax > - subq $(VEC_SIZE * 4), %rdx > - addq %rax, %rdx > - jl L(zero) > - addq %rdi, %rax > - VZEROUPPER_RETURN > - > - .p2align 4 > + .p2align 4,, 10 > +L(more_1x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > + > + /* Align rax (string pointer). */ > + andq $-VEC_SIZE, %rax > + > + /* Recompute remaining length after aligning. */ > + movq %rax, %rdx > + /* Need this comparison next no matter what. */ > + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 > + subq %rdi, %rdx > + decq %rax > + vpmovmskb %ymm1, %ecx > + /* Fall through for short (hotter than length). */ > + cmpq $(VEC_SIZE * 2), %rdx > + ja L(more_2x_vec) > L(last_2x_vec): > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3_check) > cmpl $VEC_SIZE, %edx > - jbe L(zero) > - > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jz L(zero) > - bsrl %eax, %eax > - subq $(VEC_SIZE * 2), %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $(VEC_SIZE * 2), %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(last_vec_x0): > - bsrl %eax, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > + jbe L(ret_vec_x0_test) > + > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > + > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + /* 64-bit lzcnt. This will naturally add 32 to position. */ > + lzcntq %rcx, %rcx > + COND_VZEROUPPER > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > > - .p2align 4 > -L(last_vec_x1): > - bsrl %eax, %eax > - addl $VEC_SIZE, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > > - .p2align 4 > -L(last_vec_x2): > - bsrl %eax, %eax > - addl $(VEC_SIZE * 2), %eax > - addq %rdi, %rax > + /* Inexpensive place to put this regarding code size / target alignments > + / ICache NLP. Necessary for 2-byte encoding of jump to page cross > + case which inturn in necessray for hot path (len <= VEC_SIZE) to fit in turn > + in first cache line. */ > +L(page_cross): > + movq %rax, %rsi > + andq $-VEC_SIZE, %rsi > + vpcmpeqb (%rsi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + /* Shift out negative alignment (because we are starting from endptr and > + working backwards). */ > + movl %eax, %r8d > + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). 
*/ > + notl %r8d > + shlxl %r8d, %ecx, %ecx > + cmpq %rdi, %rsi > + ja L(more_1x_vec) > + lzcntl %ecx, %ecx > + COND_VZEROUPPER > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > + .p2align 4,, 11 > +L(ret_vec_x1): > + /* This will naturally add 32 to position. */ > + lzcntq %rcx, %rcx > + subq %rcx, %rax > VZEROUPPER_RETURN > + .p2align 4,, 10 > +L(more_2x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > > - .p2align 4 > -L(last_vec_x3): > - bsrl %eax, %eax > - addl $(VEC_SIZE * 3), %eax > - addq %rdi, %rax > - ret > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1) > > - .p2align 4 > -L(last_vec_x1_check): > - bsrl %eax, %eax > - subq $(VEC_SIZE * 3), %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $VEC_SIZE, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > > - .p2align 4 > -L(last_vec_x3_check): > - bsrl %eax, %eax > - subq $VEC_SIZE, %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $(VEC_SIZE * 3), %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > + /* Needed no matter what. */ > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - .p2align 4 > -L(zero): > - xorl %eax, %eax > - VZEROUPPER_RETURN > + subq $(VEC_SIZE * 4), %rdx > + ja L(more_4x_vec) > + > + cmpl $(VEC_SIZE * -1), %edx > + jle L(ret_vec_x2_test) > + > +L(last_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > + > + /* Needed no matter what. */ > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 3), %rax > + COND_VZEROUPPER > + subq %rcx, %rax > + cmpq %rax, %rdi > + ja L(zero_2) > + ret > > - .p2align 4 > -L(null): > + /* First in aligning bytes. */ > +L(zero_2): > xorl %eax, %eax > ret > > - .p2align 4 > -L(last_vec_or_less_aligned): > - movl %edx, %ecx > + .p2align 4,, 4 > +L(ret_vec_x2_test): > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 2), %rax > + COND_VZEROUPPER > + subq %rcx, %rax > + cmpq %rax, %rdi > + ja L(zero_2) > + ret > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > - movl $1, %edx > - /* Support rdx << 32. */ > - salq %cl, %rdx > - subq $1, %rdx > + .p2align 4,, 11 > +L(ret_vec_x2): > + /* ecx must be non-zero. */ > + bsrl %ecx, %ecx > + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax > + VZEROUPPER_RETURN > > - vpmovmskb %ymm1, %eax > + .p2align 4,, 14 > +L(ret_vec_x3): > + /* ecx must be non-zero. */ > + bsrl %ecx, %ecx > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > + VZEROUPPER_RETURN > > - /* Remove the trailing bytes. */ > - andl %edx, %eax > - testl %eax, %eax > - jz L(zero) > > - bsrl %eax, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > > .p2align 4 > -L(last_vec_or_less): > - addl $VEC_SIZE, %edx > +L(more_4x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > > - /* Check for zero length. */ > - testl %edx, %edx > - jz L(null) > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - movl %edi, %ecx > - andl $(VEC_SIZE - 1), %ecx > - jz L(last_vec_or_less_aligned) > + testl %ecx, %ecx > + jnz L(ret_vec_x3) > > - movl %ecx, %esi > - movl %ecx, %r8d > - addl %edx, %esi > - andq $-VEC_SIZE, %rdi > + /* Check if near end before re-aligning (otherwise might do an > + unnecissary loop iteration). */ > + addq $-(VEC_SIZE * 4), %rax > + cmpq $(VEC_SIZE * 4), %rdx > + jbe L(last_4x_vec) > > - subl $VEC_SIZE, %esi > - ja L(last_vec_2x_aligned) > + /* Align rax to (VEC_SIZE - 1). */ > + orq $(VEC_SIZE * 4 - 1), %rax > + movq %rdi, %rdx > + /* Get endptr for loop in rdx. 
NB: Can't just do while rax > rdi because > + lengths that overflow can be valid and break the comparison. */ > + orq $(VEC_SIZE * 4 - 1), %rdx > > - /* Check the last VEC. */ > - vpcmpeqb (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - > - /* Remove the leading and trailing bytes. */ > - sarl %cl, %eax > - movl %edx, %ecx > + .p2align 4 > +L(loop_4x_vec): > + /* Need this comparison next no matter what. */ > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 > > - movl $1, %edx > - sall %cl, %edx > - subl $1, %edx > + vpor %ymm1, %ymm2, %ymm2 > + vpor %ymm3, %ymm4, %ymm4 > + vpor %ymm2, %ymm4, %ymm4 > + vpmovmskb %ymm4, %esi > > - andl %edx, %eax > - testl %eax, %eax > - jz L(zero) > + testl %esi, %esi > + jnz L(loop_end) > > - bsrl %eax, %eax > - addq %rdi, %rax > - addq %r8, %rax > - VZEROUPPER_RETURN > + addq $(VEC_SIZE * -4), %rax > + cmpq %rdx, %rax > + jne L(loop_4x_vec) > > - .p2align 4 > -L(last_vec_2x_aligned): > - movl %esi, %ecx > + subl %edi, %edx > + incl %edx > > - /* Check the last VEC. */ > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 > +L(last_4x_vec): > + /* Used no matter what. */ > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - movl $1, %edx > - sall %cl, %edx > - subl $1, %edx > + cmpl $(VEC_SIZE * 2), %edx > + jbe L(last_2x_vec) > > - vpmovmskb %ymm1, %eax > + testl %ecx, %ecx > + jnz L(ret_vec_x0_end) > > - /* Remove the trailing bytes. */ > - andl %edx, %eax > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1_end) > > - testl %eax, %eax > - jnz L(last_vec_x1) > + /* Used no matter what. */ > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - /* Check the second last VEC. */ > - vpcmpeqb (%rdi), %ymm0, %ymm1 > + cmpl $(VEC_SIZE * 3), %edx > + ja L(last_vec) > + > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 2), %rax > + COND_VZEROUPPER > + subq %rcx, %rax > + cmpq %rax, %rdi > + jbe L(ret0) > + xorl %eax, %eax > +L(ret0): > + ret > > - movl %r8d, %ecx > > - vpmovmskb %ymm1, %eax > + .p2align 4 > +L(loop_end): > + vpmovmskb %ymm1, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x0_end) > + > + vpmovmskb %ymm2, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1_end) > + > + vpmovmskb %ymm3, %ecx > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > + then it won't affect the result in esi (VEC4). If ecx is non-zero > + then CHAR in VEC3 and bsrq will use that position. */ > + salq $32, %rcx > + orq %rsi, %rcx > + bsrq %rcx, %rcx > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > + VZEROUPPER_RETURN > > - /* Remove the leading bytes. Must use unsigned right shift for > - bsrl below. */ > - shrl %cl, %eax > - testl %eax, %eax > - jz L(zero) > + .p2align 4,, 4 > +L(ret_vec_x1_end): > + /* 64-bit version will automatically add 32 (VEC_SIZE). */ > + lzcntq %rcx, %rcx > + subq %rcx, %rax > + VZEROUPPER_RETURN > > - bsrl %eax, %eax > - addq %rdi, %rax > - addq %r8, %rax > + .p2align 4,, 4 > +L(ret_vec_x0_end): > + lzcntl %ecx, %ecx > + subq %rcx, %rax > VZEROUPPER_RETURN > -END (MEMRCHR) > + > + /* 2 bytes until next cache line. */ > +END(MEMRCHR) > #endif > -- > 2.34.1 > -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
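One point worth spelling out from the quoted page-cross code: the `notl`/`shlxl` pair computes the shift count 31 - ((end - 1) % 32) via the identity -x == ~(x - 1), so the last in-bounds byte lands in bit 31 and everything past the end is shifted out. A C rendering (a sketch; the helper name is hypothetical):

```c
#include <stdint.h>

/* mask is the vpmovmskb result for the aligned 32-byte window
   containing the last valid byte at address end_minus_1.  After the
   shift, bit 31 is that last byte and out-of-bounds bytes are gone, so
   a following lzcnt counts straight back from end_minus_1.  */
static inline uint32_t
page_cross_mask_sketch (uint32_t mask, uintptr_t end_minus_1)
{
  /* Mirrors notl + shlxl; the shift instruction only uses the count
     mod 32 for a 32-bit operand.  */
  unsigned shift = (unsigned) ~end_minus_1 & 31;
  return mask << shift;
}
```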
* Re: [PATCH v4 6/8] x86: Optimize memrchr-avx2.S 2022-06-07 2:35 ` H.J. Lu @ 2022-06-07 4:06 ` Noah Goldstein 0 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:06 UTC (permalink / raw) To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 7:35 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Mon, Jun 6, 2022 at 3:37 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > The new code: > > 1. prioritizes smaller user-arg lengths more. > > 2. optimizes target placement more carefully > > 3. reuses logic more > > 4. fixes up various inefficiencies in the logic. The biggest > > case here is the `lzcnt` logic for checking returns which > > saves either a branch or multiple instructions. > > > > The total code size saving is: 306 bytes > > Geometric Mean of all benchmarks New / Old: 0.760 > > > > Regressions: > > There are some regressions. Particularly where the length (user arg > > length) is large but the position of the match char is near the > > begining of the string (in first VEC). This case has roughly a Fixed this in V5 > > 10-20% regression. > > > > This is because the new logic gives the hot path for immediate matches > > to shorter lengths (the more common input). This case has roughly > > a 15-45% speedup. > > > > Full xcheck passes on x86_64. > > --- > > sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + > > sysdeps/x86_64/multiarch/memrchr-avx2.S | 538 ++++++++++---------- > > 2 files changed, 260 insertions(+), 279 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > > index cea2d2a72d..5e9beeeef2 100644 > > --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > > @@ -2,6 +2,7 @@ > > # define MEMRCHR __memrchr_avx2_rtm > > #endif > > > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S > > index ba2ce7cb03..6915e1c373 100644 > > --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S > > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S > > @@ -21,340 +21,320 @@ > > # include <sysdep.h> > > > > # ifndef MEMRCHR > > -# define MEMRCHR __memrchr_avx2 > > +# define MEMRCHR __memrchr_avx2 > > # endif > > > > # ifndef VZEROUPPER > > -# define VZEROUPPER vzeroupper > > +# define VZEROUPPER vzeroupper > > # endif > > > > +// abf-off > > # ifndef SECTION > > # define SECTION(p) p##.avx > > # endif > > +// abf-on > > What are the above changes for? Removed in V5 (directive for auto-formatter). > > > +# define VEC_SIZE 32 > > +# define PAGE_SIZE 4096 > > + .section SECTION(.text), "ax", @progbits > > +ENTRY(MEMRCHR) > > +# ifdef __ILP32__ > > + /* Clear upper bits. */ > > + and %RDX_LP, %RDX_LP > > +# else > > + test %RDX_LP, %RDX_LP > > +# endif > > + jz L(zero_0) > > > > -# define VEC_SIZE 32 > > - > > - .section SECTION(.text),"ax",@progbits > > -ENTRY (MEMRCHR) > > - /* Broadcast CHAR to YMM0. */ > > vmovd %esi, %xmm0 > > - vpbroadcastb %xmm0, %ymm0 > > - > > - sub $VEC_SIZE, %RDX_LP > > - jbe L(last_vec_or_less) > > - > > - add %RDX_LP, %RDI_LP > > + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a > > + correct page cross check and 2) it correctly sets up end ptr to be > > + subtract by lzcnt aligned. */ > > + leaq -1(%rdx, %rdi), %rax > > > > - /* Check the last VEC_SIZE bytes. 
*/ > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x0) > > - > > - subq $(VEC_SIZE * 4), %rdi > > - movl %edi, %ecx > > - andl $(VEC_SIZE - 1), %ecx > > - jz L(aligned_more) > > + vpbroadcastb %xmm0, %ymm0 > > > > - /* Align data for aligned loads in the loop. */ > > - addq $VEC_SIZE, %rdi > > - addq $VEC_SIZE, %rdx > > - andq $-VEC_SIZE, %rdi > > - subq %rcx, %rdx > > + /* Check if we can load 1x VEC without cross a page. */ > > + testl $(PAGE_SIZE - VEC_SIZE), %eax > > + jz L(page_cross) > > + > > + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > + cmpq $VEC_SIZE, %rdx > > + ja L(more_1x_vec) > > + > > +L(ret_vec_x0_test): > > + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which > > + will gurantee edx (len) is less than it. */ > > + lzcntl %ecx, %ecx > > + > > + /* Hoist vzeroupper (not great for RTM) to save code size. This allows > > + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ > > + COND_VZEROUPPER > > + cmpl %ecx, %edx > > + jle L(zero_0) > > + subq %rcx, %rax > > + ret > > > > - .p2align 4 > > -L(aligned_more): > > - subq $(VEC_SIZE * 4), %rdx > > - jbe L(last_4x_vec_or_less) > > - > > - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time > > - since data is only aligned to VEC_SIZE. */ > > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3) > > - > > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > > - vpmovmskb %ymm2, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x2) > > - > > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > > - vpmovmskb %ymm3, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x1) > > - > > - vpcmpeqb (%rdi), %ymm0, %ymm4 > > - vpmovmskb %ymm4, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x0) > > - > > - /* Align data to 4 * VEC_SIZE for loop with fewer branches. > > - There are some overlaps with above if data isn't aligned > > - to 4 * VEC_SIZE. */ > > - movl %edi, %ecx > > - andl $(VEC_SIZE * 4 - 1), %ecx > > - jz L(loop_4x_vec) > > - > > - addq $(VEC_SIZE * 4), %rdi > > - addq $(VEC_SIZE * 4), %rdx > > - andq $-(VEC_SIZE * 4), %rdi > > - subq %rcx, %rdx > > + /* Fits in aligning bytes of first cache line. */ > > +L(zero_0): > > + xorl %eax, %eax > > + ret > > > > - .p2align 4 > > -L(loop_4x_vec): > > - /* Compare 4 * VEC at a time forward. */ > > - subq $(VEC_SIZE * 4), %rdi > > - subq $(VEC_SIZE * 4), %rdx > > - jbe L(last_4x_vec_or_less) > > - > > - vmovdqa (%rdi), %ymm1 > > - vmovdqa VEC_SIZE(%rdi), %ymm2 > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > > - > > - vpcmpeqb %ymm1, %ymm0, %ymm1 > > - vpcmpeqb %ymm2, %ymm0, %ymm2 > > - vpcmpeqb %ymm3, %ymm0, %ymm3 > > - vpcmpeqb %ymm4, %ymm0, %ymm4 > > - > > - vpor %ymm1, %ymm2, %ymm5 > > - vpor %ymm3, %ymm4, %ymm6 > > - vpor %ymm5, %ymm6, %ymm5 > > - > > - vpmovmskb %ymm5, %eax > > - testl %eax, %eax > > - jz L(loop_4x_vec) > > - > > - /* There is a match. 
*/ > > - vpmovmskb %ymm4, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3) > > - > > - vpmovmskb %ymm3, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x2) > > - > > - vpmovmskb %ymm2, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x1) > > - > > - vpmovmskb %ymm1, %eax > > - bsrl %eax, %eax > > - addq %rdi, %rax > > + .p2align 4,, 9 > > +L(ret_vec_x0): > > + lzcntl %ecx, %ecx > > + subq %rcx, %rax > > L(return_vzeroupper): > > ZERO_UPPER_VEC_REGISTERS_RETURN > > > > - .p2align 4 > > -L(last_4x_vec_or_less): > > - addl $(VEC_SIZE * 4), %edx > > - cmpl $(VEC_SIZE * 2), %edx > > - jbe L(last_2x_vec) > > - > > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3) > > - > > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > > - vpmovmskb %ymm2, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x2) > > - > > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > > - vpmovmskb %ymm3, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x1_check) > > - cmpl $(VEC_SIZE * 3), %edx > > - jbe L(zero) > > - > > - vpcmpeqb (%rdi), %ymm0, %ymm4 > > - vpmovmskb %ymm4, %eax > > - testl %eax, %eax > > - jz L(zero) > > - bsrl %eax, %eax > > - subq $(VEC_SIZE * 4), %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > - > > - .p2align 4 > > + .p2align 4,, 10 > > +L(more_1x_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0) > > + > > + /* Align rax (string pointer). */ > > + andq $-VEC_SIZE, %rax > > + > > + /* Recompute remaining length after aligning. */ > > + movq %rax, %rdx > > + /* Need this comparison next no matter what. */ > > + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 > > + subq %rdi, %rdx > > + decq %rax > > + vpmovmskb %ymm1, %ecx > > + /* Fall through for short (hotter than length). */ > > + cmpq $(VEC_SIZE * 2), %rdx > > + ja L(more_2x_vec) > > L(last_2x_vec): > > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3_check) > > cmpl $VEC_SIZE, %edx > > - jbe L(zero) > > - > > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jz L(zero) > > - bsrl %eax, %eax > > - subq $(VEC_SIZE * 2), %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addl $(VEC_SIZE * 2), %eax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > - > > - .p2align 4 > > -L(last_vec_x0): > > - bsrl %eax, %eax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > + jbe L(ret_vec_x0_test) > > + > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0) > > + > > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > + /* 64-bit lzcnt. This will naturally add 32 to position. */ > > + lzcntq %rcx, %rcx > > + COND_VZEROUPPER > > + cmpl %ecx, %edx > > + jle L(zero_0) > > + subq %rcx, %rax > > + ret > > > > - .p2align 4 > > -L(last_vec_x1): > > - bsrl %eax, %eax > > - addl $VEC_SIZE, %eax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > > > - .p2align 4 > > -L(last_vec_x2): > > - bsrl %eax, %eax > > - addl $(VEC_SIZE * 2), %eax > > - addq %rdi, %rax > > + /* Inexpensive place to put this regarding code size / target alignments > > + / ICache NLP. Necessary for 2-byte encoding of jump to page cross > > + case which inturn in necessray for hot path (len <= VEC_SIZE) to fit > in turn Fixed in V5. > > + in first cache line. 
*/ > > +L(page_cross): > > + movq %rax, %rsi > > + andq $-VEC_SIZE, %rsi > > + vpcmpeqb (%rsi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > + /* Shift out negative alignment (because we are starting from endptr and > > + working backwards). */ > > + movl %eax, %r8d > > + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ > > + notl %r8d > > + shlxl %r8d, %ecx, %ecx > > + cmpq %rdi, %rsi > > + ja L(more_1x_vec) > > + lzcntl %ecx, %ecx > > + COND_VZEROUPPER > > + cmpl %ecx, %edx > > + jle L(zero_0) > > + subq %rcx, %rax > > + ret > > + .p2align 4,, 11 > > +L(ret_vec_x1): > > + /* This will naturally add 32 to position. */ > > + lzcntq %rcx, %rcx > > + subq %rcx, %rax > > VZEROUPPER_RETURN > > + .p2align 4,, 10 > > +L(more_2x_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0) > > > > - .p2align 4 > > -L(last_vec_x3): > > - bsrl %eax, %eax > > - addl $(VEC_SIZE * 3), %eax > > - addq %rdi, %rax > > - ret > > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x1) > > > > - .p2align 4 > > -L(last_vec_x1_check): > > - bsrl %eax, %eax > > - subq $(VEC_SIZE * 3), %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addl $VEC_SIZE, %eax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > > > - .p2align 4 > > -L(last_vec_x3_check): > > - bsrl %eax, %eax > > - subq $VEC_SIZE, %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addl $(VEC_SIZE * 3), %eax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > + /* Needed no matter what. */ > > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > > > - .p2align 4 > > -L(zero): > > - xorl %eax, %eax > > - VZEROUPPER_RETURN > > + subq $(VEC_SIZE * 4), %rdx > > + ja L(more_4x_vec) > > + > > + cmpl $(VEC_SIZE * -1), %edx > > + jle L(ret_vec_x2_test) > > + > > +L(last_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x2) > > + > > + /* Needed no matter what. */ > > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > + lzcntl %ecx, %ecx > > + subq $(VEC_SIZE * 3), %rax > > + COND_VZEROUPPER > > + subq %rcx, %rax > > + cmpq %rax, %rdi > > + ja L(zero_2) > > + ret > > > > - .p2align 4 > > -L(null): > > + /* First in aligning bytes. */ > > +L(zero_2): > > xorl %eax, %eax > > ret > > > > - .p2align 4 > > -L(last_vec_or_less_aligned): > > - movl %edx, %ecx > > + .p2align 4,, 4 > > +L(ret_vec_x2_test): > > + lzcntl %ecx, %ecx > > + subq $(VEC_SIZE * 2), %rax > > + COND_VZEROUPPER > > + subq %rcx, %rax > > + cmpq %rax, %rdi > > + ja L(zero_2) > > + ret > > > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > > > - movl $1, %edx > > - /* Support rdx << 32. */ > > - salq %cl, %rdx > > - subq $1, %rdx > > + .p2align 4,, 11 > > +L(ret_vec_x2): > > + /* ecx must be non-zero. */ > > + bsrl %ecx, %ecx > > + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax > > + VZEROUPPER_RETURN > > > > - vpmovmskb %ymm1, %eax > > + .p2align 4,, 14 > > +L(ret_vec_x3): > > + /* ecx must be non-zero. */ > > + bsrl %ecx, %ecx > > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > > + VZEROUPPER_RETURN > > > > - /* Remove the trailing bytes. */ > > - andl %edx, %eax > > - testl %eax, %eax > > - jz L(zero) > > > > - bsrl %eax, %eax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > > > .p2align 4 > > -L(last_vec_or_less): > > - addl $VEC_SIZE, %edx > > +L(more_4x_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x2) > > > > - /* Check for zero length. 
*/ > > - testl %edx, %edx > > - jz L(null) > > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > > > - movl %edi, %ecx > > - andl $(VEC_SIZE - 1), %ecx > > - jz L(last_vec_or_less_aligned) > > + testl %ecx, %ecx > > + jnz L(ret_vec_x3) > > > > - movl %ecx, %esi > > - movl %ecx, %r8d > > - addl %edx, %esi > > - andq $-VEC_SIZE, %rdi > > + /* Check if near end before re-aligning (otherwise might do an > > + unnecissary loop iteration). */ > > + addq $-(VEC_SIZE * 4), %rax > > + cmpq $(VEC_SIZE * 4), %rdx > > + jbe L(last_4x_vec) > > > > - subl $VEC_SIZE, %esi > > - ja L(last_vec_2x_aligned) > > + /* Align rax to (VEC_SIZE - 1). */ > > + orq $(VEC_SIZE * 4 - 1), %rax > > + movq %rdi, %rdx > > + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because > > + lengths that overflow can be valid and break the comparison. */ > > + orq $(VEC_SIZE * 4 - 1), %rdx > > > > - /* Check the last VEC. */ > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - > > - /* Remove the leading and trailing bytes. */ > > - sarl %cl, %eax > > - movl %edx, %ecx > > + .p2align 4 > > +L(loop_4x_vec): > > + /* Need this comparison next no matter what. */ > > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 > > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 > > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 > > > > - movl $1, %edx > > - sall %cl, %edx > > - subl $1, %edx > > + vpor %ymm1, %ymm2, %ymm2 > > + vpor %ymm3, %ymm4, %ymm4 > > + vpor %ymm2, %ymm4, %ymm4 > > + vpmovmskb %ymm4, %esi > > > > - andl %edx, %eax > > - testl %eax, %eax > > - jz L(zero) > > + testl %esi, %esi > > + jnz L(loop_end) > > > > - bsrl %eax, %eax > > - addq %rdi, %rax > > - addq %r8, %rax > > - VZEROUPPER_RETURN > > + addq $(VEC_SIZE * -4), %rax > > + cmpq %rdx, %rax > > + jne L(loop_4x_vec) > > > > - .p2align 4 > > -L(last_vec_2x_aligned): > > - movl %esi, %ecx > > + subl %edi, %edx > > + incl %edx > > > > - /* Check the last VEC. */ > > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 > > +L(last_4x_vec): > > + /* Used no matter what. */ > > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > > > - movl $1, %edx > > - sall %cl, %edx > > - subl $1, %edx > > + cmpl $(VEC_SIZE * 2), %edx > > + jbe L(last_2x_vec) > > > > - vpmovmskb %ymm1, %eax > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0_end) > > > > - /* Remove the trailing bytes. */ > > - andl %edx, %eax > > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x1_end) > > > > - testl %eax, %eax > > - jnz L(last_vec_x1) > > + /* Used no matter what. */ > > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > > > - /* Check the second last VEC. */ > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > + cmpl $(VEC_SIZE * 3), %edx > > + ja L(last_vec) > > + > > + lzcntl %ecx, %ecx > > + subq $(VEC_SIZE * 2), %rax > > + COND_VZEROUPPER > > + subq %rcx, %rax > > + cmpq %rax, %rdi > > + jbe L(ret0) > > + xorl %eax, %eax > > +L(ret0): > > + ret > > > > - movl %r8d, %ecx > > > > - vpmovmskb %ymm1, %eax > > + .p2align 4 > > +L(loop_end): > > + vpmovmskb %ymm1, %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0_end) > > + > > + vpmovmskb %ymm2, %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x1_end) > > + > > + vpmovmskb %ymm3, %ecx > > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > > + then it won't affect the result in esi (VEC4). 
If ecx is non-zero > > + then CHAR in VEC3 and bsrq will use that position. */ > > + salq $32, %rcx > > + orq %rsi, %rcx > > + bsrq %rcx, %rcx > > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > > + VZEROUPPER_RETURN > > > > - /* Remove the leading bytes. Must use unsigned right shift for > > - bsrl below. */ > > - shrl %cl, %eax > > - testl %eax, %eax > > - jz L(zero) > > + .p2align 4,, 4 > > +L(ret_vec_x1_end): > > + /* 64-bit version will automatically add 32 (VEC_SIZE). */ > > + lzcntq %rcx, %rcx > > + subq %rcx, %rax > > + VZEROUPPER_RETURN > > > > - bsrl %eax, %eax > > - addq %rdi, %rax > > - addq %r8, %rax > > + .p2align 4,, 4 > > +L(ret_vec_x0_end): > > + lzcntl %ecx, %ecx > > + subq %rcx, %rax > > VZEROUPPER_RETURN > > -END (MEMRCHR) > > + > > + /* 2 bytes until next cache line. */ > > +END(MEMRCHR) > > #endif > > -- > > 2.34.1 > > > > > -- > H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
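The `salq $32`/`orq`/`bsrq` combine that this exchange quotes can be modeled as below (sketch only; in the real loop the low word is the OR of all four vectors' matches, which is safe because vectors 1 and 2 were already tested and the high word takes priority):

```c
#include <stdint.h>

/* mask3/mask4 are vpmovmskb results for the 3rd and 4th vectors of the
   4x loop; the scan runs backward, so mask3 covers higher addresses
   and goes in the high 32 bits.  One bsr-style scan then finds the
   last match across both vectors.  The caller guarantees at least one
   bit is set (the loop only exits on a match).  */
static inline unsigned
last_match_bit_sketch (uint32_t mask3, uint32_t mask4)
{
  uint64_t combined = ((uint64_t) mask3 << 32) | mask4;
  return 63 - (unsigned) __builtin_clzll (combined);
}
```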
* [PATCH v4 7/8] x86: Shrink code size of memchr-avx2.S 2022-06-06 22:37 ` [PATCH v4 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (4 preceding siblings ...) 2022-06-06 22:37 ` [PATCH v4 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein @ 2022-06-06 22:37 ` Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-06 22:37 UTC (permalink / raw) To: libc-alpha This is not meant as a performance optimization. The previous code was far too liberal in aligning targets and wasted code size unnecessarily. The total code size saving is: 59 bytes There are no major changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 0.967 Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memchr-avx2.S | 109 +++++++++++---------- 2 files changed, 60 insertions(+), 50 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S index 87b076c7c4..c4d71938c5 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMCHR __memchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index 75bd7262e0..28a01280ec 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -57,7 +57,7 @@ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 5) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ # ifdef __ILP32__ @@ -87,12 +87,14 @@ ENTRY (MEMCHR) # endif testl %eax, %eax jz L(aligned_more) - tzcntl %eax, %eax + bsfl %eax, %eax addq %rdi, %rax - VZEROUPPER_RETURN +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + # ifndef USE_AS_RAWMEMCHR - .p2align 5 + .p2align 4 L(first_vec_x0): /* Check if first match was before length. */ tzcntl %eax, %eax @@ -100,58 +102,31 @@ L(first_vec_x0): /* NB: Multiply length by 4 to get byte count. */ sall $2, %edx # endif - xorl %ecx, %ecx + COND_VZEROUPPER + /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch + block. branch here as opposed to cmovcc is not that costly. Common + usage of memchr is to check if the return was NULL (if string was + known to contain CHAR user would use rawmemchr). This branch will be + highly correlated with the user branch and can be used by most + modern branch predictors to predict the user branch. */ cmpl %eax, %edx - leaq (%rdi, %rax), %rax - cmovle %rcx, %rax - VZEROUPPER_RETURN - -L(null): - xorl %eax, %eax - ret -# endif - .p2align 4 -L(cross_page_boundary): - /* Save pointer before aligning as its original value is - necessary for computer return address if byte is found or - adjusting length if it is not and this is memchr. */ - movq %rdi, %rcx - /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr - and rdi for rawmemchr. */ - orq $(VEC_SIZE - 1), %ALGN_PTR_REG - VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax -# ifndef USE_AS_RAWMEMCHR - /* Calculate length until end of page (length checked for a - match). */ - leaq 1(%ALGN_PTR_REG), %rsi - subq %RRAW_PTR_REG, %rsi -# ifdef USE_AS_WMEMCHR - /* NB: Divide bytes by 4 to get wchar_t count.
*/ - shrl $2, %esi -# endif -# endif - /* Remove the leading bytes. */ - sarxl %ERAW_PTR_REG, %eax, %eax -# ifndef USE_AS_RAWMEMCHR - /* Check the end of data. */ - cmpq %rsi, %rdx - jbe L(first_vec_x0) + jle L(null) + addq %rdi, %rax + ret # endif - testl %eax, %eax - jz L(cross_page_continue) - tzcntl %eax, %eax - addq %RRAW_PTR_REG, %rax -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 + .p2align 4,, 10 L(first_vec_x1): - tzcntl %eax, %eax + bsfl %eax, %eax incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - +# ifndef USE_AS_RAWMEMCHR + /* First in aligning bytes here. */ +L(null): + xorl %eax, %eax + ret +# endif .p2align 4 L(first_vec_x2): tzcntl %eax, %eax @@ -340,7 +315,7 @@ L(first_vec_x1_check): incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - .p2align 4 + .p2align 4,, 6 L(set_zero_end): xorl %eax, %eax VZEROUPPER_RETURN @@ -428,5 +403,39 @@ L(last_vec_x3): VZEROUPPER_RETURN # endif + .p2align 4 +L(cross_page_boundary): + /* Save pointer before aligning as its original value is necessary for + computer return address if byte is found or adjusting length if it + is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for + rawmemchr. */ + andq $-VEC_SIZE, %ALGN_PTR_REG + VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Calculate length until end of page (length checked for a match). */ + leal VEC_SIZE(%ALGN_PTR_REG), %esi + subl %ERAW_PTR_REG, %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %esi +# endif +# endif + /* Remove the leading bytes. */ + sarxl %ERAW_PTR_REG, %eax, %eax +# ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ + cmpq %rsi, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(cross_page_continue) + bsfl %eax, %eax + addq %RRAW_PTR_REG, %rax + VZEROUPPER_RETURN + + END (MEMCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
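The cmov-to-branch change in L(first_vec_x0) above boils down to the following C shape (names are illustrative; signed-compare details glossed over): the branch costs little because it tends to predict together with the caller's own NULL check. The tzcnt-to-bsf swaps in this patch rely on the mask being known non-zero; bsf is tzcnt without the F3 prefix, saving a byte per use.

```c
#include <stddef.h>

/* match_idx is the bsf result within the first vector; len is the
   remaining length, already scaled to bytes for wmemchr.  */
static inline void *
memchr_tail_branch_sketch (unsigned char *base, unsigned match_idx,
			   unsigned len)
{
  if (len <= match_idx)		/* mirrors "cmpl %eax, %edx; jle L(null)" */
    return NULL;
  return base + match_idx;
}
```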
* [PATCH v4 8/8] x86: Shrink code size of memchr-evex.S 2022-06-06 22:37 ` [PATCH v4 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (5 preceding siblings ...) 2022-06-06 22:37 ` [PATCH v4 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein @ 2022-06-06 22:37 ` Noah Goldstein 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-06 22:37 UTC (permalink / raw) To: libc-alpha This is not meant as a performance optimization. The previous code was far too liberal in aligning targets and wasted code size unnecessarily. The total code size saving is: 64 bytes There are no non-negligible changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 1.000 Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memchr-evex.S | 46 ++++++++++++++------------ 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S index cfaf02907d..0fd11b7632 100644 --- a/sysdeps/x86_64/multiarch/memchr-evex.S +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -88,7 +88,7 @@ # define PAGE_SIZE 4096 .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 6) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ test %RDX_LP, %RDX_LP @@ -131,22 +131,24 @@ L(zero): xorl %eax, %eax ret - .p2align 5 + .p2align 4 L(first_vec_x0): - /* Check if first match was before length. */ - tzcntl %eax, %eax - xorl %ecx, %ecx - cmpl %eax, %edx - leaq (%rdi, %rax, CHAR_SIZE), %rax - cmovle %rcx, %rax + /* Check if first match was before length. NB: tzcnt has false data- + dependency on destination. eax already had a data-dependency on esi + so this should have no effect here. */ + tzcntl %eax, %esi +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rsi, CHAR_SIZE), %rdi +# else + addq %rsi, %rdi +# endif + xorl %eax, %eax + cmpl %esi, %edx + cmovg %rdi, %rax ret -# else - /* NB: first_vec_x0 is 17 bytes which will leave - cross_page_boundary (which is relatively cold) close enough - to ideal alignment. So only realign L(cross_page_boundary) if - rawmemchr. */ - .p2align 4 # endif + + .p2align 4 L(cross_page_boundary): /* Save pointer before aligning as its original value is necessary for computer return address if byte is found or @@ -400,10 +402,14 @@ L(last_2x_vec): L(zero_end): ret +L(set_zero_end): + xorl %eax, %eax + ret .p2align 4 L(first_vec_x1_check): - tzcntl %eax, %eax + /* eax must be non-zero. Use bsfl to save code size. */ + bsfl %eax, %eax /* Adjust length. */ subl $-(CHAR_PER_VEC * 4), %edx /* Check if match within remaining length. */ @@ -412,9 +418,6 @@ L(first_vec_x1_check): /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ret -L(set_zero_end): - xorl %eax, %eax - ret .p2align 4 L(loop_4x_vec_end): @@ -464,7 +467,7 @@ L(loop_4x_vec_end): # endif ret - .p2align 4 + .p2align 4,, 10 L(last_vec_x1_return): tzcntl %eax, %eax # if defined USE_AS_WMEMCHR || RET_OFFSET != 0 @@ -496,6 +499,7 @@ L(last_vec_x3_return): # endif # ifndef USE_AS_RAWMEMCHR + .p2align 4,, 5 L(last_4x_vec_or_less_cmpeq): VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 kmovd %k0, %eax @@ -546,7 +550,7 @@ L(last_4x_vec): # endif andl %ecx, %eax jz L(zero_end2) - tzcntl %eax, %eax + bsfl %eax, %eax leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax L(zero_end2): ret @@ -562,6 +566,6 @@ L(last_vec_x3): leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ret # endif - + /* 7 bytes from next cache line.
*/ END (MEMCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
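For contrast with the avx2 version in 7/8, the EVEX L(first_vec_x0) above keeps the branchless select: %rax is zeroed and cmovg overwrites it only when the length exceeds the match index. Roughly (a sketch with assumed names; the ternary stands in for the cmov):

```c
#include <stddef.h>

static inline void *
memchr_tail_cmov_sketch (unsigned char *base, unsigned match_idx,
			 unsigned len)
{
  unsigned char *hit = base + match_idx;	/* leaq/addq above */
  return len > match_idx ? hit : NULL;		/* xorl + cmovg */
}
```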
* [PATCH v5 1/8] x86: Create header for VEC classes in x86 strings library 2022-06-03 4:42 ` [PATCH v1 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein ` (2 preceding siblings ...) 2022-06-06 22:37 ` [PATCH v4 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein @ 2022-06-07 4:05 ` Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein ` (6 more replies) 2022-06-07 4:11 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 4 siblings, 7 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:05 UTC (permalink / raw) To: libc-alpha This patch does not touch any existing code and is only meant to be a tool for future patches so that simple source files can more easily be maintained to target multiple VEC classes. There is no difference in the objdump of libc.so before and after this patch. --- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 34 ++++++++ sysdeps/x86_64/multiarch/avx-vecs.h | 47 +++++++++++ sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 +++++++++ sysdeps/x86_64/multiarch/evex256-vecs.h | 35 ++++++++ sysdeps/x86_64/multiarch/evex512-vecs.h | 35 ++++++++ sysdeps/x86_64/multiarch/sse2-vecs.h | 47 +++++++++++ sysdeps/x86_64/multiarch/vec-macros.h | 90 +++++++++++++++++++++ 7 files changed, 327 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h new file mode 100644 index 0000000000..3f531dd47f --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -0,0 +1,34 @@ +/* Common config for AVX-RTM VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX_RTM_VECS_H +#define _AVX_RTM_VECS_H 1 + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define USE_WITH_RTM 1 +#include "avx-vecs.h" + +#undef SECTION +#define SECTION(p) p##.avx.rtm + +#endif diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h new file mode 100644 index 0000000000..89680f5db8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-vecs.h @@ -0,0 +1,47 @@ +/* Common config for AVX VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX_VECS_H +#define _AVX_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 32 +#include "vec-macros.h" + +#define USE_WITH_AVX 1 +#define SECTION(p) p##.avx + +/* 4-byte mov instructions with AVX2. */ +#define MOV_SIZE 4 +/* 1 (ret) + 3 (vzeroupper). */ +#define RET_SIZE 4 +#define VZEROUPPER vzeroupper + +#define VMOVU vmovdqu +#define VMOVA vmovdqa +#define VMOVNT vmovntdq + +/* Often need to access xmm portion. */ +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h new file mode 100644 index 0000000000..99806ebcd7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex-vecs-common.h @@ -0,0 +1,39 @@ +/* Common config for EVEX256 and EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX_VECS_COMMON_H +#define _EVEX_VECS_COMMON_H 1 + +#include "vec-macros.h" + +/* 6-byte mov instructions with EVEX. */ +#define MOV_SIZE 6 +/* No vzeroupper needed. */ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU vmovdqu64 +#define VMOVA vmovdqa64 +#define VMOVNT vmovntdq + +#define VEC_xmm VEC_hi_xmm +#define VEC_ymm VEC_hi_ymm +#define VEC_zmm VEC_hi_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h new file mode 100644 index 0000000000..222ba46dc7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h @@ -0,0 +1,35 @@ +/* Common config for EVEX256 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX256_VECS_H +#define _EVEX256_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 32 +#include "evex-vecs-common.h" + +#define USE_WITH_EVEX256 1 +#define SECTION(p) p##.evex + +#define VEC VEC_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h new file mode 100644 index 0000000000..d1784d5368 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h @@ -0,0 +1,35 @@ +/* Common config for EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX512_VECS_H +#define _EVEX512_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 64 +#include "evex-vecs-common.h" + +#define USE_WITH_EVEX512 1 +#define SECTION(p) p##.evex512 + +#define VEC VEC_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h new file mode 100644 index 0000000000..2b77a59d56 --- /dev/null +++ b/sysdeps/x86_64/multiarch/sse2-vecs.h @@ -0,0 +1,47 @@ +/* Common config for SSE2 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _SSE2_VECS_H +#define _SSE2_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 16 +#include "vec-macros.h" + +#define USE_WITH_SSE2 1 +#define SECTION(p) p + +/* 3-byte mov instructions with SSE2. */ +#define MOV_SIZE 3 +/* No vzeroupper needed. 
*/ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU movups +#define VMOVA movaps +#define VMOVNT movntdq + +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_xmm + + +#endif diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h new file mode 100644 index 0000000000..9f3ffecede --- /dev/null +++ b/sysdeps/x86_64/multiarch/vec-macros.h @@ -0,0 +1,90 @@ +/* Macro helpers for VEC_{type}({vec_num}) + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _VEC_MACROS_H +#define _VEC_MACROS_H 1 + +#ifndef VEC_SIZE +# error "Never include this file directly. Always include a vector config." +#endif + +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same + VEC(N) values. */ +#define VEC_hi_xmm0 xmm16 +#define VEC_hi_xmm1 xmm17 +#define VEC_hi_xmm2 xmm18 +#define VEC_hi_xmm3 xmm19 +#define VEC_hi_xmm4 xmm20 +#define VEC_hi_xmm5 xmm21 +#define VEC_hi_xmm6 xmm22 +#define VEC_hi_xmm7 xmm23 +#define VEC_hi_xmm8 xmm24 +#define VEC_hi_xmm9 xmm25 +#define VEC_hi_xmm10 xmm26 +#define VEC_hi_xmm11 xmm27 +#define VEC_hi_xmm12 xmm28 +#define VEC_hi_xmm13 xmm29 +#define VEC_hi_xmm14 xmm30 +#define VEC_hi_xmm15 xmm31 + +#define VEC_hi_ymm0 ymm16 +#define VEC_hi_ymm1 ymm17 +#define VEC_hi_ymm2 ymm18 +#define VEC_hi_ymm3 ymm19 +#define VEC_hi_ymm4 ymm20 +#define VEC_hi_ymm5 ymm21 +#define VEC_hi_ymm6 ymm22 +#define VEC_hi_ymm7 ymm23 +#define VEC_hi_ymm8 ymm24 +#define VEC_hi_ymm9 ymm25 +#define VEC_hi_ymm10 ymm26 +#define VEC_hi_ymm11 ymm27 +#define VEC_hi_ymm12 ymm28 +#define VEC_hi_ymm13 ymm29 +#define VEC_hi_ymm14 ymm30 +#define VEC_hi_ymm15 ymm31 + +#define VEC_hi_zmm0 zmm16 +#define VEC_hi_zmm1 zmm17 +#define VEC_hi_zmm2 zmm18 +#define VEC_hi_zmm3 zmm19 +#define VEC_hi_zmm4 zmm20 +#define VEC_hi_zmm5 zmm21 +#define VEC_hi_zmm6 zmm22 +#define VEC_hi_zmm7 zmm23 +#define VEC_hi_zmm8 zmm24 +#define VEC_hi_zmm9 zmm25 +#define VEC_hi_zmm10 zmm26 +#define VEC_hi_zmm11 zmm27 +#define VEC_hi_zmm12 zmm28 +#define VEC_hi_zmm13 zmm29 +#define VEC_hi_zmm14 zmm30 +#define VEC_hi_zmm15 zmm31 + +#define PRIMITIVE_VEC(vec, num) vec##num + +#define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) +#define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) +#define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) + +#define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) +#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) +#define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) + +#endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
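A quick way to see how the new headers compose (illustration only; assumes compilation from sysdeps/x86_64/multiarch so the include resolves): token pasting in PRIMITIVE_VEC plus the VEC_hi_* tables lets one source file spell VEC(1) and get xmm1, ymm1, or zmm17 depending on which config it includes.

```c
#include "evex512-vecs.h"

#define STR_(x) #x
#define STR(x) STR_(x)

/* Expands to "zmm17": VEC (1) -> VEC_zmm (1) -> VEC_hi_zmm (1)
   -> PRIMITIVE_VEC (VEC_hi_zmm, 1) -> VEC_hi_zmm1 -> zmm17.
   With sse2-vecs.h instead, the same VEC (1) would give "xmm1".  */
const char *vec1_name = STR (VEC (1));
```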
* [PATCH v5 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` 2022-06-07 4:05 ` [PATCH v5 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein @ 2022-06-07 4:05 ` Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein ` (5 subsequent siblings) 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:05 UTC (permalink / raw) To: libc-alpha The RTM vzeroupper mitigation has no way of replacing an inline vzeroupper that is not directly before a return. Being able to do so can be useful when hoisting a vzeroupper to save code size, for example: ``` L(foo): cmpl %eax, %edx jz L(bar) tzcntl %eax, %eax addq %rdi, %rax VZEROUPPER_RETURN L(bar): xorl %eax, %eax VZEROUPPER_RETURN ``` Can become: ``` L(foo): COND_VZEROUPPER cmpl %eax, %edx jz L(bar) tzcntl %eax, %eax addq %rdi, %rax ret L(bar): xorl %eax, %eax ret ``` This code does not change any existing functionality. There is no difference in the objdump of libc.so before and after this patch. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> --- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 1 + sysdeps/x86_64/sysdep.h | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h index 3f531dd47f..6ca9f5e6ba 100644 --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -20,6 +20,7 @@ #ifndef _AVX_RTM_VECS_H #define _AVX_RTM_VECS_H 1 +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index f14d50786d..4f512d5566 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -106,6 +106,24 @@ lose: \ vzeroupper; \ ret +/* Can be used to replace vzeroupper that is not directly before a + return. This is useful when hoisting a vzeroupper from multiple + return paths to decrease the total number of vzerouppers and code + size. */ +#define COND_VZEROUPPER_XTEST \ + xtest; \ + jz 1f; \ + vzeroall; \ + jmp 2f; \ +1: \ + vzeroupper; \ +2: + +/* In RTM define this as COND_VZEROUPPER_XTEST. */ +#ifndef COND_VZEROUPPER +# define COND_VZEROUPPER vzeroupper +#endif + /* Zero upper vector registers and return. */ #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN # define ZERO_UPPER_VEC_REGISTERS_RETURN \ -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
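For readers less familiar with the RTM wrinkle behind COND_VZEROUPPER_XTEST: on the affected processors, vzeroupper inside a transaction can abort it, so the XTEST variant zeroes the registers with vzeroall instead when executing transactionally. A rough C rendering of the macro's logic (an illustration only, not the real macro, which is of course assembly; needs -mavx -mrtm and an RTM-capable CPU, since xtest faults elsewhere):

```
#include <stdio.h>
#include <immintrin.h>

/* C model of COND_VZEROUPPER_XTEST: xtest sets ZF when *not* in a
   transaction, so the jz path runs vzeroupper and the transactional
   fall-through uses vzeroall instead.  */
static inline void cond_vzeroupper (void)
{
  if (_xtest ())          /* Currently in an RTM/HLE region.  */
    _mm256_zeroall ();    /* vzeroall: safe inside a transaction.  */
  else
    _mm256_zeroupper ();  /* vzeroupper: the cheap path otherwise.  */
}

int main (void)
{
  cond_vzeroupper ();     /* Outside a transaction: takes the
                             vzeroupper path.  */
  puts ("ok");
  return 0;
}
```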
* [PATCH v5 3/8] Benchtests: Improve memrchr benchmarks 2022-06-07 4:05 ` [PATCH v5 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein @ 2022-06-07 4:05 ` Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein ` (4 subsequent siblings) 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:05 UTC (permalink / raw) To: libc-alpha Add a second iteration for memrchr to set `pos` starting from the end of the buffer. Previously `pos` was only set relative to the beginning of the buffer, which isn't really useful for memrchr because its search space begins at the end of the buffer (buf + len). --- benchtests/bench-memchr.c | 110 ++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 45 deletions(-) diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c index 4d7212332f..0facda2fa0 100644 --- a/benchtests/bench-memchr.c +++ b/benchtests/bench-memchr.c @@ -76,7 +76,7 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c, static void do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, - int seek_char) + int seek_char, int invert_pos) { size_t i; @@ -96,7 +96,10 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, if (pos < len) { - buf[align + pos] = seek_char; + if (invert_pos) + buf[align + len - pos] = seek_char; + else + buf[align + pos] = seek_char; buf[align + len] = -seek_char; } else @@ -109,6 +112,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, json_attr_uint (json_ctx, "pos", pos); json_attr_uint (json_ctx, "len", len); json_attr_uint (json_ctx, "seek_char", seek_char); + json_attr_uint (json_ctx, "invert_pos", invert_pos); json_array_begin (json_ctx, "timings"); @@ -123,6 +127,7 @@ int test_main (void) { size_t i; + int repeats; json_ctx_t json_ctx; test_init (); @@ -142,53 +147,68 @@ test_main (void) json_array_begin (&json_ctx, "results"); - for (i = 1; i < 8; ++i) + for (repeats = 0; repeats < 2; ++repeats) { - do_test (&json_ctx, 0, 16 << i, 2048, 23); - do_test (&json_ctx, i, 64, 256, 23); - do_test (&json_ctx, 0, 16 << i, 2048, 0); - do_test (&json_ctx, i, 64, 256, 0); - - do_test (&json_ctx, getpagesize () - 15, 64, 256, 0); + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats); + do_test (&json_ctx, i, 64, 256, 23, repeats); + do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats); + do_test (&json_ctx, i, 64, 256, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats); #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. */ - do_test (&json_ctx, 0, i, 256, 23); - do_test (&json_ctx, 0, i, 256, 0); - do_test (&json_ctx, i, i, 256, 23); - do_test (&json_ctx, i, i, 256, 0); + /* Also test the position close to the beginning for memrchr. 
*/ + do_test (&json_ctx, 0, i, 256, 23, repeats); + do_test (&json_ctx, 0, i, 256, 0, repeats); + do_test (&json_ctx, i, i, 256, 23, repeats); + do_test (&json_ctx, i, i, 256, 0, repeats); #endif - } - for (i = 1; i < 8; ++i) - { - do_test (&json_ctx, i, i << 5, 192, 23); - do_test (&json_ctx, i, i << 5, 192, 0); - do_test (&json_ctx, i, i << 5, 256, 23); - do_test (&json_ctx, i, i << 5, 256, 0); - do_test (&json_ctx, i, i << 5, 512, 23); - do_test (&json_ctx, i, i << 5, 512, 0); - - do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23); - } - for (i = 1; i < 32; ++i) - { - do_test (&json_ctx, 0, i, i + 1, 23); - do_test (&json_ctx, 0, i, i + 1, 0); - do_test (&json_ctx, i, i, i + 1, 23); - do_test (&json_ctx, i, i, i + 1, 0); - do_test (&json_ctx, 0, i, i - 1, 23); - do_test (&json_ctx, 0, i, i - 1, 0); - do_test (&json_ctx, i, i, i - 1, 23); - do_test (&json_ctx, i, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0); + } + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, i, i << 5, 192, 23, repeats); + do_test (&json_ctx, i, i << 5, 192, 0, repeats); + do_test (&json_ctx, i, i << 5, 256, 23, repeats); + do_test (&json_ctx, i, i << 5, 256, 0, repeats); + do_test (&json_ctx, i, i << 5, 512, 23, repeats); + do_test (&json_ctx, i, i << 5, 512, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats); + } + for (i = 1; i < 32; ++i) + { + do_test (&json_ctx, 0, i, i + 1, 23, repeats); + do_test (&json_ctx, 0, i, i + 1, 0, repeats); + do_test (&json_ctx, i, i, i + 1, 23, repeats); + do_test (&json_ctx, i, i, i + 1, 0, repeats); + do_test (&json_ctx, 0, i, i - 1, 23, repeats); + do_test (&json_ctx, 0, i, i - 1, 0, repeats); + do_test (&json_ctx, i, i, i - 1, 23, repeats); + do_test (&json_ctx, i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0, repeats); + #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. */ - do_test (&json_ctx, 0, 1, i + 1, 23); - do_test (&json_ctx, 0, 2, i + 1, 0); + do_test (&json_ctx, 0, 1, i + 1, 23, repeats); + do_test (&json_ctx, 0, 2, i + 1, 0, repeats); +#endif + } +#ifndef USE_AS_MEMRCHR + break; #endif } -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
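To make the new parameter concrete, here is a minimal standalone C program mirroring the placement logic of the patch (illustration only; it ignores `align` and uses a fixed length for brevity) that shows where the seek character lands with and without invert_pos:

```
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

int main (void)
{
  char buf[64];
  size_t len = sizeof buf, pos = 5;

  for (int invert_pos = 0; invert_pos < 2; ++invert_pos)
    {
      memset (buf, 'a', sizeof buf);
      /* Mirrors the patch: with invert_pos the match is placed pos bytes
         from the end, which is where memrchr's search actually starts.  */
      if (invert_pos)
        buf[len - pos] = 'X';
      else
        buf[pos] = 'X';
      char *hit = memrchr (buf, 'X', len);
      printf ("invert_pos=%d -> match at offset %td\n", invert_pos,
              hit - buf);
    }
  return 0;
}
```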
* [PATCH v5 4/8] x86: Optimize memrchr-sse2.S 2022-06-07 4:05 ` [PATCH v5 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein @ 2022-06-07 4:05 ` Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 5/8] x86: Optimize memrchr-evex.S Noah Goldstein ` (3 subsequent siblings) 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:05 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller lengths more. 2. optimizes target placement more carefully. 3. reuses logic more. 4. fixes up various inefficiencies in the logic. The total code size saving is: 394 bytes Geometric Mean of all benchmarks New / Old: 0.874 Regressions: 1. The page cross case is now colder, especially re-entry from the page cross case if a match is not found in the first VEC (roughly 50%). My general opinion is that this is acceptable given the "coldness" of this case (less than 4% of inputs) and the general performance improvement in the other, far more common cases. 2. There are some 5-15% regressions for medium/large user-arg lengths that have a match in the first VEC. This is because the logic was rewritten to optimize finds in the first VEC if the user-arg length is shorter (where we see roughly 20-50% performance improvements). This is not always a regression; my intuition is that some frontend quirk partially explains the data, although I haven't been able to find the root cause. Full xcheck passes on x86_64. --- sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++-------------------- 1 file changed, 292 insertions(+), 321 deletions(-) diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index d1a9f47911..b0dffd2ae2 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -18,362 +18,333 @@ <https://www.gnu.org/licenses/>. */ #include <sysdep.h> +#define VEC_SIZE 16 +#define PAGE_SIZE 4096 .text -ENTRY (__memrchr) - movd %esi, %xmm1 - - sub $16, %RDX_LP - jbe L(length_less16) - - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add %RDX_LP, %RDI_LP - pshufd $0, %xmm1, %xmm1 - - movdqu (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - -/* Check if there is a match. 
*/ - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - mov %edi, %ecx - and $15, %ecx - jz L(loop_prolog) - - add $16, %rdi - add $16, %rdx - and $-16, %rdi - sub %rcx, %rdx - - .p2align 4 -L(loop_prolog): - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches0) - - mov %edi, %ecx - and $63, %ecx - jz L(align64_loop) - - add $64, %rdi - add $64, %rdx - and $-64, %rdi - sub %rcx, %rdx - - .p2align 4 -L(align64_loop): - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm3, %xmm0 - pmaxub %xmm4, %xmm2 - pmaxub %xmm0, %xmm2 - pmovmskb %xmm2, %eax - - test %eax, %eax - jz L(align64_loop) - - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches48) - - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm2 - - pcmpeqb %xmm1, %xmm2 - pcmpeqb (%rdi), %xmm1 - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - pmovmskb %xmm1, %eax - bsr %eax, %eax - - add %rdi, %rax +ENTRY_P2ALIGN(__memrchr, 6) +#ifdef __ILP32__ + /* Clear upper bits. */ + mov %RDX_LP, %RDX_LP +#endif + movd %esi, %xmm0 + + /* Get end pointer. */ + leaq (%rdx, %rdi), %rcx + + punpcklbw %xmm0, %xmm0 + punpcklwd %xmm0, %xmm0 + pshufd $0, %xmm0, %xmm0 + + /* Check if we can load 1x VEC without crossing a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %ecx + jz L(page_cross) + + /* NB: This load happens regardless of whether rdx (len) is zero. Since + it doesn't cross a page and the standard guarantees any valid pointer + has at least one valid byte, this load must be safe. For the entire + history of the x86 memrchr implementation this has been possible so + no code "should" be relying on a zero-length check before this load. + The zero-length check is moved to the page cross case because it is + 1) pretty cold and 2) including it pushes the hot case + (len <= VEC_SIZE) onto two cache lines. */ + movups -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is + zero. */ + bsrl %eax, %eax + jz L(ret_0) + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here + if out of bounds. */ + addl %edx, %eax + jl L(zero_0) + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base + ptr. 
*/ + addq %rdi, %rax +L(ret_0): ret - .p2align 4 -L(exit_loop): - add $64, %edx - cmp $32, %edx - jbe L(exit_loop_32) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16_1) - cmp $48, %edx - jbe L(return_null) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches0_1) - xor %eax, %eax + .p2align 4,, 5 +L(ret_vec_x0): + bsrl %eax, %eax + leaq -(VEC_SIZE)(%rcx, %rax), %rax ret - .p2align 4 -L(exit_loop_32): - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48_1) - cmp $16, %edx - jbe L(return_null) - - pcmpeqb 32(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches32_1) - xor %eax, %eax + .p2align 4,, 2 +L(zero_0): + xorl %eax, %eax ret - .p2align 4 -L(matches0): - bsr %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax - ret - .p2align 4 -L(matches32): - bsr %eax, %eax - lea 32(%rax, %rdi), %rax + .p2align 4,, 8 +L(more_1x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) + + /* Align rcx (pointer to string). */ + decq %rcx + andq $-VEC_SIZE, %rcx + + movq %rcx, %rdx + /* NB: We could consistently save 1 byte in this pattern with `movaps + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is + it adds more frontend uops (even if the moves can be eliminated) and + some percentage of the time actual backend uops. */ + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + subq %rdi, %rdx + pmovmskb %xmm1, %eax + + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +L(last_2x_vec): + subl $VEC_SIZE, %edx + jbe L(ret_vec_x0_test) + + testl %eax, %eax + jnz L(ret_vec_x0) + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_1) + addl %edx, %eax + jl L(zero_0) + addq %rdi, %rax +L(ret_1): ret + /* Don't align. Otherwise losing the 2-byte encoding of the jump to + L(page_cross) causes the hot path (length <= VEC_SIZE) to span + multiple cache lines. Naturally aligned % 16 to 8-bytes. */ +L(page_cross): + /* Zero length check. */ + testq %rdx, %rdx + jz L(zero_0) + + leaq -1(%rcx), %r8 + andq $-(VEC_SIZE), %r8 + + movaps (%r8), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + negl %ecx + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count + explicitly. */ + andl $(VEC_SIZE - 1), %ecx + shl %cl, %esi + movzwl %si, %eax + leaq (%rdi, %rdx), %rcx + cmpq %rdi, %r8 + ja L(more_1x_vec) + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_2) + addl %edx, %eax + jl L(zero_1) + addq %rdi, %rax +L(ret_2): ret - .p2align 4 -L(matches48): - bsr %eax, %eax - lea 48(%rax, %rdi), %rax + /* Fits in aligning bytes. 
*/ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(matches16_1): - bsr %eax, %eax - sub $48, %rdx - add %rax, %rdx - jl L(return_null) - lea 16(%rdi, %rax), %rax + .p2align 4,, 5 +L(ret_vec_x1): + bsrl %eax, %eax + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(matches32_1): - bsr %eax, %eax - sub $32, %rdx - add %rax, %rdx - jl L(return_null) - lea 32(%rdi, %rax), %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) - .p2align 4 -L(matches48_1): - bsr %eax, %eax - sub $16, %rdx - add %rax, %rdx - jl L(return_null) - lea 48(%rdi, %rax), %rax - ret + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + testl %eax, %eax + jnz L(ret_vec_x1) - .p2align 4 -L(return_null): - xor %eax, %eax - ret - .p2align 4 -L(length_less16_offset0): - test %edx, %edx - jz L(return_null) + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - mov %dl, %cl - pcmpeqb (%rdi), %xmm1 + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) - mov $1, %edx - sal %cl, %edx - sub $1, %edx + addl $(VEC_SIZE), %edx + jle L(ret_vec_x2_test) - pmovmskb %xmm1, %eax +L(last_vec): + testl %eax, %eax + jnz L(ret_vec_x2) - and %edx, %eax - test %eax, %eax - jz L(return_null) + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - bsr %eax, %eax - add %rdi, %rax + subl $(VEC_SIZE), %edx + bsrl %eax, %eax + jz L(ret_3) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax +L(ret_3): ret - .p2align 4 -L(length_less16): - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add $16, %edx - - pshufd $0, %xmm1, %xmm1 - - mov %edi, %ecx - and $15, %ecx - jz L(length_less16_offset0) - - mov %cl, %dh - mov %ecx, %esi - add %dl, %dh - and $-16, %rdi - - sub $16, %dh - ja L(length_less16_part2) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - - sar %cl, %eax - mov %dl, %cl - - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax - test %eax, %eax - jz L(return_null) - - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 6 +L(ret_vec_x2_test): + bsrl %eax, %eax + jz L(zero_2) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax ret - .p2align 4 -L(length_less16_part2): - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - - mov %dh, %cl - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax +L(zero_2): + xorl %eax, %eax + ret - test %eax, %eax - jnz L(length_less16_part2_return) - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax + .p2align 4,, 5 +L(ret_vec_x2): + bsrl %eax, %eax + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - mov %esi, %ecx - sar %cl, %eax - test %eax, %eax - jz L(return_null) + .p2align 4,, 5 +L(ret_vec_x3): + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 8 +L(more_4x_vec): + testl %eax, %eax + jnz L(ret_vec_x2) + + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_x3) + + addq $-(VEC_SIZE * 4), %rcx + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) + + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end + keeping the code from spilling to the next cache line. 
*/ + addq $(VEC_SIZE * 4 - 1), %rcx + andq $-(VEC_SIZE * 4), %rcx + leaq (VEC_SIZE * 4)(%rdi), %rdx + andq $-(VEC_SIZE * 4), %rdx + + .p2align 4,, 11 +L(loop_4x_vec): + movaps (VEC_SIZE * -1)(%rcx), %xmm1 + movaps (VEC_SIZE * -2)(%rcx), %xmm2 + movaps (VEC_SIZE * -3)(%rcx), %xmm3 + movaps (VEC_SIZE * -4)(%rcx), %xmm4 + pcmpeqb %xmm0, %xmm1 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm0, %xmm4 + + por %xmm1, %xmm2 + por %xmm3, %xmm4 + por %xmm2, %xmm4 + + pmovmskb %xmm4, %esi + testl %esi, %esi + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rcx + cmpq %rdx, %rcx + jne L(loop_4x_vec) + + subl %edi, %edx + + /* Ends up being 1-byte nop. */ + .p2align 4,, 2 +L(last_4x_vec): + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + testl %eax, %eax + jnz L(ret_vec_x0) + + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_end) + + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $(VEC_SIZE * 3), %edx + ja L(last_vec) + bsrl %eax, %eax + jz L(ret_4) + addl %edx, %eax + jl L(zero_3) + addq %rdi, %rax +L(ret_4): ret - .p2align 4 -L(length_less16_part2_return): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax + /* Ends up being 1-byte nop. */ + .p2align 4,, 3 +L(loop_end): + pmovmskb %xmm1, %eax + sall $16, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm2, %eax + testl %eax, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm3, %eax + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + sall $16, %eax + orl %esi, %eax + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax ret -END (__memrchr) +L(ret_vec_end): + bsrl %eax, %eax + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax + ret + /* Use in L(last_4x_vec). In the same cache line. These are just spare + aligning bytes. */ +L(zero_3): + xorl %eax, %eax + ret + /* 2-bytes from next cache line. */ +END(__memrchr) weak_alias (__memrchr, memrchr) -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
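The recurring pattern in this rewrite -- bsr on the match mask, add the already-decremented length, branch on sign -- is easier to see in scalar form. Here is a C model of L(ret_vec_x0_test) (a sketch, not the implementation; mask stands in for the 16-bit pmovmskb result of the vector loaded at buf + len - 16):

```
#include <stddef.h>

/* Scalar model of L(ret_vec_x0_test): bit i of mask corresponds to byte
   buf[len - 16 + i].  After the earlier `subq $VEC_SIZE, %rdx`, edx holds
   len - 16, so bsr(mask) + (len - 16) going negative means the highest
   match lies before buf, i.e. outside the user's range.  */
static const char *
ret_vec_x0_test_model (const char *buf, size_t len, unsigned int mask)
{
  if (mask == 0)
    return NULL;                        /* bsr leaves its dest unchanged. */
  int bsr = 31 - __builtin_clz (mask);  /* Highest set bit of the mask.  */
  long off = (long) len - 16 + bsr;
  return off < 0 ? NULL : buf + off;
}
```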
* [PATCH v5 5/8] x86: Optimize memrchr-evex.S 2022-06-07 4:05 ` [PATCH v5 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (2 preceding siblings ...) 2022-06-07 4:05 ` [PATCH v5 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein @ 2022-06-07 4:05 ` Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein ` (2 subsequent siblings) 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:05 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller user-arg lengths more. 2. optimizes target placement more carefully 3. reuses logic more 4. fixes up various inefficiencies in the logic. The biggest case here is the `lzcnt` logic for checking returns which saves either a branch or multiple instructions. The total code size saving is: 263 bytes Geometric Mean of all benchmarks New / Old: 0.755 Regressions: There are some regressions. Particularly where the length (user arg length) is large but the position of the match char is near the beginning of the string (in first VEC). This case has roughly a 20% regression. This is because the new logic gives the hot path for immediate matches to shorter lengths (the more common input). The short-length case has roughly a 35% speedup. Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++------------ 1 file changed, 268 insertions(+), 271 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S index 0b99709c6b..f0bc4f175a 100644 --- a/sysdeps/x86_64/multiarch/memrchr-evex.S +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S @@ -19,319 +19,316 @@ #if IS_IN (libc) # include <sysdep.h> +# include "evex256-vecs.h" +# if VEC_SIZE != 32 +# error "VEC_SIZE != 32 unimplemented" +# endif + +# ifndef MEMRCHR +# define MEMRCHR __memrchr_evex +# endif + +# define PAGE_SIZE 4096 +# define VECMATCH VEC(0) + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN(MEMRCHR, 6) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) + + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up the end ptr so + that the lzcnt result can be subtracted from it directly. */ + leaq -1(%rdi, %rdx), %rax + vpbroadcastb %esi, %VECMATCH + + /* Check if we can load 1x VEC without crossing a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + + /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which + will guarantee edx (len) is less than it. */ + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret -# define VMOVA vmovdqa64 - -# define YMMMATCH ymm16 - -# define VEC_SIZE 32 - - .section .text.evex,"ax",@progbits -ENTRY (__memrchr_evex) - /* Broadcast CHAR to YMMMATCH. */ - vpbroadcastb %esi, %YMMMATCH - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP - - /* Check the last VEC_SIZE bytes. 
*/ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx - - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 - kord %k1, %k2, %k5 - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 - - kord %k3, %k4, %k6 - kortestd %k5, %k6 - jz L(loop_4x_vec) - - /* There is a match. */ - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - kmovd %k1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0_dec): + decq %rax +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax ret - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) + /* Align rax (pointer to string). */ + andq $-VEC_SIZE, %rax - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) + /* Recompute length after aligning. */ + movq %rax, %rdx - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - ret + subq %rdi, %rdx - .p2align 4 + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) + + /* Must dec rax because L(ret_vec_x0_test) expects it. 
*/ + decq %rax cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax ret - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which in turn is necessary for the hot path (len <= VEC_SIZE) to + fit in the first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpb $0, (%rsi), %VECMATCH, %k0 + kmovd %k0, %r8d + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %ecx + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ + notl %ecx + shlxl %ecx, %r8d, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_1) + subq %rcx, %rax ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax + /* Continue creating zero labels that fit in aligning bytes and get + 2-byte encoding / are in the same cache line as condition. */ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + .p2align 4,, 8 +L(ret_vec_x1): + /* This will naturally add 32 to position. */ + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - ret + vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - .p2align 4 -L(zero): - xorl %eax, %eax + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx - - kmovd %k1, %eax - - /* Remove the trailing bytes. 
*/ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 8 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx - - /* Check for zero length. */ - testl %edx, %edx - jz L(zero) - - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) - - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + .p2align 4,, 8 +L(ret_vec_x2): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + .p2align 4,, 8 +L(ret_vec_x3): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - /* Check the last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax + .p2align 4,, 8 +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + testl %ecx, %ecx + jnz L(ret_vec_x3) - andl %edx, %eax - testl %eax, %eax - jz L(zero) + /* Check if near end before re-aligning (otherwise might do an + unnecessary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - ret + decq %rax + andq $-(VEC_SIZE * 4), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + andq $-(VEC_SIZE * 4), %rdx .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx - - /* Check the last VEC. */ - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +L(loop_4x_vec): + /* Store 1 where not-equal and 0 where equal in k1 (used to mask later + on). */ + vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1 + + /* VEC(2/3) will have zero-byte where we found a CHAR. */ + vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2) + vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3) + vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4 + + /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where + CHAR is found and VEC(2/3) have zero-byte where CHAR is found). */ + vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z} + vptestnmb %VEC(3), %VEC(3), %k2 + + /* Any 1s and we found CHAR. */ + kortestd %k2, %k4 + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) + + /* Need to re-adjust rdx / rax for L(last_4x_vec). */ + subq $-(VEC_SIZE * 4), %rdx + movq %rdx, %rax + subl %edi, %edx +L(last_4x_vec): + + /* Used no matter what. */ + vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - kmovd %k1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jnz L(last_vec_x1) + vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - /* Check the second last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 + testl %ecx, %ecx + jnz L(ret_vec_x1) - movl %r8d, %ecx + /* Used no matter what. */ + vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - kmovd %k1, %eax + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. 
*/ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret_1) + xorl %eax, %eax +L(ret_1): + ret - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 6 +L(loop_end): + kmovd %k1, %ecx + notl %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vptestnmb %VEC(2), %VEC(2), %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + kmovd %k2, %ecx + kmovd %k4, %esi + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + addq %rcx, %rax + ret + .p2align 4,, 4 +L(ret_vec_x0_end): + addq $(VEC_SIZE), %rax +L(ret_vec_x1_end): + bsrl %ecx, %ecx + leaq (VEC_SIZE * 2)(%rax, %rcx), %rax ret -END (__memrchr_evex) + +END(MEMRCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
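The lzcnt trick called out in the commit message can likewise be modelled in C. For the len <= VEC_SIZE path, rax holds buf + len - 1 and mask is the 32-bit kmovd result for the vector ending at buf + len (a sketch of the idea, not the implementation):

```
#include <stddef.h>

/* Scalar model of L(ret_vec_x0_test) in memrchr-evex: the highest match
   sits lzcnt(mask) bytes below buf + len - 1.  Because lzcnt of 0 is
   defined as 32 (= VEC_SIZE), and len <= VEC_SIZE on this path, the
   no-match case falls out of the same len <= lzcnt comparison with no
   extra branch -- that is what saves a branch over bsr.  */
static const char *
ret_vec_x0_test_model (const char *buf, unsigned int len, unsigned int mask)
{
  unsigned int lz = mask ? __builtin_clz (mask) : 32;  /* lzcntl */
  if (len <= lz)                 /* cmpl %ecx, %edx; jle L(zero_0) */
    return NULL;
  return buf + len - 1 - lz;     /* subq %rcx, %rax */
}
```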
* [PATCH v5 6/8] x86: Optimize memrchr-avx2.S 2022-06-07 4:05 ` [PATCH v5 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (3 preceding siblings ...) 2022-06-07 4:05 ` [PATCH v5 5/8] x86: Optimize memrchr-evex.S Noah Goldstein @ 2022-06-07 4:05 ` Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:05 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller user-arg lengths more. 2. optimizes target placement more carefully 3. reuses logic more 4. fixes up various inefficiencies in the logic. The biggest case here is the `lzcnt` logic for checking returns which saves either a branch or multiple instructions. The total code size saving is: 306 bytes Geometric Mean of all benchmarks New / Old: 0.760 Regressions: There are some regressions. Particularly where the length (user arg length) is large but the position of the match char is near the beginning of the string (in first VEC). This case has roughly a 10-20% regression. This is because the new logic gives the hot path for immediate matches to shorter lengths (the more common input). The short-length case has roughly a 15-45% speedup. Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memrchr-avx2.S | 534 ++++++++++---------- 2 files changed, 257 insertions(+), 278 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S index cea2d2a72d..5e9beeeef2 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMRCHR __memrchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index ba2ce7cb03..7d11a41618 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -21,340 +21,318 @@ # include <sysdep.h> # ifndef MEMRCHR -# define MEMRCHR __memrchr_avx2 +# define MEMRCHR __memrchr_avx2 # endif # ifndef VZEROUPPER -# define VZEROUPPER vzeroupper +# define VZEROUPPER vzeroupper # endif # ifndef SECTION # define SECTION(p) p##.avx # endif -# define VEC_SIZE 32 +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + .section SECTION(.text), "ax", @progbits +ENTRY(MEMRCHR) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) - .section SECTION(.text),"ax",@progbits -ENTRY (MEMRCHR) - /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 - vpbroadcastb %xmm0, %ymm0 - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP - - /* Check the last VEC_SIZE bytes. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x0) + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up the end ptr so + that the lzcnt result can be subtracted from it directly. */ + leaq -1(%rdx, %rdi), %rax - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) + vpbroadcastb %xmm0, %ymm0 - /* Align data for aligned loads in the loop. 
*/ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx + /* Check if we can load 1x VEC without crossing a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) + +L(ret_vec_x0_test): + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which + will guarantee edx (len) is less than it. */ + lzcntl %ecx, %ecx + + /* Hoist vzeroupper (not great for RTM) to save code size. This allows + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vmovdqa (%rdi), %ymm1 - vmovdqa VEC_SIZE(%rdi), %ymm2 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 - - vpcmpeqb %ymm1, %ymm0, %ymm1 - vpcmpeqb %ymm2, %ymm0, %ymm2 - vpcmpeqb %ymm3, %ymm0, %ymm3 - vpcmpeqb %ymm4, %ymm0, %ymm4 - - vpor %ymm1, %ymm2, %ymm5 - vpor %ymm3, %ymm4, %ymm6 - vpor %ymm5, %ymm6, %ymm5 - - vpmovmskb %ymm5, %eax - testl %eax, %eax - jz L(loop_4x_vec) - - /* There is a match. */ - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpmovmskb %ymm1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax L(return_vzeroupper): ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) - - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Align rax (string pointer). 
*/ + andq $-VEC_SIZE, %rax + + /* Recompute remaining length after aligning. */ + movq %rax, %rdx + /* Need this comparison next no matter what. */ + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 + subq %rdi, %rdx + decq %rax + vpmovmskb %ymm1, %ecx + /* Fall through for short (hotter than length). */ + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which in turn is necessary for the hot path (len <= VEC_SIZE) to + fit in the first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpeqb (%rsi), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %r8d + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ + notl %r8d + shlxl %r8d, %ecx, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret + .p2align 4,, 11 +L(ret_vec_x1): + /* This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + subq %rcx, %rax + VZEROUPPER_RETURN + .p2align 4,, 10 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - VZEROUPPER_RETURN + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - .p2align 4 -L(zero): - xorl %eax, %eax - VZEROUPPER_RETURN + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) + +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - .p2align 4 -L(null): + /* First in aligning bytes. 
*/ +L(zero_2): xorl %eax, %eax ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx + .p2align 4,, 4 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - vpcmpeqb (%rdi), %ymm0, %ymm1 - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx + .p2align 4,, 11 +L(ret_vec_x2): + /* ecx must be non-zero. */ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - vpmovmskb %ymm1, %eax + .p2align 4,, 14 +L(ret_vec_x3): + /* ecx must be non-zero. */ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Check for zero length. */ - testl %edx, %edx - jz L(null) + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) + testl %ecx, %ecx + jnz L(ret_vec_x3) - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + /* Check if near end before re-aligning (otherwise might do an + unnecessary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + /* Align rax to (VEC_SIZE * 4 - 1). */ + orq $(VEC_SIZE * 4 - 1), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + orq $(VEC_SIZE * 4 - 1), %rdx - /* Check the last VEC. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + .p2align 4 +L(loop_4x_vec): + /* Need this comparison next no matter what. */ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 - movl $1, %edx - sall %cl, %edx - subl $1, %edx + vpor %ymm1, %ymm2, %ymm2 + vpor %ymm3, %ymm4, %ymm4 + vpor %ymm2, %ymm4, %ymm4 + vpmovmskb %ymm4, %esi - andl %edx, %eax - testl %eax, %eax - jz L(zero) + testl %esi, %esi + jnz L(loop_end) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - VZEROUPPER_RETURN + addq $(VEC_SIZE * -4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) - .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx + subl %edi, %edx + incl %edx - /* Check the last VEC. */ - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 +L(last_4x_vec): + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - vpmovmskb %ymm1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_end) - /* Remove the trailing bytes. */ - andl %edx, %eax + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) - testl %eax, %eax - jnz L(last_vec_x1) + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - /* Check the second last VEC. 
*/ - vpcmpeqb (%rdi), %ymm0, %ymm1 + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) + + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret0) + xorl %eax, %eax +L(ret0): + ret - movl %r8d, %ecx - vpmovmskb %ymm1, %eax + .p2align 4 +L(loop_end): + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vpmovmskb %ymm2, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + vpmovmskb %ymm3, %ecx + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. */ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + .p2align 4,, 4 +L(ret_vec_x1_end): + /* 64-bit version will automatically add 32 (VEC_SIZE). */ + lzcntq %rcx, %rcx + subq %rcx, %rax + VZEROUPPER_RETURN - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 4 +L(ret_vec_x0_end): + lzcntl %ecx, %ecx + subq %rcx, %rax VZEROUPPER_RETURN -END (MEMRCHR) + + /* 2 bytes until next cache line. */ +END(MEMRCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
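One more trick from L(loop_end) worth spelling out: the masks of the last two vectors are merged so that a single 64-bit bsr picks the right-most match across both. A C model of the idea (illustration only; hi_mask is the mask of the vector covering the higher 32 bytes, and at least one bit must be set):

```
#include <stdint.h>

/* Model of `salq $32, %rcx; orq %rsi, %rcx; bsrq %rcx, %rcx`: putting
   the higher-address vector's mask in the top half means bsr
   automatically prefers it, which is exactly memrchr's "last
   occurrence" rule.  Undefined if both masks are zero (as is bsr).  */
static int
combined_bsr (uint32_t hi_mask, uint32_t lo_mask)
{
  uint64_t both = ((uint64_t) hi_mask << 32) | lo_mask;
  return 63 - __builtin_clzll (both);   /* bsrq equivalent.  */
}
```

The returned bit index is an offset from the base of the lower vector, matching the final `leaq (VEC_SIZE * -4 + 1)(%rcx, %rax)` address computation in the patch.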
* [PATCH v5 7/8] x86: Shrink code size of memchr-avx2.S 2022-06-07 4:05 ` [PATCH v5 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (4 preceding siblings ...) 2022-06-07 4:05 ` [PATCH v5 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein @ 2022-06-07 4:05 ` Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 6 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:05 UTC (permalink / raw) To: libc-alpha This is not meant as a performance optimization. The previous code was far too liberal in aligning targets and wasted code size unnecessarily. The total code size saving is: 59 bytes There are no major changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 0.967 Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memchr-avx2.S | 109 +++++++++++---------- 2 files changed, 60 insertions(+), 50 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S index 87b076c7c4..c4d71938c5 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMCHR __memchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index 75bd7262e0..28a01280ec 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -57,7 +57,7 @@ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 5) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ # ifdef __ILP32__ @@ -87,12 +87,14 @@ ENTRY (MEMCHR) # endif testl %eax, %eax jz L(aligned_more) - tzcntl %eax, %eax + bsfl %eax, %eax addq %rdi, %rax - VZEROUPPER_RETURN +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + # ifndef USE_AS_RAWMEMCHR - .p2align 5 + .p2align 4 L(first_vec_x0): /* Check if first match was before length. */ tzcntl %eax, %eax @@ -100,58 +102,31 @@ L(first_vec_x0): /* NB: Multiply length by 4 to get byte count. */ sall $2, %edx # endif - xorl %ecx, %ecx + COND_VZEROUPPER + /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch + block. Branching here as opposed to using cmovcc is not that costly. + Common usage of memchr is to check if the return was NULL (if the + string was known to contain CHAR the user would use rawmemchr). This + branch will be highly correlated with the user branch and can be used + by most modern branch predictors to predict the user branch. */ cmpl %eax, %edx - leaq (%rdi, %rax), %rax - cmovle %rcx, %rax - VZEROUPPER_RETURN - -L(null): - xorl %eax, %eax - ret -# endif - .p2align 4 -L(cross_page_boundary): - /* Save pointer before aligning as its original value is - necessary for computer return address if byte is found or - adjusting length if it is not and this is memchr. */ - movq %rdi, %rcx - /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr - and rdi for rawmemchr. */ - orq $(VEC_SIZE - 1), %ALGN_PTR_REG - VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax -# ifndef USE_AS_RAWMEMCHR - /* Calculate length until end of page (length checked for a - match). */ - leaq 1(%ALGN_PTR_REG), %rsi - subq %RRAW_PTR_REG, %rsi - # ifdef USE_AS_WMEMCHR - /* NB: Divide bytes by 4 to get wchar_t count. 
*/ - shrl $2, %esi -# endif -# endif - /* Remove the leading bytes. */ - sarxl %ERAW_PTR_REG, %eax, %eax -# ifndef USE_AS_RAWMEMCHR - /* Check the end of data. */ - cmpq %rsi, %rdx - jbe L(first_vec_x0) + jle L(null) + addq %rdi, %rax + ret # endif - testl %eax, %eax - jz L(cross_page_continue) - tzcntl %eax, %eax - addq %RRAW_PTR_REG, %rax -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 + .p2align 4,, 10 L(first_vec_x1): - tzcntl %eax, %eax + bsfl %eax, %eax incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - +# ifndef USE_AS_RAWMEMCHR + /* First in aligning bytes here. */ +L(null): + xorl %eax, %eax + ret +# endif .p2align 4 L(first_vec_x2): tzcntl %eax, %eax @@ -340,7 +315,7 @@ L(first_vec_x1_check): incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - .p2align 4 + .p2align 4,, 6 L(set_zero_end): xorl %eax, %eax VZEROUPPER_RETURN @@ -428,5 +403,39 @@ L(last_vec_x3): VZEROUPPER_RETURN # endif + .p2align 4 +L(cross_page_boundary): + /* Save pointer before aligning as its original value is necessary for + computing the return address if the byte is found, or adjusting the + length if it is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for + rawmemchr. */ + andq $-VEC_SIZE, %ALGN_PTR_REG + VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Calculate length until end of page (length checked for a match). */ + leal VEC_SIZE(%ALGN_PTR_REG), %esi + subl %ERAW_PTR_REG, %esi + # ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %esi + # endif +# endif + /* Remove the leading bytes. */ + sarxl %ERAW_PTR_REG, %eax, %eax +# ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ + cmpq %rsi, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(cross_page_continue) + bsfl %eax, %eax + addq %RRAW_PTR_REG, %rax + VZEROUPPER_RETURN + + END (MEMCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
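The cmovcc-to-branch change in L(first_vec_x0) above is subtle enough to deserve a scalar restatement. A C model of the new return path (a sketch only; mask stands in for the non-zero pmovmskb result of the first vector):

```
#include <stddef.h>

/* Scalar model of the rewritten L(first_vec_x0): the bounds check is
   now a branch rather than cmovle.  Since most memchr callers branch on
   the NULL-ness of the result themselves, this branch tends to be
   predicted together with the caller's, whereas cmov always pays its
   data dependency.  */
static const char *
first_vec_x0_model (const char *buf, unsigned int len, unsigned int mask)
{
  unsigned int pos = __builtin_ctz (mask);  /* tzcntl / bsfl */
  if (pos >= len)       /* cmpl %eax, %edx; jle L(null) */
    return NULL;
  return buf + pos;     /* addq %rdi, %rax */
}
```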
* [PATCH v5 8/8] x86: Shrink code size of memchr-evex.S
  2022-06-07 4:05 ` [PATCH v5 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein
  ` (5 preceding siblings ...)
  2022-06-07 4:05 ` [PATCH v5 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein
@ 2022-06-07 4:05 ` Noah Goldstein
  6 siblings, 0 replies; 82+ messages in thread
From: Noah Goldstein @ 2022-06-07 4:05 UTC (permalink / raw)
To: libc-alpha

This is not meant as a performance optimization. The previous code was
far too liberal in aligning targets and wasted code size unnecessarily.

The total code size saving is: 64 bytes

There are no non-negligible changes in the benchmarks.
Geometric Mean of all benchmarks New / Old: 1.000

Full xcheck passes on x86_64.
---
 sysdeps/x86_64/multiarch/memchr-evex.S | 46 ++++++++++++++------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index cfaf02907d..0fd11b7632 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -88,7 +88,7 @@
 # define PAGE_SIZE 4096

 	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 6)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
@@ -131,22 +131,24 @@ L(zero):
 	xorl	%eax, %eax
 	ret

-	.p2align 5
+	.p2align 4
 L(first_vec_x0):
-	/* Check if first match was before length.  */
-	tzcntl	%eax, %eax
-	xorl	%ecx, %ecx
-	cmpl	%eax, %edx
-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-	cmovle	%rcx, %rax
+	/* Check if first match was before length. NB: tzcnt has false data-
+	   dependency on destination. eax already had a data-dependency on esi
+	   so this should have no effect here.  */
+	tzcntl	%eax, %esi
+# ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rsi, CHAR_SIZE), %rdi
+# else
+	addq	%rsi, %rdi
+# endif
+	xorl	%eax, %eax
+	cmpl	%esi, %edx
+	cmovg	%rdi, %rax
 	ret
-# else
-	/* NB: first_vec_x0 is 17 bytes which will leave
-	   cross_page_boundary (which is relatively cold) close enough
-	   to ideal alignment. So only realign L(cross_page_boundary) if
-	   rawmemchr.  */
-	.p2align 4
 # endif
+
+	.p2align 4
 L(cross_page_boundary):
 	/* Save pointer before aligning as its original value is
 	   necessary for computer return address if byte is found or
@@ -400,10 +402,14 @@ L(last_2x_vec):
 L(zero_end):
 	ret

+L(set_zero_end):
+	xorl	%eax, %eax
+	ret

 	.p2align 4
 L(first_vec_x1_check):
-	tzcntl	%eax, %eax
+	/* eax must be non-zero. Use bsfl to save code size.  */
+	bsfl	%eax, %eax
 	/* Adjust length.  */
 	subl	$-(CHAR_PER_VEC * 4), %edx
 	/* Check if match within remaining length.  */
@@ -412,9 +418,6 @@ L(first_vec_x1_check):
 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
 	ret
-L(set_zero_end):
-	xorl	%eax, %eax
-	ret

 	.p2align 4
 L(loop_4x_vec_end):
@@ -464,7 +467,7 @@ L(loop_4x_vec_end):
 # endif
 	ret

-	.p2align 4
+	.p2align 4,, 10
 L(last_vec_x1_return):
 	tzcntl	%eax, %eax
 # if defined USE_AS_WMEMCHR || RET_OFFSET != 0
@@ -496,6 +499,7 @@ L(last_vec_x3_return):
 # endif

 # ifndef USE_AS_RAWMEMCHR
+	.p2align 4,, 5
 L(last_4x_vec_or_less_cmpeq):
 	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
 	kmovd	%k0, %eax
@@ -546,7 +550,7 @@ L(last_4x_vec):
 # endif
 	andl	%ecx, %eax
 	jz	L(zero_end2)
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 L(zero_end2):
 	ret
@@ -562,6 +566,6 @@ L(last_vec_x3):
 	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 # endif
-
+	/* 7 bytes from next cache line.  */
 END (MEMCHR)
 #endif
--
2.34.1

^ permalink raw reply	[flat|nested] 82+ messages in thread
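Both memchr patches above substitute `bsf` for `tzcnt` on paths where the mask is already known to be non-zero. The two instructions agree for non-zero input, but `tzcnt` is encoded as `rep bsf` (one byte longer) and `bsf` leaves its destination undefined for a zero source, which is why the substitution is only done behind a zero check. A hedged C sketch of that contract, using a compiler builtin with the same "undefined on zero" restriction (illustrative only, not glibc code):

```c
/* bsf may replace tzcnt only where mask != 0 has been established,
   e.g. by a preceding testl/jz as in the assembly above.  */
#include <assert.h>

static int
first_match_index (unsigned int mask)
{
  assert (mask != 0);		/* mirrors the jz guard in the asm */
  return __builtin_ctz (mask);	/* lowest set bit, like bsfl */
}

int main (void)
{
  assert (first_match_index (0x8) == 3);
  assert (first_match_index (0x30) == 4);
  return 0;
}
```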
* [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library 2022-06-03 4:42 ` [PATCH v1 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein ` (3 preceding siblings ...) 2022-06-07 4:05 ` [PATCH v5 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein @ 2022-06-07 4:11 ` Noah Goldstein 2022-06-07 4:11 ` [PATCH v6 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein ` (7 more replies) 4 siblings, 8 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:11 UTC (permalink / raw) To: libc-alpha This patch does not touch any existing code and is only meant to be a tool for future patches so that simple source files can more easily be maintained to target multiple VEC classes. There is no difference in the objdump of libc.so before and after this patch. --- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 34 ++++++++ sysdeps/x86_64/multiarch/avx-vecs.h | 47 +++++++++++ sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 +++++++++ sysdeps/x86_64/multiarch/evex256-vecs.h | 35 ++++++++ sysdeps/x86_64/multiarch/evex512-vecs.h | 35 ++++++++ sysdeps/x86_64/multiarch/sse2-vecs.h | 47 +++++++++++ sysdeps/x86_64/multiarch/vec-macros.h | 90 +++++++++++++++++++++ 7 files changed, 327 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h new file mode 100644 index 0000000000..3f531dd47f --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -0,0 +1,34 @@ +/* Common config for AVX-RTM VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX_RTM_VECS_H +#define _AVX_RTM_VECS_H 1 + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define USE_WITH_RTM 1 +#include "avx-vecs.h" + +#undef SECTION +#define SECTION(p) p##.avx.rtm + +#endif diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h new file mode 100644 index 0000000000..89680f5db8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-vecs.h @@ -0,0 +1,47 @@ +/* Common config for AVX VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _AVX_VECS_H +#define _AVX_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 32 +#include "vec-macros.h" + +#define USE_WITH_AVX 1 +#define SECTION(p) p##.avx + +/* 4-byte mov instructions with AVX2. */ +#define MOV_SIZE 4 +/* 1 (ret) + 3 (vzeroupper). */ +#define RET_SIZE 4 +#define VZEROUPPER vzeroupper + +#define VMOVU vmovdqu +#define VMOVA vmovdqa +#define VMOVNT vmovntdq + +/* Often need to access xmm portion. */ +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h new file mode 100644 index 0000000000..99806ebcd7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex-vecs-common.h @@ -0,0 +1,39 @@ +/* Common config for EVEX256 and EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX_VECS_COMMON_H +#define _EVEX_VECS_COMMON_H 1 + +#include "vec-macros.h" + +/* 6-byte mov instructions with EVEX. */ +#define MOV_SIZE 6 +/* No vzeroupper needed. */ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU vmovdqu64 +#define VMOVA vmovdqa64 +#define VMOVNT vmovntdq + +#define VEC_xmm VEC_hi_xmm +#define VEC_ymm VEC_hi_ymm +#define VEC_zmm VEC_hi_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h new file mode 100644 index 0000000000..222ba46dc7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h @@ -0,0 +1,35 @@ +/* Common config for EVEX256 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX256_VECS_H +#define _EVEX256_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 32 +#include "evex-vecs-common.h" + +#define USE_WITH_EVEX256 1 +#define SECTION(p) p##.evex + +#define VEC VEC_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h new file mode 100644 index 0000000000..d1784d5368 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h @@ -0,0 +1,35 @@ +/* Common config for EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _EVEX512_VECS_H +#define _EVEX512_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 64 +#include "evex-vecs-common.h" + +#define USE_WITH_EVEX512 1 +#define SECTION(p) p##.evex512 + +#define VEC VEC_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h new file mode 100644 index 0000000000..2b77a59d56 --- /dev/null +++ b/sysdeps/x86_64/multiarch/sse2-vecs.h @@ -0,0 +1,47 @@ +/* Common config for SSE2 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _SSE2_VECS_H +#define _SSE2_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 16 +#include "vec-macros.h" + +#define USE_WITH_SSE2 1 +#define SECTION(p) p + +/* 3-byte mov instructions with SSE2. */ +#define MOV_SIZE 3 +/* No vzeroupper needed. 
*/ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU movups +#define VMOVA movaps +#define VMOVNT movntdq + +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_xmm + + +#endif diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h new file mode 100644 index 0000000000..9f3ffecede --- /dev/null +++ b/sysdeps/x86_64/multiarch/vec-macros.h @@ -0,0 +1,90 @@ +/* Macro helpers for VEC_{type}({vec_num}) + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _VEC_MACROS_H +#define _VEC_MACROS_H 1 + +#ifndef VEC_SIZE +# error "Never include this file directly. Always include a vector config." +#endif + +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same + VEC(N) values. */ +#define VEC_hi_xmm0 xmm16 +#define VEC_hi_xmm1 xmm17 +#define VEC_hi_xmm2 xmm18 +#define VEC_hi_xmm3 xmm19 +#define VEC_hi_xmm4 xmm20 +#define VEC_hi_xmm5 xmm21 +#define VEC_hi_xmm6 xmm22 +#define VEC_hi_xmm7 xmm23 +#define VEC_hi_xmm8 xmm24 +#define VEC_hi_xmm9 xmm25 +#define VEC_hi_xmm10 xmm26 +#define VEC_hi_xmm11 xmm27 +#define VEC_hi_xmm12 xmm28 +#define VEC_hi_xmm13 xmm29 +#define VEC_hi_xmm14 xmm30 +#define VEC_hi_xmm15 xmm31 + +#define VEC_hi_ymm0 ymm16 +#define VEC_hi_ymm1 ymm17 +#define VEC_hi_ymm2 ymm18 +#define VEC_hi_ymm3 ymm19 +#define VEC_hi_ymm4 ymm20 +#define VEC_hi_ymm5 ymm21 +#define VEC_hi_ymm6 ymm22 +#define VEC_hi_ymm7 ymm23 +#define VEC_hi_ymm8 ymm24 +#define VEC_hi_ymm9 ymm25 +#define VEC_hi_ymm10 ymm26 +#define VEC_hi_ymm11 ymm27 +#define VEC_hi_ymm12 ymm28 +#define VEC_hi_ymm13 ymm29 +#define VEC_hi_ymm14 ymm30 +#define VEC_hi_ymm15 ymm31 + +#define VEC_hi_zmm0 zmm16 +#define VEC_hi_zmm1 zmm17 +#define VEC_hi_zmm2 zmm18 +#define VEC_hi_zmm3 zmm19 +#define VEC_hi_zmm4 zmm20 +#define VEC_hi_zmm5 zmm21 +#define VEC_hi_zmm6 zmm22 +#define VEC_hi_zmm7 zmm23 +#define VEC_hi_zmm8 zmm24 +#define VEC_hi_zmm9 zmm25 +#define VEC_hi_zmm10 zmm26 +#define VEC_hi_zmm11 zmm27 +#define VEC_hi_zmm12 zmm28 +#define VEC_hi_zmm13 zmm29 +#define VEC_hi_zmm14 zmm30 +#define VEC_hi_zmm15 zmm31 + +#define PRIMITIVE_VEC(vec, num) vec##num + +#define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) +#define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) +#define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) + +#define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) +#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) +#define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) + +#endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
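The token-pasting in vec-macros.h is easiest to see by expanding it by hand. The standalone program below copies just enough of the definitions from vec-macros.h and the config headers to print what a `VEC(N)` use becomes under two configs; it is a demonstration only, since real code includes one of the *-vecs.h headers rather than redefining these:

```c
#include <stdio.h>

/* Copied from vec-macros.h / the config headers, trimmed to one
   register for illustration.  */
#define PRIMITIVE_VEC(vec, num) vec##num
#define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i)
#define VEC_hi_ymm1 ymm17
#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i)

#define STR_(x) #x
#define STR(x) STR_(x)

int main (void)
{
  /* avx-vecs.h sets VEC to VEC_any_ymm: plain VEX registers.  */
  puts (STR (VEC_any_ymm (1)));	/* prints "ymm1" */
  /* evex-vecs-common.h sets VEC_ymm to VEC_hi_ymm: the EVEX-only
     high registers, so no vzeroupper is needed.  */
  puts (STR (VEC_hi_ymm (1)));	/* prints "ymm17" */
  return 0;
}
```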
* [PATCH v6 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret`
  2022-06-07 4:11 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein
@ 2022-06-07 4:11 ` Noah Goldstein
  2022-06-07 4:11 ` [PATCH v6 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein
  ` (6 subsequent siblings)
  7 siblings, 0 replies; 82+ messages in thread
From: Noah Goldstein @ 2022-06-07 4:11 UTC (permalink / raw)
To: libc-alpha

The RTM vzeroupper mitigation has no way of replacing an inline
vzeroupper that is not directly before a return.

The new COND_VZEROUPPER can be useful when hoisting a vzeroupper to
save code size, for example:

```
L(foo):
	cmpl	%eax, %edx
	jz	L(bar)
	tzcntl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

L(bar):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
```

Can become:

```
L(foo):
	COND_VZEROUPPER
	cmpl	%eax, %edx
	jz	L(bar)
	tzcntl	%eax, %eax
	addq	%rdi, %rax
	ret

L(bar):
	xorl	%eax, %eax
	ret
```

This code does not change any existing functionality.

There is no difference in the objdump of libc.so before and after this
patch.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/multiarch/avx-rtm-vecs.h |  1 +
 sysdeps/x86_64/sysdep.h                 | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
index 3f531dd47f..6ca9f5e6ba 100644
--- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
+++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
@@ -20,6 +20,7 @@
 #ifndef _AVX_RTM_VECS_H
 #define _AVX_RTM_VECS_H 1

+#define COND_VZEROUPPER COND_VZEROUPPER_XTEST
 #define ZERO_UPPER_VEC_REGISTERS_RETURN \
 	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST

diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index f14d50786d..4f512d5566 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -106,6 +106,24 @@ lose: \
 	vzeroupper; \
 	ret

+/* Can be used to replace vzeroupper that is not directly before a
+   return.  This is useful when hoisting a vzeroupper from multiple
+   return paths to decrease the total number of vzerouppers and code
+   size.  */
+#define COND_VZEROUPPER_XTEST \
+    xtest; \
+    jz 1f; \
+    vzeroall; \
+    jmp 2f; \
+1: \
+    vzeroupper; \
+2:
+
+/* In RTM define this as COND_VZEROUPPER_XTEST.  */
+#ifndef COND_VZEROUPPER
+# define COND_VZEROUPPER vzeroupper
+#endif
+
 /* Zero upper vector registers and return.  */
 #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
 # define ZERO_UPPER_VEC_REGISTERS_RETURN \
--
2.34.1

^ permalink raw reply	[flat|nested] 82+ messages in thread
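COND_VZEROUPPER_XTEST is defined in assembly, but its control flow can be rendered as a C sketch with the RTM/AVX intrinsics. This is illustrative only, not glibc code: it assumes the GCC/Clang intrinsics `_xtest`, `_mm256_zeroall` and `_mm256_zeroupper` (compile with -mavx -mrtm), and actually executing `xtest` requires RTM-capable hardware:

```c
#include <immintrin.h>

/* C rendition of COND_VZEROUPPER_XTEST: inside a transaction use
   vzeroall (as the existing *_XTEST return macros already do),
   otherwise the ordinary vzeroupper.  */
static inline void
cond_vzeroupper_xtest (void)
{
  if (_xtest ())		/* xtest; ZF clear in a transaction */
    _mm256_zeroall ();		/* vzeroall */
  else
    _mm256_zeroupper ();	/* 1: vzeroupper */
}
```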
* [PATCH v6 3/8] Benchtests: Improve memrchr benchmarks 2022-06-07 4:11 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-07 4:11 ` [PATCH v6 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein @ 2022-06-07 4:11 ` Noah Goldstein 2022-06-07 18:03 ` H.J. Lu 2022-06-07 4:11 ` [PATCH v6 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein ` (5 subsequent siblings) 7 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:11 UTC (permalink / raw) To: libc-alpha Add a second iteration for memrchr to set `pos` starting from the end of the buffer. Previously `pos` was only set relative to the begining of the buffer. This isn't really useful for memrchr because the begining of the search space is (buf + len). --- benchtests/bench-memchr.c | 110 ++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 45 deletions(-) diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c index 4d7212332f..0facda2fa0 100644 --- a/benchtests/bench-memchr.c +++ b/benchtests/bench-memchr.c @@ -76,7 +76,7 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c, static void do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, - int seek_char) + int seek_char, int invert_pos) { size_t i; @@ -96,7 +96,10 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, if (pos < len) { - buf[align + pos] = seek_char; + if (invert_pos) + buf[align + len - pos] = seek_char; + else + buf[align + pos] = seek_char; buf[align + len] = -seek_char; } else @@ -109,6 +112,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, json_attr_uint (json_ctx, "pos", pos); json_attr_uint (json_ctx, "len", len); json_attr_uint (json_ctx, "seek_char", seek_char); + json_attr_uint (json_ctx, "invert_pos", invert_pos); json_array_begin (json_ctx, "timings"); @@ -123,6 +127,7 @@ int test_main (void) { size_t i; + int repeats; json_ctx_t json_ctx; test_init (); @@ -142,53 +147,68 @@ test_main (void) json_array_begin (&json_ctx, "results"); - for (i = 1; i < 8; ++i) + for (repeats = 0; repeats < 2; ++repeats) { - do_test (&json_ctx, 0, 16 << i, 2048, 23); - do_test (&json_ctx, i, 64, 256, 23); - do_test (&json_ctx, 0, 16 << i, 2048, 0); - do_test (&json_ctx, i, 64, 256, 0); - - do_test (&json_ctx, getpagesize () - 15, 64, 256, 0); + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats); + do_test (&json_ctx, i, 64, 256, 23, repeats); + do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats); + do_test (&json_ctx, i, 64, 256, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats); #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. */ - do_test (&json_ctx, 0, i, 256, 23); - do_test (&json_ctx, 0, i, 256, 0); - do_test (&json_ctx, i, i, 256, 23); - do_test (&json_ctx, i, i, 256, 0); + /* Also test the position close to the beginning for memrchr. 
*/ + do_test (&json_ctx, 0, i, 256, 23, repeats); + do_test (&json_ctx, 0, i, 256, 0, repeats); + do_test (&json_ctx, i, i, 256, 23, repeats); + do_test (&json_ctx, i, i, 256, 0, repeats); #endif - } - for (i = 1; i < 8; ++i) - { - do_test (&json_ctx, i, i << 5, 192, 23); - do_test (&json_ctx, i, i << 5, 192, 0); - do_test (&json_ctx, i, i << 5, 256, 23); - do_test (&json_ctx, i, i << 5, 256, 0); - do_test (&json_ctx, i, i << 5, 512, 23); - do_test (&json_ctx, i, i << 5, 512, 0); - - do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23); - } - for (i = 1; i < 32; ++i) - { - do_test (&json_ctx, 0, i, i + 1, 23); - do_test (&json_ctx, 0, i, i + 1, 0); - do_test (&json_ctx, i, i, i + 1, 23); - do_test (&json_ctx, i, i, i + 1, 0); - do_test (&json_ctx, 0, i, i - 1, 23); - do_test (&json_ctx, 0, i, i - 1, 0); - do_test (&json_ctx, i, i, i - 1, 23); - do_test (&json_ctx, i, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0); + } + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, i, i << 5, 192, 23, repeats); + do_test (&json_ctx, i, i << 5, 192, 0, repeats); + do_test (&json_ctx, i, i << 5, 256, 23, repeats); + do_test (&json_ctx, i, i << 5, 256, 0, repeats); + do_test (&json_ctx, i, i << 5, 512, 23, repeats); + do_test (&json_ctx, i, i << 5, 512, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats); + } + for (i = 1; i < 32; ++i) + { + do_test (&json_ctx, 0, i, i + 1, 23, repeats); + do_test (&json_ctx, 0, i, i + 1, 0, repeats); + do_test (&json_ctx, i, i, i + 1, 23, repeats); + do_test (&json_ctx, i, i, i + 1, 0, repeats); + do_test (&json_ctx, 0, i, i - 1, 23, repeats); + do_test (&json_ctx, 0, i, i - 1, 0, repeats); + do_test (&json_ctx, i, i, i - 1, 23, repeats); + do_test (&json_ctx, i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0, repeats); + #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. */ - do_test (&json_ctx, 0, 1, i + 1, 23); - do_test (&json_ctx, 0, 2, i + 1, 0); + do_test (&json_ctx, 0, 1, i + 1, 23, repeats); + do_test (&json_ctx, 0, 2, i + 1, 0, repeats); +#endif + } +#ifndef USE_AS_MEMRCHR + break; #endif } -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
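As a sanity check of what the new `invert_pos` parameter measures, here is a minimal standalone model of the benchmark's buffer setup. The buffer length, position and seek character are arbitrary values chosen for the demo, not the harness's parameters:

```c
/* invert_pos == 0 places the match `pos' bytes from the start;
   invert_pos == 1 places it `pos' bytes from the end, which is where
   memrchr's search actually begins.  memrchr is a GNU extension.  */
#define _GNU_SOURCE
#include <assert.h>
#include <string.h>

int main (void)
{
  char buf[256];
  size_t len = sizeof buf, pos = 5;
  int seek_char = 23;

  memset (buf, ~seek_char, len);
  buf[pos] = seek_char;			/* invert_pos == 0 */
  assert (memrchr (buf, seek_char, len) == buf + pos);

  buf[pos] = ~seek_char;
  buf[len - pos] = seek_char;		/* invert_pos == 1 */
  assert (memrchr (buf, seek_char, len) == buf + len - pos);
  return 0;
}
```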
* Re: [PATCH v6 3/8] Benchtests: Improve memrchr benchmarks 2022-06-07 4:11 ` [PATCH v6 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein @ 2022-06-07 18:03 ` H.J. Lu 0 siblings, 0 replies; 82+ messages in thread From: H.J. Lu @ 2022-06-07 18:03 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Add a second iteration for memrchr to set `pos` starting from the end > of the buffer. > > Previously `pos` was only set relative to the begining of the > buffer. This isn't really useful for memrchr because the begining > of the search space is (buf + len). > --- > benchtests/bench-memchr.c | 110 ++++++++++++++++++++++---------------- > 1 file changed, 65 insertions(+), 45 deletions(-) > > diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c > index 4d7212332f..0facda2fa0 100644 > --- a/benchtests/bench-memchr.c > +++ b/benchtests/bench-memchr.c > @@ -76,7 +76,7 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c, > > static void > do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, > - int seek_char) > + int seek_char, int invert_pos) > { > size_t i; > > @@ -96,7 +96,10 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, > > if (pos < len) > { > - buf[align + pos] = seek_char; > + if (invert_pos) > + buf[align + len - pos] = seek_char; > + else > + buf[align + pos] = seek_char; > buf[align + len] = -seek_char; > } > else > @@ -109,6 +112,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, > json_attr_uint (json_ctx, "pos", pos); > json_attr_uint (json_ctx, "len", len); > json_attr_uint (json_ctx, "seek_char", seek_char); > + json_attr_uint (json_ctx, "invert_pos", invert_pos); > > json_array_begin (json_ctx, "timings"); > > @@ -123,6 +127,7 @@ int > test_main (void) > { > size_t i; > + int repeats; > json_ctx_t json_ctx; > test_init (); > > @@ -142,53 +147,68 @@ test_main (void) > > json_array_begin (&json_ctx, "results"); > > - for (i = 1; i < 8; ++i) > + for (repeats = 0; repeats < 2; ++repeats) > { > - do_test (&json_ctx, 0, 16 << i, 2048, 23); > - do_test (&json_ctx, i, 64, 256, 23); > - do_test (&json_ctx, 0, 16 << i, 2048, 0); > - do_test (&json_ctx, i, 64, 256, 0); > - > - do_test (&json_ctx, getpagesize () - 15, 64, 256, 0); > + for (i = 1; i < 8; ++i) > + { > + do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats); > + do_test (&json_ctx, i, 64, 256, 23, repeats); > + do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats); > + do_test (&json_ctx, i, 64, 256, 0, repeats); > + > + do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats); > #ifdef USE_AS_MEMRCHR > - /* Also test the position close to the beginning for memrchr. */ > - do_test (&json_ctx, 0, i, 256, 23); > - do_test (&json_ctx, 0, i, 256, 0); > - do_test (&json_ctx, i, i, 256, 23); > - do_test (&json_ctx, i, i, 256, 0); > + /* Also test the position close to the beginning for memrchr. 
*/ > + do_test (&json_ctx, 0, i, 256, 23, repeats); > + do_test (&json_ctx, 0, i, 256, 0, repeats); > + do_test (&json_ctx, i, i, 256, 23, repeats); > + do_test (&json_ctx, i, i, 256, 0, repeats); > #endif > - } > - for (i = 1; i < 8; ++i) > - { > - do_test (&json_ctx, i, i << 5, 192, 23); > - do_test (&json_ctx, i, i << 5, 192, 0); > - do_test (&json_ctx, i, i << 5, 256, 23); > - do_test (&json_ctx, i, i << 5, 256, 0); > - do_test (&json_ctx, i, i << 5, 512, 23); > - do_test (&json_ctx, i, i << 5, 512, 0); > - > - do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23); > - } > - for (i = 1; i < 32; ++i) > - { > - do_test (&json_ctx, 0, i, i + 1, 23); > - do_test (&json_ctx, 0, i, i + 1, 0); > - do_test (&json_ctx, i, i, i + 1, 23); > - do_test (&json_ctx, i, i, i + 1, 0); > - do_test (&json_ctx, 0, i, i - 1, 23); > - do_test (&json_ctx, 0, i, i - 1, 0); > - do_test (&json_ctx, i, i, i - 1, 23); > - do_test (&json_ctx, i, i, i - 1, 0); > - > - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23); > - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0); > - > - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23); > - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0); > + } > + for (i = 1; i < 8; ++i) > + { > + do_test (&json_ctx, i, i << 5, 192, 23, repeats); > + do_test (&json_ctx, i, i << 5, 192, 0, repeats); > + do_test (&json_ctx, i, i << 5, 256, 23, repeats); > + do_test (&json_ctx, i, i << 5, 256, 0, repeats); > + do_test (&json_ctx, i, i << 5, 512, 23, repeats); > + do_test (&json_ctx, i, i << 5, 512, 0, repeats); > + > + do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats); > + } > + for (i = 1; i < 32; ++i) > + { > + do_test (&json_ctx, 0, i, i + 1, 23, repeats); > + do_test (&json_ctx, 0, i, i + 1, 0, repeats); > + do_test (&json_ctx, i, i, i + 1, 23, repeats); > + do_test (&json_ctx, i, i, i + 1, 0, repeats); > + do_test (&json_ctx, 0, i, i - 1, 23, repeats); > + do_test (&json_ctx, 0, i, i - 1, 0, repeats); > + do_test (&json_ctx, i, i, i - 1, 23, repeats); > + do_test (&json_ctx, i, i, i - 1, 0, repeats); > + > + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 23, repeats); > + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 0, repeats); > + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 23, repeats); > + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 0, repeats); > + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 23, repeats); > + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 0, repeats); > + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 23, repeats); > + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 0, repeats); > + > + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23, repeats); > + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0, repeats); > + > + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23, repeats); > + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0, repeats); > + > #ifdef USE_AS_MEMRCHR > - /* Also test the position close to the beginning for memrchr. */ > - do_test (&json_ctx, 0, 1, i + 1, 23); > - do_test (&json_ctx, 0, 2, i + 1, 0); > + do_test (&json_ctx, 0, 1, i + 1, 23, repeats); > + do_test (&json_ctx, 0, 2, i + 1, 0, repeats); > +#endif > + } > +#ifndef USE_AS_MEMRCHR > + break; > #endif > } > > -- > 2.34.1 > Please change begining to beginning in commit log. Otherwise, it is OK. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH v6 4/8] x86: Optimize memrchr-sse2.S 2022-06-07 4:11 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-07 4:11 ` [PATCH v6 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein 2022-06-07 4:11 ` [PATCH v6 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein @ 2022-06-07 4:11 ` Noah Goldstein 2022-06-07 18:04 ` H.J. Lu 2022-06-07 4:11 ` [PATCH v6 5/8] x86: Optimize memrchr-evex.S Noah Goldstein ` (4 subsequent siblings) 7 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:11 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller lengths more. 2. optimizes target placement more carefully. 3. reuses logic more. 4. fixes up various inefficiencies in the logic. The total code size saving is: 394 bytes Geometric Mean of all benchmarks New / Old: 0.874 Regressions: 1. The page cross case is now colder, especially re-entry from the page cross case if a match is not found in the first VEC (roughly 50%). My general opinion with this patch is this is acceptable given the "coldness" of this case (less than 4%) and generally performance improvement in the other far more common cases. 2. There are some regressions 5-15% for medium/large user-arg lengths that have a match in the first VEC. This is because the logic was rewritten to optimize finds in the first VEC if the user-arg length is shorter (where we see roughly 20-50% performance improvements). It is not always the case this is a regression. My intuition is some frontend quirk is partially explaining the data although I haven't been able to find the root cause. Full xcheck passes on x86_64. --- sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++-------------------- 1 file changed, 292 insertions(+), 321 deletions(-) diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index d1a9f47911..b0dffd2ae2 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -18,362 +18,333 @@ <https://www.gnu.org/licenses/>. */ #include <sysdep.h> +#define VEC_SIZE 16 +#define PAGE_SIZE 4096 .text -ENTRY (__memrchr) - movd %esi, %xmm1 - - sub $16, %RDX_LP - jbe L(length_less16) - - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add %RDX_LP, %RDI_LP - pshufd $0, %xmm1, %xmm1 - - movdqu (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - -/* Check if there is a match. 
*/ - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - mov %edi, %ecx - and $15, %ecx - jz L(loop_prolog) - - add $16, %rdi - add $16, %rdx - and $-16, %rdi - sub %rcx, %rdx - - .p2align 4 -L(loop_prolog): - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches0) - - mov %edi, %ecx - and $63, %ecx - jz L(align64_loop) - - add $64, %rdi - add $64, %rdx - and $-64, %rdi - sub %rcx, %rdx - - .p2align 4 -L(align64_loop): - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm3, %xmm0 - pmaxub %xmm4, %xmm2 - pmaxub %xmm0, %xmm2 - pmovmskb %xmm2, %eax - - test %eax, %eax - jz L(align64_loop) - - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches48) - - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm2 - - pcmpeqb %xmm1, %xmm2 - pcmpeqb (%rdi), %xmm1 - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - pmovmskb %xmm1, %eax - bsr %eax, %eax - - add %rdi, %rax +ENTRY_P2ALIGN(__memrchr, 6) +#ifdef __ILP32__ + /* Clear upper bits. */ + mov %RDX_LP, %RDX_LP +#endif + movd %esi, %xmm0 + + /* Get end pointer. */ + leaq (%rdx, %rdi), %rcx + + punpcklbw %xmm0, %xmm0 + punpcklwd %xmm0, %xmm0 + pshufd $0, %xmm0, %xmm0 + + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %ecx + jz L(page_cross) + + /* NB: This load happens regardless of whether rdx (len) is zero. Since + it doesn't cross a page and the standard gurantees any pointer have + at least one-valid byte this load must be safe. For the entire + history of the x86 memrchr implementation this has been possible so + no code "should" be relying on a zero-length check before this load. + The zero-length check is moved to the page cross case because it is + 1) pretty cold and including it pushes the hot case len <= VEC_SIZE + into 2-cache lines. */ + movups -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is + zero. */ + bsrl %eax, %eax + jz L(ret_0) + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here + if out of bounds. */ + addl %edx, %eax + jl L(zero_0) + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base + ptr. 
*/ + addq %rdi, %rax +L(ret_0): ret - .p2align 4 -L(exit_loop): - add $64, %edx - cmp $32, %edx - jbe L(exit_loop_32) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16_1) - cmp $48, %edx - jbe L(return_null) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches0_1) - xor %eax, %eax + .p2align 4,, 5 +L(ret_vec_x0): + bsrl %eax, %eax + leaq -(VEC_SIZE)(%rcx, %rax), %rax ret - .p2align 4 -L(exit_loop_32): - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48_1) - cmp $16, %edx - jbe L(return_null) - - pcmpeqb 32(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches32_1) - xor %eax, %eax + .p2align 4,, 2 +L(zero_0): + xorl %eax, %eax ret - .p2align 4 -L(matches0): - bsr %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax - ret - .p2align 4 -L(matches32): - bsr %eax, %eax - lea 32(%rax, %rdi), %rax + .p2align 4,, 8 +L(more_1x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) + + /* Align rcx (pointer to string). */ + decq %rcx + andq $-VEC_SIZE, %rcx + + movq %rcx, %rdx + /* NB: We could consistenyl save 1-byte in this pattern with `movaps + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is + it adds more frontend uops (even if the moves can be eliminated) and + some percentage of the time actual backend uops. */ + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + subq %rdi, %rdx + pmovmskb %xmm1, %eax + + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +L(last_2x_vec): + subl $VEC_SIZE, %edx + jbe L(ret_vec_x0_test) + + testl %eax, %eax + jnz L(ret_vec_x0) + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_1) + addl %edx, %eax + jl L(zero_0) + addq %rdi, %rax +L(ret_1): ret - .p2align 4 -L(matches48): - bsr %eax, %eax - lea 48(%rax, %rdi), %rax + /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross) + causes the hot pause (length <= VEC_SIZE) to span multiple cache + lines. Naturally aligned % 16 to 8-bytes. */ +L(page_cross): + /* Zero length check. */ + testq %rdx, %rdx + jz L(zero_0) + + leaq -1(%rcx), %r8 + andq $-(VEC_SIZE), %r8 + + movaps (%r8), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + negl %ecx + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count + explicitly. */ + andl $(VEC_SIZE - 1), %ecx + shl %cl, %esi + movzwl %si, %eax + leaq (%rdi, %rdx), %rcx + cmpq %rdi, %r8 + ja L(more_1x_vec) + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_2) + addl %edx, %eax + jl L(zero_1) + addq %rdi, %rax +L(ret_2): ret - .p2align 4 -L(matches0_1): - bsr %eax, %eax - sub $64, %rdx - add %rax, %rdx - jl L(return_null) - add %rdi, %rax + /* Fits in aliging bytes. 
*/ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(matches16_1): - bsr %eax, %eax - sub $48, %rdx - add %rax, %rdx - jl L(return_null) - lea 16(%rdi, %rax), %rax + .p2align 4,, 5 +L(ret_vec_x1): + bsrl %eax, %eax + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(matches32_1): - bsr %eax, %eax - sub $32, %rdx - add %rax, %rdx - jl L(return_null) - lea 32(%rdi, %rax), %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) - .p2align 4 -L(matches48_1): - bsr %eax, %eax - sub $16, %rdx - add %rax, %rdx - jl L(return_null) - lea 48(%rdi, %rax), %rax - ret + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + testl %eax, %eax + jnz L(ret_vec_x1) - .p2align 4 -L(return_null): - xor %eax, %eax - ret - .p2align 4 -L(length_less16_offset0): - test %edx, %edx - jz L(return_null) + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - mov %dl, %cl - pcmpeqb (%rdi), %xmm1 + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) - mov $1, %edx - sal %cl, %edx - sub $1, %edx + addl $(VEC_SIZE), %edx + jle L(ret_vec_x2_test) - pmovmskb %xmm1, %eax +L(last_vec): + testl %eax, %eax + jnz L(ret_vec_x2) - and %edx, %eax - test %eax, %eax - jz L(return_null) + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - bsr %eax, %eax - add %rdi, %rax + subl $(VEC_SIZE), %edx + bsrl %eax, %eax + jz L(ret_3) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax +L(ret_3): ret - .p2align 4 -L(length_less16): - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add $16, %edx - - pshufd $0, %xmm1, %xmm1 - - mov %edi, %ecx - and $15, %ecx - jz L(length_less16_offset0) - - mov %cl, %dh - mov %ecx, %esi - add %dl, %dh - and $-16, %rdi - - sub $16, %dh - ja L(length_less16_part2) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - - sar %cl, %eax - mov %dl, %cl - - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax - test %eax, %eax - jz L(return_null) - - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 6 +L(ret_vec_x2_test): + bsrl %eax, %eax + jz L(zero_2) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax ret - .p2align 4 -L(length_less16_part2): - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - - mov %dh, %cl - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax +L(zero_2): + xorl %eax, %eax + ret - test %eax, %eax - jnz L(length_less16_part2_return) - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax + .p2align 4,, 5 +L(ret_vec_x2): + bsrl %eax, %eax + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - mov %esi, %ecx - sar %cl, %eax - test %eax, %eax - jz L(return_null) + .p2align 4,, 5 +L(ret_vec_x3): + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 8 +L(more_4x_vec): + testl %eax, %eax + jnz L(ret_vec_x2) + + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_x3) + + addq $-(VEC_SIZE * 4), %rcx + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) + + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end + keeping the code from spilling to the next cache line. 
*/ + addq $(VEC_SIZE * 4 - 1), %rcx + andq $-(VEC_SIZE * 4), %rcx + leaq (VEC_SIZE * 4)(%rdi), %rdx + andq $-(VEC_SIZE * 4), %rdx + + .p2align 4,, 11 +L(loop_4x_vec): + movaps (VEC_SIZE * -1)(%rcx), %xmm1 + movaps (VEC_SIZE * -2)(%rcx), %xmm2 + movaps (VEC_SIZE * -3)(%rcx), %xmm3 + movaps (VEC_SIZE * -4)(%rcx), %xmm4 + pcmpeqb %xmm0, %xmm1 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm0, %xmm4 + + por %xmm1, %xmm2 + por %xmm3, %xmm4 + por %xmm2, %xmm4 + + pmovmskb %xmm4, %esi + testl %esi, %esi + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rcx + cmpq %rdx, %rcx + jne L(loop_4x_vec) + + subl %edi, %edx + + /* Ends up being 1-byte nop. */ + .p2align 4,, 2 +L(last_4x_vec): + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + testl %eax, %eax + jnz L(ret_vec_x0) + + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_end) + + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $(VEC_SIZE * 3), %edx + ja L(last_vec) + bsrl %eax, %eax + jz L(ret_4) + addl %edx, %eax + jl L(zero_3) + addq %rdi, %rax +L(ret_4): ret - .p2align 4 -L(length_less16_part2_return): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax + /* Ends up being 1-byte nop. */ + .p2align 4,, 3 +L(loop_end): + pmovmskb %xmm1, %eax + sall $16, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm2, %eax + testl %eax, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm3, %eax + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + sall $16, %eax + orl %esi, %eax + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax ret -END (__memrchr) +L(ret_vec_end): + bsrl %eax, %eax + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax + ret + /* Use in L(last_4x_vec). In the same cache line. This is just a spare + aligning bytes. */ +L(zero_3): + xorl %eax, %eax + ret + /* 2-bytes from next cache line. */ +END(__memrchr) weak_alias (__memrchr, memrchr) -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
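The subtlest arithmetic in the memrchr-sse2 rewrite is the tail bounds check shared by L(ret_vec_x0_test) and friends: after `subq $VEC_SIZE, %rdx`, adding the `bsrl` result to the adjusted length either yields the match's offset from the start of the buffer, or goes negative when the "match" sits in bytes that the 16-byte load read from before the buffer. A hedged C model follows; the function and parameter names are invented, and it assumes VEC_SIZE == 16:

```c
/* `mask' is the pcmpeqb/pmovmskb result for the 16 bytes ending at
   buf + len, so bit i corresponds to buf[len - 16 + i].  */
#include <assert.h>
#include <stddef.h>

static char *
ret_vec_x0_test_model (char *buf, size_t len, unsigned int mask)
{
  if (mask == 0)			/* bsrl sets ZF; jz L(ret_0) */
    return NULL;
  int bit = 31 - __builtin_clz (mask);	/* bsrl: highest match bit */
  long off = (long) len - 16 + bit;	/* addl %edx, %eax */
  if (off < 0)				/* jl L(zero_0): before buf[0] */
    return NULL;
  return buf + off;			/* addq %rdi, %rax */
}

int main (void)
{
  char buf[32];
  /* len == 8: the load covered buf[-8..7]; bit 3 is out of bounds,
     bit 12 is buf[4].  */
  assert (ret_vec_x0_test_model (buf, 8, 1u << 3) == NULL);
  assert (ret_vec_x0_test_model (buf, 8, 1u << 12) == buf + 4);
  assert (ret_vec_x0_test_model (buf, 32, 1u << 15) == buf + 31);
  return 0;
}
```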
* Re: [PATCH v6 4/8] x86: Optimize memrchr-sse2.S 2022-06-07 4:11 ` [PATCH v6 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein @ 2022-06-07 18:04 ` H.J. Lu 2022-07-14 2:19 ` Sunil Pandey 0 siblings, 1 reply; 82+ messages in thread From: H.J. Lu @ 2022-06-07 18:04 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The new code: > 1. prioritizes smaller lengths more. > 2. optimizes target placement more carefully. > 3. reuses logic more. > 4. fixes up various inefficiencies in the logic. > > The total code size saving is: 394 bytes > Geometric Mean of all benchmarks New / Old: 0.874 > > Regressions: > 1. The page cross case is now colder, especially re-entry from the > page cross case if a match is not found in the first VEC > (roughly 50%). My general opinion with this patch is this is > acceptable given the "coldness" of this case (less than 4%) and > generally performance improvement in the other far more common > cases. > > 2. There are some regressions 5-15% for medium/large user-arg > lengths that have a match in the first VEC. This is because the > logic was rewritten to optimize finds in the first VEC if the > user-arg length is shorter (where we see roughly 20-50% > performance improvements). It is not always the case this is a > regression. My intuition is some frontend quirk is partially > explaining the data although I haven't been able to find the > root cause. > > Full xcheck passes on x86_64. > --- > sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++-------------------- > 1 file changed, 292 insertions(+), 321 deletions(-) > > diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S > index d1a9f47911..b0dffd2ae2 100644 > --- a/sysdeps/x86_64/memrchr.S > +++ b/sysdeps/x86_64/memrchr.S > @@ -18,362 +18,333 @@ > <https://www.gnu.org/licenses/>. */ > > #include <sysdep.h> > +#define VEC_SIZE 16 > +#define PAGE_SIZE 4096 > > .text > -ENTRY (__memrchr) > - movd %esi, %xmm1 > - > - sub $16, %RDX_LP > - jbe L(length_less16) > - > - punpcklbw %xmm1, %xmm1 > - punpcklbw %xmm1, %xmm1 > - > - add %RDX_LP, %RDI_LP > - pshufd $0, %xmm1, %xmm1 > - > - movdqu (%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - > -/* Check if there is a match. 
*/ > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches0) > - > - sub $64, %rdi > - mov %edi, %ecx > - and $15, %ecx > - jz L(loop_prolog) > - > - add $16, %rdi > - add $16, %rdx > - and $-16, %rdi > - sub %rcx, %rdx > - > - .p2align 4 > -L(loop_prolog): > - sub $64, %rdx > - jbe L(exit_loop) > - > - movdqa 48(%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches48) > - > - movdqa 32(%rdi), %xmm2 > - pcmpeqb %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 16(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa (%rdi), %xmm4 > - pcmpeqb %xmm1, %xmm4 > - pmovmskb %xmm4, %eax > - test %eax, %eax > - jnz L(matches0) > - > - sub $64, %rdi > - sub $64, %rdx > - jbe L(exit_loop) > - > - movdqa 48(%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches48) > - > - movdqa 32(%rdi), %xmm2 > - pcmpeqb %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 16(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa (%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches0) > - > - mov %edi, %ecx > - and $63, %ecx > - jz L(align64_loop) > - > - add $64, %rdi > - add $64, %rdx > - and $-64, %rdi > - sub %rcx, %rdx > - > - .p2align 4 > -L(align64_loop): > - sub $64, %rdi > - sub $64, %rdx > - jbe L(exit_loop) > - > - movdqa (%rdi), %xmm0 > - movdqa 16(%rdi), %xmm2 > - movdqa 32(%rdi), %xmm3 > - movdqa 48(%rdi), %xmm4 > - > - pcmpeqb %xmm1, %xmm0 > - pcmpeqb %xmm1, %xmm2 > - pcmpeqb %xmm1, %xmm3 > - pcmpeqb %xmm1, %xmm4 > - > - pmaxub %xmm3, %xmm0 > - pmaxub %xmm4, %xmm2 > - pmaxub %xmm0, %xmm2 > - pmovmskb %xmm2, %eax > - > - test %eax, %eax > - jz L(align64_loop) > - > - pmovmskb %xmm4, %eax > - test %eax, %eax > - jnz L(matches48) > - > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 16(%rdi), %xmm2 > - > - pcmpeqb %xmm1, %xmm2 > - pcmpeqb (%rdi), %xmm1 > - > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - pmovmskb %xmm1, %eax > - bsr %eax, %eax > - > - add %rdi, %rax > +ENTRY_P2ALIGN(__memrchr, 6) > +#ifdef __ILP32__ > + /* Clear upper bits. */ > + mov %RDX_LP, %RDX_LP > +#endif > + movd %esi, %xmm0 > + > + /* Get end pointer. */ > + leaq (%rdx, %rdi), %rcx > + > + punpcklbw %xmm0, %xmm0 > + punpcklwd %xmm0, %xmm0 > + pshufd $0, %xmm0, %xmm0 > + > + /* Check if we can load 1x VEC without cross a page. */ > + testl $(PAGE_SIZE - VEC_SIZE), %ecx > + jz L(page_cross) > + > + /* NB: This load happens regardless of whether rdx (len) is zero. Since > + it doesn't cross a page and the standard gurantees any pointer have > + at least one-valid byte this load must be safe. For the entire > + history of the x86 memrchr implementation this has been possible so > + no code "should" be relying on a zero-length check before this load. > + The zero-length check is moved to the page cross case because it is > + 1) pretty cold and including it pushes the hot case len <= VEC_SIZE > + into 2-cache lines. */ > + movups -(VEC_SIZE)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + > + subq $VEC_SIZE, %rdx > + ja L(more_1x_vec) > +L(ret_vec_x0_test): > + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is > + zero. 
*/ > + bsrl %eax, %eax > + jz L(ret_0) > + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here > + if out of bounds. */ > + addl %edx, %eax > + jl L(zero_0) > + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base > + ptr. */ > + addq %rdi, %rax > +L(ret_0): > ret > > - .p2align 4 > -L(exit_loop): > - add $64, %edx > - cmp $32, %edx > - jbe L(exit_loop_32) > - > - movdqa 48(%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches48) > - > - movdqa 32(%rdi), %xmm2 > - pcmpeqb %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 16(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches16_1) > - cmp $48, %edx > - jbe L(return_null) > - > - pcmpeqb (%rdi), %xmm1 > - pmovmskb %xmm1, %eax > - test %eax, %eax > - jnz L(matches0_1) > - xor %eax, %eax > + .p2align 4,, 5 > +L(ret_vec_x0): > + bsrl %eax, %eax > + leaq -(VEC_SIZE)(%rcx, %rax), %rax > ret > > - .p2align 4 > -L(exit_loop_32): > - movdqa 48(%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches48_1) > - cmp $16, %edx > - jbe L(return_null) > - > - pcmpeqb 32(%rdi), %xmm1 > - pmovmskb %xmm1, %eax > - test %eax, %eax > - jnz L(matches32_1) > - xor %eax, %eax > + .p2align 4,, 2 > +L(zero_0): > + xorl %eax, %eax > ret > > - .p2align 4 > -L(matches0): > - bsr %eax, %eax > - add %rdi, %rax > - ret > - > - .p2align 4 > -L(matches16): > - bsr %eax, %eax > - lea 16(%rax, %rdi), %rax > - ret > > - .p2align 4 > -L(matches32): > - bsr %eax, %eax > - lea 32(%rax, %rdi), %rax > + .p2align 4,, 8 > +L(more_1x_vec): > + testl %eax, %eax > + jnz L(ret_vec_x0) > + > + /* Align rcx (pointer to string). */ > + decq %rcx > + andq $-VEC_SIZE, %rcx > + > + movq %rcx, %rdx > + /* NB: We could consistently save 1-byte in this pattern with `movaps > + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is > + it adds more frontend uops (even if the moves can be eliminated) and > + some percentage of the time actual backend uops. */ > + movaps -(VEC_SIZE)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + subq %rdi, %rdx > + pmovmskb %xmm1, %eax > + > + cmpq $(VEC_SIZE * 2), %rdx > + ja L(more_2x_vec) > +L(last_2x_vec): > + subl $VEC_SIZE, %edx > + jbe L(ret_vec_x0_test) > + > + testl %eax, %eax > + jnz L(ret_vec_x0) > + > + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + > + subl $VEC_SIZE, %edx > + bsrl %eax, %eax > + jz L(ret_1) > + addl %edx, %eax > + jl L(zero_0) > + addq %rdi, %rax > +L(ret_1): > ret > > - .p2align 4 > -L(matches48): > - bsr %eax, %eax > - lea 48(%rax, %rdi), %rax > + /* Don't align. Otherwise losing the 2-byte encoding of the jump to > + L(page_cross) causes the hot path (length <= VEC_SIZE) to span multiple > + cache lines. Naturally aligned % 16 to 8-bytes. */ > +L(page_cross): > + /* Zero length check. */ > + testq %rdx, %rdx > + jz L(zero_0) > + > + leaq -1(%rcx), %r8 > + andq $-(VEC_SIZE), %r8 > + > + movaps (%r8), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %esi > + /* Shift out negative alignment (because we are starting from endptr and > + working backwards). */ > + negl %ecx > + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count > + explicitly. 
*/ > + andl $(VEC_SIZE - 1), %ecx > + shl %cl, %esi > + movzwl %si, %eax > + leaq (%rdi, %rdx), %rcx > + cmpq %rdi, %r8 > + ja L(more_1x_vec) > + subl $VEC_SIZE, %edx > + bsrl %eax, %eax > + jz L(ret_2) > + addl %edx, %eax > + jl L(zero_1) > + addq %rdi, %rax > +L(ret_2): > ret > > - .p2align 4 > -L(matches0_1): > - bsr %eax, %eax > - sub $64, %rdx > - add %rax, %rdx > - jl L(return_null) > - add %rdi, %rax > + /* Fits in aligning bytes. */ > +L(zero_1): > + xorl %eax, %eax > ret > > - .p2align 4 > -L(matches16_1): > - bsr %eax, %eax > - sub $48, %rdx > - add %rax, %rdx > - jl L(return_null) > - lea 16(%rdi, %rax), %rax > + .p2align 4,, 5 > +L(ret_vec_x1): > + bsrl %eax, %eax > + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax > ret > > - .p2align 4 > -L(matches32_1): > - bsr %eax, %eax > - sub $32, %rdx > - add %rax, %rdx > - jl L(return_null) > - lea 32(%rdi, %rax), %rax > - ret > + .p2align 4,, 8 > +L(more_2x_vec): > + testl %eax, %eax > + jnz L(ret_vec_x0) > > - .p2align 4 > -L(matches48_1): > - bsr %eax, %eax > - sub $16, %rdx > - add %rax, %rdx > - jl L(return_null) > - lea 48(%rdi, %rax), %rax > - ret > + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + testl %eax, %eax > + jnz L(ret_vec_x1) > > - .p2align 4 > -L(return_null): > - xor %eax, %eax > - ret > > - .p2align 4 > -L(length_less16_offset0): > - test %edx, %edx > - jz L(return_null) > + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > > - mov %dl, %cl > - pcmpeqb (%rdi), %xmm1 > + subq $(VEC_SIZE * 4), %rdx > + ja L(more_4x_vec) > > - mov $1, %edx > - sal %cl, %edx > - sub $1, %edx > + addl $(VEC_SIZE), %edx > + jle L(ret_vec_x2_test) > > - pmovmskb %xmm1, %eax > +L(last_vec): > + testl %eax, %eax > + jnz L(ret_vec_x2) > > - and %edx, %eax > - test %eax, %eax > - jz L(return_null) > + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > > - bsr %eax, %eax > - add %rdi, %rax > + subl $(VEC_SIZE), %edx > + bsrl %eax, %eax > + jz L(ret_3) > + addl %edx, %eax > + jl L(zero_2) > + addq %rdi, %rax > +L(ret_3): > ret > > - .p2align 4 > -L(length_less16): > - punpcklbw %xmm1, %xmm1 > - punpcklbw %xmm1, %xmm1 > - > - add $16, %edx > - > - pshufd $0, %xmm1, %xmm1 > - > - mov %edi, %ecx > - and $15, %ecx > - jz L(length_less16_offset0) > - > - mov %cl, %dh > - mov %ecx, %esi > - add %dl, %dh > - and $-16, %rdi > - > - sub $16, %dh > - ja L(length_less16_part2) > - > - pcmpeqb (%rdi), %xmm1 > - pmovmskb %xmm1, %eax > - > - sar %cl, %eax > - mov %dl, %cl > - > - mov $1, %edx > - sal %cl, %edx > - sub $1, %edx > - > - and %edx, %eax > - test %eax, %eax > - jz L(return_null) > - > - bsr %eax, %eax > - add %rdi, %rax > - add %rsi, %rax > + .p2align 4,, 6 > +L(ret_vec_x2_test): > + bsrl %eax, %eax > + jz L(zero_2) > + addl %edx, %eax > + jl L(zero_2) > + addq %rdi, %rax > ret > > - .p2align 4 > -L(length_less16_part2): > - movdqa 16(%rdi), %xmm2 > - pcmpeqb %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - > - mov %dh, %cl > - mov $1, %edx > - sal %cl, %edx > - sub $1, %edx > - > - and %edx, %eax > +L(zero_2): > + xorl %eax, %eax > + ret > > - test %eax, %eax > - jnz L(length_less16_part2_return) > > - pcmpeqb (%rdi), %xmm1 > - pmovmskb %xmm1, %eax > + .p2align 4,, 5 > +L(ret_vec_x2): > + bsrl %eax, %eax > + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax > + ret > > - mov %esi, %ecx > - sar %cl, %eax > - test %eax, %eax > - jz L(return_null) > + .p2align 4,, 5 > +L(ret_vec_x3): > + bsrl %eax, %eax > + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > + ret > > - 
bsr %eax, %eax > - add %rdi, %rax > - add %rsi, %rax > + .p2align 4,, 8 > +L(more_4x_vec): > + testl %eax, %eax > + jnz L(ret_vec_x2) > + > + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + > + testl %eax, %eax > + jnz L(ret_vec_x3) > + > + addq $-(VEC_SIZE * 4), %rcx > + cmpq $(VEC_SIZE * 4), %rdx > + jbe L(last_4x_vec) > + > + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end > + keeping the code from spilling to the next cache line. */ > + addq $(VEC_SIZE * 4 - 1), %rcx > + andq $-(VEC_SIZE * 4), %rcx > + leaq (VEC_SIZE * 4)(%rdi), %rdx > + andq $-(VEC_SIZE * 4), %rdx > + > + .p2align 4,, 11 > +L(loop_4x_vec): > + movaps (VEC_SIZE * -1)(%rcx), %xmm1 > + movaps (VEC_SIZE * -2)(%rcx), %xmm2 > + movaps (VEC_SIZE * -3)(%rcx), %xmm3 > + movaps (VEC_SIZE * -4)(%rcx), %xmm4 > + pcmpeqb %xmm0, %xmm1 > + pcmpeqb %xmm0, %xmm2 > + pcmpeqb %xmm0, %xmm3 > + pcmpeqb %xmm0, %xmm4 > + > + por %xmm1, %xmm2 > + por %xmm3, %xmm4 > + por %xmm2, %xmm4 > + > + pmovmskb %xmm4, %esi > + testl %esi, %esi > + jnz L(loop_end) > + > + addq $-(VEC_SIZE * 4), %rcx > + cmpq %rdx, %rcx > + jne L(loop_4x_vec) > + > + subl %edi, %edx > + > + /* Ends up being 1-byte nop. */ > + .p2align 4,, 2 > +L(last_4x_vec): > + movaps -(VEC_SIZE)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + > + cmpl $(VEC_SIZE * 2), %edx > + jbe L(last_2x_vec) > + > + testl %eax, %eax > + jnz L(ret_vec_x0) > + > + > + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + > + testl %eax, %eax > + jnz L(ret_vec_end) > + > + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 > + pcmpeqb %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + > + subl $(VEC_SIZE * 3), %edx > + ja L(last_vec) > + bsrl %eax, %eax > + jz L(ret_4) > + addl %edx, %eax > + jl L(zero_3) > + addq %rdi, %rax > +L(ret_4): > ret > > - .p2align 4 > -L(length_less16_part2_return): > - bsr %eax, %eax > - lea 16(%rax, %rdi), %rax > + /* Ends up being 1-byte nop. */ > + .p2align 4,, 3 > +L(loop_end): > + pmovmskb %xmm1, %eax > + sall $16, %eax > + jnz L(ret_vec_end) > + > + pmovmskb %xmm2, %eax > + testl %eax, %eax > + jnz L(ret_vec_end) > + > + pmovmskb %xmm3, %eax > + /* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3) > + then it won't affect the result in esi (VEC4). If eax is non-zero > + then CHAR is in VEC3 and bsrl will use that position. */ > + sall $16, %eax > + orl %esi, %eax > + bsrl %eax, %eax > + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > ret > > -END (__memrchr) > +L(ret_vec_end): > + bsrl %eax, %eax > + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax > + ret > + /* Used in L(last_4x_vec). In the same cache line. These are just spare > + aligning bytes. */ > +L(zero_3): > + xorl %eax, %eax > + ret > + /* 2-bytes from next cache line. */ > +END(__memrchr) > weak_alias (__memrchr, memrchr) > -- > 2.34.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
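The pattern both the old and new SSE2 code are built on is pcmpeqb/pmovmskb: compare a 16-byte vector against the broadcast CHAR, collapse the comparison result to a 16-bit mask, and let bsr pick the highest set bit, i.e. the last matching byte of that chunk. A minimal C sketch of that idea using SSE2 intrinsics -- this is not the glibc implementation, memrchr_sketch is an illustrative name, and all of the patch's alignment and page-cross handling is omitted:

  #include <emmintrin.h>	/* SSE2 intrinsics.  */
  #include <stddef.h>

  static void *
  memrchr_sketch (const void *s, int c, size_t n)
  {
    const unsigned char *p = (const unsigned char *) s + n;
    const __m128i match = _mm_set1_epi8 ((char) c);	/* Broadcast CHAR.  */

    while (n >= 16)
      {
        p -= 16;
        n -= 16;
        __m128i cmp
          = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) p), match);
        unsigned int mask = (unsigned int) _mm_movemask_epi8 (cmp);
        if (mask != 0)
          /* Highest set bit == last matching byte of this chunk; the
             assembly computes this with bsrl.  */
          return (void *) (p + (31 - __builtin_clz (mask)));
      }
    /* Scalar tail for the remaining n < 16 bytes.  */
    while (n--)
      if (*--p == (unsigned char) c)
        return (void *) p;
    return NULL;
  }

Everything the sketch leaves out is where the patch earns its speedup: the single potentially-unaligned load for len <= VEC_SIZE, the aligned backwards 4x-VEC loop, and the careful cache-line placement of the hot exit paths.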
* Re: [PATCH v6 4/8] x86: Optimize memrchr-sse2.S 2022-06-07 18:04 ` H.J. Lu @ 2022-07-14 2:19 ` Sunil Pandey 0 siblings, 0 replies; 82+ messages in thread From: Sunil Pandey @ 2022-07-14 2:19 UTC (permalink / raw) To: H.J. Lu; +Cc: Noah Goldstein, GNU C Library On Tue, Jun 7, 2022 at 11:07 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > The new code: > > 1. prioritizes smaller lengths more. > > 2. optimizes target placement more carefully. > > 3. reuses logic more. > > 4. fixes up various inefficiencies in the logic. > > > > The total code size saving is: 394 bytes > > Geometric Mean of all benchmarks New / Old: 0.874 > > > > Regressions: > > 1. The page cross case is now colder, especially re-entry from the > > page cross case if a match is not found in the first VEC > > (roughly 50%). My general opinion with this patch is this is > > acceptable given the "coldness" of this case (less than 4%) and > > generally performance improvement in the other far more common > > cases. > > > > 2. There are some regressions 5-15% for medium/large user-arg > > lengths that have a match in the first VEC. This is because the > > logic was rewritten to optimize finds in the first VEC if the > > user-arg length is shorter (where we see roughly 20-50% > > performance improvements). It is not always the case this is a > > regression. My intuition is some frontend quirk is partially > > explaining the data although I haven't been able to find the > > root cause. > > > > Full xcheck passes on x86_64. > > --- > > sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++-------------------- > > 1 file changed, 292 insertions(+), 321 deletions(-) > > > > diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S > > index d1a9f47911..b0dffd2ae2 100644 > > --- a/sysdeps/x86_64/memrchr.S > > +++ b/sysdeps/x86_64/memrchr.S > > @@ -18,362 +18,333 @@ > > <https://www.gnu.org/licenses/>. */ > > > > #include <sysdep.h> > > +#define VEC_SIZE 16 > > +#define PAGE_SIZE 4096 > > > > .text > > -ENTRY (__memrchr) > > - movd %esi, %xmm1 > > - > > - sub $16, %RDX_LP > > - jbe L(length_less16) > > - > > - punpcklbw %xmm1, %xmm1 > > - punpcklbw %xmm1, %xmm1 > > - > > - add %RDX_LP, %RDI_LP > > - pshufd $0, %xmm1, %xmm1 > > - > > - movdqu (%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - > > -/* Check if there is a match. 
*/ > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches0) > > - > > - sub $64, %rdi > > - mov %edi, %ecx > > - and $15, %ecx > > - jz L(loop_prolog) > > - > > - add $16, %rdi > > - add $16, %rdx > > - and $-16, %rdi > > - sub %rcx, %rdx > > - > > - .p2align 4 > > -L(loop_prolog): > > - sub $64, %rdx > > - jbe L(exit_loop) > > - > > - movdqa 48(%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches48) > > - > > - movdqa 32(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 16(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa (%rdi), %xmm4 > > - pcmpeqb %xmm1, %xmm4 > > - pmovmskb %xmm4, %eax > > - test %eax, %eax > > - jnz L(matches0) > > - > > - sub $64, %rdi > > - sub $64, %rdx > > - jbe L(exit_loop) > > - > > - movdqa 48(%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches48) > > - > > - movdqa 32(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 16(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa (%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches0) > > - > > - mov %edi, %ecx > > - and $63, %ecx > > - jz L(align64_loop) > > - > > - add $64, %rdi > > - add $64, %rdx > > - and $-64, %rdi > > - sub %rcx, %rdx > > - > > - .p2align 4 > > -L(align64_loop): > > - sub $64, %rdi > > - sub $64, %rdx > > - jbe L(exit_loop) > > - > > - movdqa (%rdi), %xmm0 > > - movdqa 16(%rdi), %xmm2 > > - movdqa 32(%rdi), %xmm3 > > - movdqa 48(%rdi), %xmm4 > > - > > - pcmpeqb %xmm1, %xmm0 > > - pcmpeqb %xmm1, %xmm2 > > - pcmpeqb %xmm1, %xmm3 > > - pcmpeqb %xmm1, %xmm4 > > - > > - pmaxub %xmm3, %xmm0 > > - pmaxub %xmm4, %xmm2 > > - pmaxub %xmm0, %xmm2 > > - pmovmskb %xmm2, %eax > > - > > - test %eax, %eax > > - jz L(align64_loop) > > - > > - pmovmskb %xmm4, %eax > > - test %eax, %eax > > - jnz L(matches48) > > - > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 16(%rdi), %xmm2 > > - > > - pcmpeqb %xmm1, %xmm2 > > - pcmpeqb (%rdi), %xmm1 > > - > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - pmovmskb %xmm1, %eax > > - bsr %eax, %eax > > - > > - add %rdi, %rax > > +ENTRY_P2ALIGN(__memrchr, 6) > > +#ifdef __ILP32__ > > + /* Clear upper bits. */ > > + mov %RDX_LP, %RDX_LP > > +#endif > > + movd %esi, %xmm0 > > + > > + /* Get end pointer. */ > > + leaq (%rdx, %rdi), %rcx > > + > > + punpcklbw %xmm0, %xmm0 > > + punpcklwd %xmm0, %xmm0 > > + pshufd $0, %xmm0, %xmm0 > > + > > + /* Check if we can load 1x VEC without cross a page. */ > > + testl $(PAGE_SIZE - VEC_SIZE), %ecx > > + jz L(page_cross) > > + > > + /* NB: This load happens regardless of whether rdx (len) is zero. Since > > + it doesn't cross a page and the standard gurantees any pointer have > > + at least one-valid byte this load must be safe. For the entire > > + history of the x86 memrchr implementation this has been possible so > > + no code "should" be relying on a zero-length check before this load. > > + The zero-length check is moved to the page cross case because it is > > + 1) pretty cold and including it pushes the hot case len <= VEC_SIZE > > + into 2-cache lines. 
*/ > > + movups -(VEC_SIZE)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + > > + subq $VEC_SIZE, %rdx > > + ja L(more_1x_vec) > > +L(ret_vec_x0_test): > > + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is > > + zero. */ > > + bsrl %eax, %eax > > + jz L(ret_0) > > + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here > > + if out of bounds. */ > > + addl %edx, %eax > > + jl L(zero_0) > > + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base > > + ptr. */ > > + addq %rdi, %rax > > +L(ret_0): > > ret > > > > - .p2align 4 > > -L(exit_loop): > > - add $64, %edx > > - cmp $32, %edx > > - jbe L(exit_loop_32) > > - > > - movdqa 48(%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches48) > > - > > - movdqa 32(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 16(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches16_1) > > - cmp $48, %edx > > - jbe L(return_null) > > - > > - pcmpeqb (%rdi), %xmm1 > > - pmovmskb %xmm1, %eax > > - test %eax, %eax > > - jnz L(matches0_1) > > - xor %eax, %eax > > + .p2align 4,, 5 > > +L(ret_vec_x0): > > + bsrl %eax, %eax > > + leaq -(VEC_SIZE)(%rcx, %rax), %rax > > ret > > > > - .p2align 4 > > -L(exit_loop_32): > > - movdqa 48(%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches48_1) > > - cmp $16, %edx > > - jbe L(return_null) > > - > > - pcmpeqb 32(%rdi), %xmm1 > > - pmovmskb %xmm1, %eax > > - test %eax, %eax > > - jnz L(matches32_1) > > - xor %eax, %eax > > + .p2align 4,, 2 > > +L(zero_0): > > + xorl %eax, %eax > > ret > > > > - .p2align 4 > > -L(matches0): > > - bsr %eax, %eax > > - add %rdi, %rax > > - ret > > - > > - .p2align 4 > > -L(matches16): > > - bsr %eax, %eax > > - lea 16(%rax, %rdi), %rax > > - ret > > > > - .p2align 4 > > -L(matches32): > > - bsr %eax, %eax > > - lea 32(%rax, %rdi), %rax > > + .p2align 4,, 8 > > +L(more_1x_vec): > > + testl %eax, %eax > > + jnz L(ret_vec_x0) > > + > > + /* Align rcx (pointer to string). */ > > + decq %rcx > > + andq $-VEC_SIZE, %rcx > > + > > + movq %rcx, %rdx > > + /* NB: We could consistenyl save 1-byte in this pattern with `movaps > > + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is > > + it adds more frontend uops (even if the moves can be eliminated) and > > + some percentage of the time actual backend uops. */ > > + movaps -(VEC_SIZE)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + subq %rdi, %rdx > > + pmovmskb %xmm1, %eax > > + > > + cmpq $(VEC_SIZE * 2), %rdx > > + ja L(more_2x_vec) > > +L(last_2x_vec): > > + subl $VEC_SIZE, %edx > > + jbe L(ret_vec_x0_test) > > + > > + testl %eax, %eax > > + jnz L(ret_vec_x0) > > + > > + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + > > + subl $VEC_SIZE, %edx > > + bsrl %eax, %eax > > + jz L(ret_1) > > + addl %edx, %eax > > + jl L(zero_0) > > + addq %rdi, %rax > > +L(ret_1): > > ret > > > > - .p2align 4 > > -L(matches48): > > - bsr %eax, %eax > > - lea 48(%rax, %rdi), %rax > > + /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross) > > + causes the hot pause (length <= VEC_SIZE) to span multiple cache > > + lines. Naturally aligned % 16 to 8-bytes. */ > > +L(page_cross): > > + /* Zero length check. 
*/ > > + testq %rdx, %rdx > > + jz L(zero_0) > > + > > + leaq -1(%rcx), %r8 > > + andq $-(VEC_SIZE), %r8 > > + > > + movaps (%r8), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %esi > > + /* Shift out negative alignment (because we are starting from endptr and > > + working backwards). */ > > + negl %ecx > > + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count > > + explicitly. */ > > + andl $(VEC_SIZE - 1), %ecx > > + shl %cl, %esi > > + movzwl %si, %eax > > + leaq (%rdi, %rdx), %rcx > > + cmpq %rdi, %r8 > > + ja L(more_1x_vec) > > + subl $VEC_SIZE, %edx > > + bsrl %eax, %eax > > + jz L(ret_2) > > + addl %edx, %eax > > + jl L(zero_1) > > + addq %rdi, %rax > > +L(ret_2): > > ret > > > > - .p2align 4 > > -L(matches0_1): > > - bsr %eax, %eax > > - sub $64, %rdx > > - add %rax, %rdx > > - jl L(return_null) > > - add %rdi, %rax > > + /* Fits in aliging bytes. */ > > +L(zero_1): > > + xorl %eax, %eax > > ret > > > > - .p2align 4 > > -L(matches16_1): > > - bsr %eax, %eax > > - sub $48, %rdx > > - add %rax, %rdx > > - jl L(return_null) > > - lea 16(%rdi, %rax), %rax > > + .p2align 4,, 5 > > +L(ret_vec_x1): > > + bsrl %eax, %eax > > + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax > > ret > > > > - .p2align 4 > > -L(matches32_1): > > - bsr %eax, %eax > > - sub $32, %rdx > > - add %rax, %rdx > > - jl L(return_null) > > - lea 32(%rdi, %rax), %rax > > - ret > > + .p2align 4,, 8 > > +L(more_2x_vec): > > + testl %eax, %eax > > + jnz L(ret_vec_x0) > > > > - .p2align 4 > > -L(matches48_1): > > - bsr %eax, %eax > > - sub $16, %rdx > > - add %rax, %rdx > > - jl L(return_null) > > - lea 48(%rdi, %rax), %rax > > - ret > > + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + testl %eax, %eax > > + jnz L(ret_vec_x1) > > > > - .p2align 4 > > -L(return_null): > > - xor %eax, %eax > > - ret > > > > - .p2align 4 > > -L(length_less16_offset0): > > - test %edx, %edx > > - jz L(return_null) > > + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > > > - mov %dl, %cl > > - pcmpeqb (%rdi), %xmm1 > > + subq $(VEC_SIZE * 4), %rdx > > + ja L(more_4x_vec) > > > > - mov $1, %edx > > - sal %cl, %edx > > - sub $1, %edx > > + addl $(VEC_SIZE), %edx > > + jle L(ret_vec_x2_test) > > > > - pmovmskb %xmm1, %eax > > +L(last_vec): > > + testl %eax, %eax > > + jnz L(ret_vec_x2) > > > > - and %edx, %eax > > - test %eax, %eax > > - jz L(return_null) > > + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > > > - bsr %eax, %eax > > - add %rdi, %rax > > + subl $(VEC_SIZE), %edx > > + bsrl %eax, %eax > > + jz L(ret_3) > > + addl %edx, %eax > > + jl L(zero_2) > > + addq %rdi, %rax > > +L(ret_3): > > ret > > > > - .p2align 4 > > -L(length_less16): > > - punpcklbw %xmm1, %xmm1 > > - punpcklbw %xmm1, %xmm1 > > - > > - add $16, %edx > > - > > - pshufd $0, %xmm1, %xmm1 > > - > > - mov %edi, %ecx > > - and $15, %ecx > > - jz L(length_less16_offset0) > > - > > - mov %cl, %dh > > - mov %ecx, %esi > > - add %dl, %dh > > - and $-16, %rdi > > - > > - sub $16, %dh > > - ja L(length_less16_part2) > > - > > - pcmpeqb (%rdi), %xmm1 > > - pmovmskb %xmm1, %eax > > - > > - sar %cl, %eax > > - mov %dl, %cl > > - > > - mov $1, %edx > > - sal %cl, %edx > > - sub $1, %edx > > - > > - and %edx, %eax > > - test %eax, %eax > > - jz L(return_null) > > - > > - bsr %eax, %eax > > - add %rdi, %rax > > - add %rsi, %rax > > + .p2align 4,, 6 > > +L(ret_vec_x2_test): > > + bsrl %eax, %eax > > + jz L(zero_2) > > + addl %edx, %eax > 
> + jl L(zero_2) > > + addq %rdi, %rax > > ret > > > > - .p2align 4 > > -L(length_less16_part2): > > - movdqa 16(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - > > - mov %dh, %cl > > - mov $1, %edx > > - sal %cl, %edx > > - sub $1, %edx > > - > > - and %edx, %eax > > +L(zero_2): > > + xorl %eax, %eax > > + ret > > > > - test %eax, %eax > > - jnz L(length_less16_part2_return) > > > > - pcmpeqb (%rdi), %xmm1 > > - pmovmskb %xmm1, %eax > > + .p2align 4,, 5 > > +L(ret_vec_x2): > > + bsrl %eax, %eax > > + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax > > + ret > > > > - mov %esi, %ecx > > - sar %cl, %eax > > - test %eax, %eax > > - jz L(return_null) > > + .p2align 4,, 5 > > +L(ret_vec_x3): > > + bsrl %eax, %eax > > + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > > + ret > > > > - bsr %eax, %eax > > - add %rdi, %rax > > - add %rsi, %rax > > + .p2align 4,, 8 > > +L(more_4x_vec): > > + testl %eax, %eax > > + jnz L(ret_vec_x2) > > + > > + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + > > + testl %eax, %eax > > + jnz L(ret_vec_x3) > > + > > + addq $-(VEC_SIZE * 4), %rcx > > + cmpq $(VEC_SIZE * 4), %rdx > > + jbe L(last_4x_vec) > > + > > + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end > > + keeping the code from spilling to the next cache line. */ > > + addq $(VEC_SIZE * 4 - 1), %rcx > > + andq $-(VEC_SIZE * 4), %rcx > > + leaq (VEC_SIZE * 4)(%rdi), %rdx > > + andq $-(VEC_SIZE * 4), %rdx > > + > > + .p2align 4,, 11 > > +L(loop_4x_vec): > > + movaps (VEC_SIZE * -1)(%rcx), %xmm1 > > + movaps (VEC_SIZE * -2)(%rcx), %xmm2 > > + movaps (VEC_SIZE * -3)(%rcx), %xmm3 > > + movaps (VEC_SIZE * -4)(%rcx), %xmm4 > > + pcmpeqb %xmm0, %xmm1 > > + pcmpeqb %xmm0, %xmm2 > > + pcmpeqb %xmm0, %xmm3 > > + pcmpeqb %xmm0, %xmm4 > > + > > + por %xmm1, %xmm2 > > + por %xmm3, %xmm4 > > + por %xmm2, %xmm4 > > + > > + pmovmskb %xmm4, %esi > > + testl %esi, %esi > > + jnz L(loop_end) > > + > > + addq $-(VEC_SIZE * 4), %rcx > > + cmpq %rdx, %rcx > > + jne L(loop_4x_vec) > > + > > + subl %edi, %edx > > + > > + /* Ends up being 1-byte nop. */ > > + .p2align 4,, 2 > > +L(last_4x_vec): > > + movaps -(VEC_SIZE)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + > > + cmpl $(VEC_SIZE * 2), %edx > > + jbe L(last_2x_vec) > > + > > + testl %eax, %eax > > + jnz L(ret_vec_x0) > > + > > + > > + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + > > + testl %eax, %eax > > + jnz L(ret_vec_end) > > + > > + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 > > + pcmpeqb %xmm0, %xmm1 > > + pmovmskb %xmm1, %eax > > + > > + subl $(VEC_SIZE * 3), %edx > > + ja L(last_vec) > > + bsrl %eax, %eax > > + jz L(ret_4) > > + addl %edx, %eax > > + jl L(zero_3) > > + addq %rdi, %rax > > +L(ret_4): > > ret > > > > - .p2align 4 > > -L(length_less16_part2_return): > > - bsr %eax, %eax > > - lea 16(%rax, %rdi), %rax > > + /* Ends up being 1-byte nop. */ > > + .p2align 4,, 3 > > +L(loop_end): > > + pmovmskb %xmm1, %eax > > + sall $16, %eax > > + jnz L(ret_vec_end) > > + > > + pmovmskb %xmm2, %eax > > + testl %eax, %eax > > + jnz L(ret_vec_end) > > + > > + pmovmskb %xmm3, %eax > > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > > + then it won't affect the result in esi (VEC4). If ecx is non-zero > > + then CHAR in VEC3 and bsrq will use that position. 
*/ > > + sall $16, %eax > > + orl %esi, %eax > > + bsrl %eax, %eax > > + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > > ret > > > > -END (__memrchr) > > +L(ret_vec_end): > > + bsrl %eax, %eax > > + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax > > + ret > > + /* Use in L(last_4x_vec). In the same cache line. This is just a spare > > + aligning bytes. */ > > +L(zero_3): > > + xorl %eax, %eax > > + ret > > + /* 2-bytes from next cache line. */ > > +END(__memrchr) > > weak_alias (__memrchr, memrchr) > > -- > > 2.34.1 > > > > LGTM. > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > Thanks. > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil ^ permalink raw reply [flat|nested] 82+ messages in thread
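For anyone reviewing the backport, the page-cross gate the patch introduces (`testl $(PAGE_SIZE - VEC_SIZE), %ecx; jz L(page_cross)`, with rcx holding the end pointer) can be read as the following C predicate. This is a hedged sketch only; may_cross_page is a made-up name, and the two defines mirror the patch's values:

  #include <stdbool.h>
  #include <stdint.h>

  #define PAGE_SIZE 4096
  #define VEC_SIZE 16

  static bool
  may_cross_page (const void *end)
  {
    /* Zero exactly when end % PAGE_SIZE < VEC_SIZE, i.e. when the
       VEC_SIZE bytes ending at `end` might straddle a page boundary.  */
    return ((uintptr_t) end & (PAGE_SIZE - VEC_SIZE)) == 0;
  }

The test is deliberately conservative: end % PAGE_SIZE == 0 also takes the cold path even though that load sits entirely inside the previous page, which keeps the hot-path check to a single test-and-branch.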
* [PATCH v6 5/8] x86: Optimize memrchr-evex.S 2022-06-07 4:11 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (2 preceding siblings ...) 2022-06-07 4:11 ` [PATCH v6 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein @ 2022-06-07 4:11 ` Noah Goldstein 2022-06-07 18:21 ` H.J. Lu 2022-06-07 4:11 ` [PATCH v6 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein ` (3 subsequent siblings) 7 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:11 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller user-arg lengths more. 2. optimizes target placement more carefully. 3. reuses logic more. 4. fixes up various inefficiencies in the logic. The biggest case here is the `lzcnt` logic for checking returns, which saves either a branch or multiple instructions. The total code size saving is: 263 bytes Geometric Mean of all benchmarks New / Old: 0.755 Regressions: There are some regressions. Particularly where the length (user arg length) is large but the position of the match char is near the beginning of the string (in first VEC). This case has roughly a 20% regression. This is because the new logic gives the hot path for immediate matches to shorter lengths (the more common input). That shorter-length case sees roughly a 35% speedup. Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++------------ 1 file changed, 268 insertions(+), 271 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S index 0b99709c6b..2d7da06dfc 100644 --- a/sysdeps/x86_64/multiarch/memrchr-evex.S +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S @@ -19,319 +19,316 @@ #if IS_IN (libc) # include <sysdep.h> +# include "evex256-vecs.h" +# if VEC_SIZE != 32 +# error "VEC_SIZE != 32 unimplemented" +# endif + +# ifndef MEMRCHR +# define MEMRCHR __memrchr_evex +# endif + +# define PAGE_SIZE 4096 +# define VECMATCH VEC(0) + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN(MEMRCHR, 6) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) + + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up the end ptr so + the lzcnt result can be subtracted from it directly. */ + leaq -1(%rdi, %rdx), %rax + vpbroadcastb %esi, %VECMATCH + + /* Check if we can load 1x VEC without crossing a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + + /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + + /* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE), which + guarantees edx (len) is not greater than it. */ + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret -# define VMOVA vmovdqa64 - -# define YMMMATCH ymm16 - -# define VEC_SIZE 32 - - .section .text.evex,"ax",@progbits -ENTRY (__memrchr_evex) - /* Broadcast CHAR to YMMMATCH. */ - vpbroadcastb %esi, %YMMMATCH - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP - - /* Check the last VEC_SIZE bytes. 
*/ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx - - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 - kord %k1, %k2, %k5 - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 - - kord %k3, %k4, %k6 - kortestd %k5, %k6 - jz L(loop_4x_vec) - - /* There is a match. */ - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - kmovd %k1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0_dec): + decq %rax +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax ret - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) + /* Align rax (pointer to string). */ + andq $-VEC_SIZE, %rax - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) + /* Recompute length after aligning. */ + movq %rax, %rdx - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - ret + subq %rdi, %rdx - .p2align 4 + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) + + /* Must dec rax because L(ret_vec_x0_test) expects it. 
*/ + decq %rax cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax ret - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which in turn in necessary for hot path (len <= VEC_SIZE) to fit + in first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpb $0, (%rsi), %VECMATCH, %k0 + kmovd %k0, %r8d + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %ecx + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ + notl %ecx + shlxl %ecx, %r8d, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_1) + subq %rcx, %rax ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax + /* Continue creating zero labels that fit in aligning bytes and get + 2-byte encoding / are in the same cache line as condition. */ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + .p2align 4,, 8 +L(ret_vec_x1): + /* This will naturally add 32 to position. */ + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - ret + vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - .p2align 4 -L(zero): - xorl %eax, %eax + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx - - kmovd %k1, %eax - - /* Remove the trailing bytes. 
*/ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 8 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx - - /* Check for zero length. */ - testl %edx, %edx - jz L(zero) - - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) - - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + .p2align 4,, 8 +L(ret_vec_x2): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + .p2align 4,, 8 +L(ret_vec_x3): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - /* Check the last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax + .p2align 4,, 8 +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + testl %ecx, %ecx + jnz L(ret_vec_x3) - andl %edx, %eax - testl %eax, %eax - jz L(zero) + /* Check if near end before re-aligning (otherwise might do an + unnecessary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - ret + decq %rax + andq $-(VEC_SIZE * 4), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + andq $-(VEC_SIZE * 4), %rdx .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx - - /* Check the last VEC. */ - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +L(loop_4x_vec): + /* Store 1 where not-equals and 0 where equals in k1 (used to mask later + on). */ + vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1 + + /* VEC(2/3) will have zero-byte where we found a CHAR. */ + vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2) + vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3) + vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4 + + /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where + CHAR is found and VEC(2/3) have zero-byte where CHAR is found). */ + vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z} + vptestnmb %VEC(3), %VEC(3), %k2 + + /* Any 1s and we found CHAR. */ + kortestd %k2, %k4 + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) + + /* Need to re-adjust rdx / rax for L(last_4x_vec). */ + subq $-(VEC_SIZE * 4), %rdx + movq %rdx, %rax + subl %edi, %edx +L(last_4x_vec): + + /* Used no matter what. */ + vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - kmovd %k1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - /* Remove the trailing bytes. */ - andl %edx, %eax - - testl %eax, %eax - jnz L(last_vec_x1) + vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - /* Check the second last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 + testl %ecx, %ecx + jnz L(ret_vec_x1) - movl %r8d, %ecx + /* Used no matter what. */ + vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - kmovd %k1, %eax + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. 
*/ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret_1) + xorl %eax, %eax +L(ret_1): + ret - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 6 +L(loop_end): + kmovd %k1, %ecx + notl %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vptestnmb %VEC(2), %VEC(2), %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + kmovd %k2, %ecx + kmovd %k4, %esi + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + addq %rcx, %rax + ret + .p2align 4,, 4 +L(ret_vec_x0_end): + addq $(VEC_SIZE), %rax +L(ret_vec_x1_end): + bsrl %ecx, %ecx + leaq (VEC_SIZE * 2)(%rax, %rcx), %rax ret -END (__memrchr_evex) + +END(MEMRCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
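The `lzcnt` return logic the commit message highlights falls out of setting rax to end - 1 at entry: if bit i of the match mask covers byte end - VEC_SIZE + i, then the last match sits at end - 1 - lzcnt(mask), and a single comparison rejects both "no match at all" (lzcnt == 32) and "match lies before the buffer start". A hedged C sketch of L(ret_vec_x0_test) -- the C function and its signature are illustrative, not glibc code:

  #include <stddef.h>
  #include <stdint.h>

  #define VEC_SIZE 32

  /* mask: bit i set iff byte end[i - VEC_SIZE] == CHAR; the caller
     guarantees len <= VEC_SIZE on this path.  */
  static const unsigned char *
  ret_vec_x0_test (const unsigned char *end, size_t len, uint32_t mask)
  {
    /* lzcnt yields 32 for a zero input; __builtin_clz (0) is undefined,
       so spell that case out.  */
    unsigned int lz = mask != 0 ? (unsigned int) __builtin_clz (mask) : 32;
    if (len <= lz)		/* No match, or match before the start.  */
      return NULL;		/* L(zero_0).  */
    return end - 1 - lz;	/* subq %rcx, %rax with rax == end - 1.  */
  }

The design point is that one lzcnt plus one compare replaces the old code's separate bsr, bounds arithmetic, and zero test.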
* Re: [PATCH v6 5/8] x86: Optimize memrchr-evex.S 2022-06-07 4:11 ` [PATCH v6 5/8] x86: Optimize memrchr-evex.S Noah Goldstein @ 2022-06-07 18:21 ` H.J. Lu 2022-07-14 2:21 ` Sunil Pandey 0 siblings, 1 reply; 82+ messages in thread From: H.J. Lu @ 2022-06-07 18:21 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The new code: > 1. prioritizes smaller user-arg lengths more. > 2. optimizes target placement more carefully > 3. reuses logic more > 4. fixes up various inefficiencies in the logic. The biggest > case here is the `lzcnt` logic for checking returns which > saves either a branch or multiple instructions. > > The total code size saving is: 263 bytes > Geometric Mean of all benchmarks New / Old: 0.755 > > Regressions: > There are some regressions. Particularly where the length (user arg > length) is large but the position of the match char is near the > beginning of the string (in first VEC). This case has roughly a > 20% regression. > > This is because the new logic gives the hot path for immediate matches > to shorter lengths (the more common input). This case has roughly > a 35% speedup. > > Full xcheck passes on x86_64. > --- > sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++------------ > 1 file changed, 268 insertions(+), 271 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S > index 0b99709c6b..2d7da06dfc 100644 > --- a/sysdeps/x86_64/multiarch/memrchr-evex.S > +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S > @@ -19,319 +19,316 @@ > #if IS_IN (libc) > > # include <sysdep.h> > +# include "evex256-vecs.h" > +# if VEC_SIZE != 32 > +# error "VEC_SIZE != 32 unimplemented" > +# endif > + > +# ifndef MEMRCHR > +# define MEMRCHR __memrchr_evex > +# endif > + > +# define PAGE_SIZE 4096 > +# define VECMATCH VEC(0) > + > + .section SECTION(.text), "ax", @progbits > +ENTRY_P2ALIGN(MEMRCHR, 6) > +# ifdef __ILP32__ > + /* Clear upper bits. */ > + and %RDX_LP, %RDX_LP > +# else > + test %RDX_LP, %RDX_LP > +# endif > + jz L(zero_0) > + > + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a > + correct page cross check and 2) it correctly sets up end ptr to be > + subtract by lzcnt aligned. */ > + leaq -1(%rdi, %rdx), %rax > + vpbroadcastb %esi, %VECMATCH > + > + /* Check if we can load 1x VEC without cross a page. */ > + testl $(PAGE_SIZE - VEC_SIZE), %eax > + jz L(page_cross) > + > + /* Don't use rax for pointer here because EVEX has better encoding with > + offset % VEC_SIZE == 0. */ > + vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 > + kmovd %k0, %ecx > + > + /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ > + cmpq $VEC_SIZE, %rdx > + ja L(more_1x_vec) > +L(ret_vec_x0_test): > + > + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which > + will guarantee edx (len) is less than it. */ > + lzcntl %ecx, %ecx > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > > -# define VMOVA vmovdqa64 > - > -# define YMMMATCH ymm16 > - > -# define VEC_SIZE 32 > - > - .section .text.evex,"ax",@progbits > -ENTRY (__memrchr_evex) > - /* Broadcast CHAR to YMMMATCH. */ > - vpbroadcastb %esi, %YMMMATCH > - > - sub $VEC_SIZE, %RDX_LP > - jbe L(last_vec_or_less) > - > - add %RDX_LP, %RDI_LP > - > - /* Check the last VEC_SIZE bytes. 
*/ > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > - jnz L(last_vec_x0) > - > - subq $(VEC_SIZE * 4), %rdi > - movl %edi, %ecx > - andl $(VEC_SIZE - 1), %ecx > - jz L(aligned_more) > - > - /* Align data for aligned loads in the loop. */ > - addq $VEC_SIZE, %rdi > - addq $VEC_SIZE, %rdx > - andq $-VEC_SIZE, %rdi > - subq %rcx, %rdx > - > - .p2align 4 > -L(aligned_more): > - subq $(VEC_SIZE * 4), %rdx > - jbe L(last_4x_vec_or_less) > - > - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time > - since data is only aligned to VEC_SIZE. */ > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 > - kmovd %k2, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 > - kmovd %k3, %eax > - testl %eax, %eax > - jnz L(last_vec_x1) > - > - vpcmpb $0, (%rdi), %YMMMATCH, %k4 > - kmovd %k4, %eax > - testl %eax, %eax > - jnz L(last_vec_x0) > - > - /* Align data to 4 * VEC_SIZE for loop with fewer branches. > - There are some overlaps with above if data isn't aligned > - to 4 * VEC_SIZE. */ > - movl %edi, %ecx > - andl $(VEC_SIZE * 4 - 1), %ecx > - jz L(loop_4x_vec) > - > - addq $(VEC_SIZE * 4), %rdi > - addq $(VEC_SIZE * 4), %rdx > - andq $-(VEC_SIZE * 4), %rdi > - subq %rcx, %rdx > + /* Fits in aligning bytes of first cache line. */ > +L(zero_0): > + xorl %eax, %eax > + ret > > - .p2align 4 > -L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > - subq $(VEC_SIZE * 4), %rdi > - subq $(VEC_SIZE * 4), %rdx > - jbe L(last_4x_vec_or_less) > - > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 > - kord %k1, %k2, %k5 > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 > - > - kord %k3, %k4, %k6 > - kortestd %k5, %k6 > - jz L(loop_4x_vec) > - > - /* There is a match. */ > - kmovd %k4, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - kmovd %k3, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - kmovd %k2, %eax > - testl %eax, %eax > - jnz L(last_vec_x1) > - > - kmovd %k1, %eax > - bsrl %eax, %eax > - addq %rdi, %rax > + .p2align 4,, 9 > +L(ret_vec_x0_dec): > + decq %rax > +L(ret_vec_x0): > + lzcntl %ecx, %ecx > + subq %rcx, %rax > ret > > - .p2align 4 > -L(last_4x_vec_or_less): > - addl $(VEC_SIZE * 4), %edx > - cmpl $(VEC_SIZE * 2), %edx > - jbe L(last_2x_vec) > + .p2align 4,, 10 > +L(more_1x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > + /* Align rax (pointer to string). */ > + andq $-VEC_SIZE, %rax > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 > - kmovd %k2, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > + /* Recompute length after aligning. */ > + movq %rax, %rdx > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 > - kmovd %k3, %eax > - testl %eax, %eax > - jnz L(last_vec_x1_check) > - cmpl $(VEC_SIZE * 3), %edx > - jbe L(zero) > + /* Need no matter what. 
*/ > + vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > > - vpcmpb $0, (%rdi), %YMMMATCH, %k4 > - kmovd %k4, %eax > - testl %eax, %eax > - jz L(zero) > - bsrl %eax, %eax > - subq $(VEC_SIZE * 4), %rdx > - addq %rax, %rdx > - jl L(zero) > - addq %rdi, %rax > - ret > + subq %rdi, %rdx > > - .p2align 4 > + cmpq $(VEC_SIZE * 2), %rdx > + ja L(more_2x_vec) > L(last_2x_vec): > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3_check) > + > + /* Must dec rax because L(ret_vec_x0_test) expects it. */ > + decq %rax > cmpl $VEC_SIZE, %edx > - jbe L(zero) > - > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > - jz L(zero) > - bsrl %eax, %eax > - subq $(VEC_SIZE * 2), %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $(VEC_SIZE * 2), %eax > - addq %rdi, %rax > + jbe L(ret_vec_x0_test) > + > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > + > + /* Don't use rax for pointer here because EVEX has better encoding with > + offset % VEC_SIZE == 0. */ > + vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0 > + kmovd %k0, %ecx > + /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ > + lzcntq %rcx, %rcx > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > ret > > - .p2align 4 > -L(last_vec_x0): > - bsrl %eax, %eax > - addq %rdi, %rax > + /* Inexpensive place to put this regarding code size / target alignments > + / ICache NLP. Necessary for 2-byte encoding of jump to page cross > + case which in turn in necessary for hot path (len <= VEC_SIZE) to fit is necessary? > + in first cache line. */ > +L(page_cross): > + movq %rax, %rsi > + andq $-VEC_SIZE, %rsi > + vpcmpb $0, (%rsi), %VECMATCH, %k0 > + kmovd %k0, %r8d > + /* Shift out negative alignment (because we are starting from endptr and > + working backwards). */ > + movl %eax, %ecx > + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ > + notl %ecx > + shlxl %ecx, %r8d, %ecx > + cmpq %rdi, %rsi > + ja L(more_1x_vec) > + lzcntl %ecx, %ecx > + cmpl %ecx, %edx > + jle L(zero_1) > + subq %rcx, %rax > ret > > - .p2align 4 > -L(last_vec_x1): > - bsrl %eax, %eax > - addl $VEC_SIZE, %eax > - addq %rdi, %rax > + /* Continue creating zero labels that fit in aligning bytes and get > + 2-byte encoding / are in the same cache line as condition. */ > +L(zero_1): > + xorl %eax, %eax > ret > > - .p2align 4 > -L(last_vec_x2): > - bsrl %eax, %eax > - addl $(VEC_SIZE * 2), %eax > - addq %rdi, %rax > + .p2align 4,, 8 > +L(ret_vec_x1): > + /* This will naturally add 32 to position. */ > + bsrl %ecx, %ecx > + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax > ret > > - .p2align 4 > -L(last_vec_x3): > - bsrl %eax, %eax > - addl $(VEC_SIZE * 3), %eax > - addq %rdi, %rax > - ret > + .p2align 4,, 8 > +L(more_2x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x0_dec) > > - .p2align 4 > -L(last_vec_x1_check): > - bsrl %eax, %eax > - subq $(VEC_SIZE * 3), %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $VEC_SIZE, %eax > - addq %rdi, %rax > - ret > + vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1) > > - .p2align 4 > -L(last_vec_x3_check): > - bsrl %eax, %eax > - subq $VEC_SIZE, %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $(VEC_SIZE * 3), %eax > - addq %rdi, %rax > - ret > + /* Need no matter what. 
*/ > + vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > > - .p2align 4 > -L(zero): > - xorl %eax, %eax > + subq $(VEC_SIZE * 4), %rdx > + ja L(more_4x_vec) > + > + cmpl $(VEC_SIZE * -1), %edx > + jle L(ret_vec_x2_test) > +L(last_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > + > + > + /* Need no matter what. */ > + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 3 + 1), %rax > + subq %rcx, %rax > + cmpq %rax, %rdi > + ja L(zero_1) > ret > > - .p2align 4 > -L(last_vec_or_less_aligned): > - movl %edx, %ecx > - > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > - > - movl $1, %edx > - /* Support rdx << 32. */ > - salq %cl, %rdx > - subq $1, %rdx > - > - kmovd %k1, %eax > - > - /* Remove the trailing bytes. */ > - andl %edx, %eax > - testl %eax, %eax > - jz L(zero) > - > - bsrl %eax, %eax > - addq %rdi, %rax > + .p2align 4,, 8 > +L(ret_vec_x2_test): > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 2 + 1), %rax > + subq %rcx, %rax > + cmpq %rax, %rdi > + ja L(zero_1) > ret > > - .p2align 4 > -L(last_vec_or_less): > - addl $VEC_SIZE, %edx > - > - /* Check for zero length. */ > - testl %edx, %edx > - jz L(zero) > - > - movl %edi, %ecx > - andl $(VEC_SIZE - 1), %ecx > - jz L(last_vec_or_less_aligned) > - > - movl %ecx, %esi > - movl %ecx, %r8d > - addl %edx, %esi > - andq $-VEC_SIZE, %rdi > + .p2align 4,, 8 > +L(ret_vec_x2): > + bsrl %ecx, %ecx > + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax > + ret > > - subl $VEC_SIZE, %esi > - ja L(last_vec_2x_aligned) > + .p2align 4,, 8 > +L(ret_vec_x3): > + bsrl %ecx, %ecx > + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > + ret > > - /* Check the last VEC. */ > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > - kmovd %k1, %eax > + .p2align 4,, 8 > +L(more_4x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > > - /* Remove the leading and trailing bytes. */ > - sarl %cl, %eax > - movl %edx, %ecx > + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > > - movl $1, %edx > - sall %cl, %edx > - subl $1, %edx > + testl %ecx, %ecx > + jnz L(ret_vec_x3) > > - andl %edx, %eax > - testl %eax, %eax > - jz L(zero) > + /* Check if near end before re-aligning (otherwise might do an > + unnecessary loop iteration). */ > + addq $-(VEC_SIZE * 4), %rax > + cmpq $(VEC_SIZE * 4), %rdx > + jbe L(last_4x_vec) > > - bsrl %eax, %eax > - addq %rdi, %rax > - addq %r8, %rax > - ret > + decq %rax > + andq $-(VEC_SIZE * 4), %rax > + movq %rdi, %rdx > + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because > + lengths that overflow can be valid and break the comparison. */ > + andq $-(VEC_SIZE * 4), %rdx > > .p2align 4 > -L(last_vec_2x_aligned): > - movl %esi, %ecx > - > - /* Check the last VEC. */ > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 > +L(loop_4x_vec): > + /* Store 1 were not-equals and 0 where equals in k1 (used to mask later > + on). */ > + vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1 > + > + /* VEC(2/3) will have zero-byte where we found a CHAR. */ > + vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2) > + vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3) > + vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4 > + > + /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where > + CHAR is found and VEC(2/3) have zero-byte where CHAR is found. */ > + vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z} > + vptestnmb %VEC(3), %VEC(3), %k2 > + > + /* Any 1s and we found CHAR. 
*/ > + kortestd %k2, %k4 > + jnz L(loop_end) > + > + addq $-(VEC_SIZE * 4), %rax > + cmpq %rdx, %rax > + jne L(loop_4x_vec) > + > + /* Need to re-adjust rdx / rax for L(last_4x_vec). */ > + subq $-(VEC_SIZE * 4), %rdx > + movq %rdx, %rax > + subl %edi, %edx > +L(last_4x_vec): > + > + /* Used no matter what. */ > + vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > > - movl $1, %edx > - sall %cl, %edx > - subl $1, %edx > + cmpl $(VEC_SIZE * 2), %edx > + jbe L(last_2x_vec) > > - kmovd %k1, %eax > + testl %ecx, %ecx > + jnz L(ret_vec_x0_dec) > > - /* Remove the trailing bytes. */ > - andl %edx, %eax > > - testl %eax, %eax > - jnz L(last_vec_x1) > + vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > > - /* Check the second last VEC. */ > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > + testl %ecx, %ecx > + jnz L(ret_vec_x1) > > - movl %r8d, %ecx > + /* Used no matter what. */ > + vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > > - kmovd %k1, %eax > + cmpl $(VEC_SIZE * 3), %edx > + ja L(last_vec) > > - /* Remove the leading bytes. Must use unsigned right shift for > - bsrl below. */ > - shrl %cl, %eax > - testl %eax, %eax > - jz L(zero) > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 2 + 1), %rax > + subq %rcx, %rax > + cmpq %rax, %rdi > + jbe L(ret_1) > + xorl %eax, %eax > +L(ret_1): > + ret > > - bsrl %eax, %eax > - addq %rdi, %rax > - addq %r8, %rax > + .p2align 4,, 6 > +L(loop_end): > + kmovd %k1, %ecx > + notl %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x0_end) > + > + vptestnmb %VEC(2), %VEC(2), %k0 > + kmovd %k0, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1_end) > + > + kmovd %k2, %ecx > + kmovd %k4, %esi > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > + then it won't affect the result in esi (VEC4). If ecx is non-zero > + then CHAR in VEC3 and bsrq will use that position. */ > + salq $32, %rcx > + orq %rsi, %rcx > + bsrq %rcx, %rcx > + addq %rcx, %rax > + ret > + .p2align 4,, 4 > +L(ret_vec_x0_end): > + addq $(VEC_SIZE), %rax > +L(ret_vec_x1_end): > + bsrl %ecx, %ecx > + leaq (VEC_SIZE * 2)(%rax, %rcx), %rax > ret > -END (__memrchr_evex) > + > +END(MEMRCHR) > #endif > -- > 2.34.1 > OK with the updated comments. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
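The L(loop_end) combine used here (salq $32; orq; bsrq in the EVEX code; the SSE2 version does the same with 16-bit masks and sall $16) packs the match masks of the last two vectors into one word so a single bit-scan-reverse locates the last match across both without an extra branch. A small C sketch of just that step; combine_last_two_vecs is a hypothetical helper, not glibc's:

  #include <stdint.h>

  /* base: lowest address covered by `low`; `low` covers base[0..31] and
     `high` covers base[32..63].  The caller guarantees at least one bit
     is set across the two masks (bsr/clz on zero is undefined).  */
  static const unsigned char *
  combine_last_two_vecs (const unsigned char *base, uint32_t low,
                         uint32_t high)
  {
    uint64_t both = ((uint64_t) high << 32) | low;	/* salq $32; orq.  */
    return base + (63 - __builtin_clzll (both));	/* bsrq; addq.  */
  }

If `high` is zero it cannot disturb the result in `low`, which is exactly the property the in-code comment ("If ecx (VEC3) is zero ... it won't affect the result in esi (VEC4)") relies on.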
* Re: [PATCH v6 5/8] x86: Optimize memrchr-evex.S 2022-06-07 18:21 ` H.J. Lu @ 2022-07-14 2:21 ` Sunil Pandey 0 siblings, 0 replies; 82+ messages in thread From: Sunil Pandey @ 2022-07-14 2:21 UTC (permalink / raw) To: H.J. Lu; +Cc: Noah Goldstein, GNU C Library On Tue, Jun 7, 2022 at 11:23 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > The new code: > > 1. prioritizes smaller user-arg lengths more. > > 2. optimizes target placement more carefully > > 3. reuses logic more > > 4. fixes up various inefficiencies in the logic. The biggest > > case here is the `lzcnt` logic for checking returns which > > saves either a branch or multiple instructions. > > > > The total code size saving is: 263 bytes > > Geometric Mean of all benchmarks New / Old: 0.755 > > > > Regressions: > > There are some regressions. Particularly where the length (user arg > > length) is large but the position of the match char is near the > > beginning of the string (in first VEC). This case has roughly a > > 20% regression. > > > > This is because the new logic gives the hot path for immediate matches > > to shorter lengths (the more common input). This case has roughly > > a 35% speedup. > > > > Full xcheck passes on x86_64. > > --- > > sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++------------ > > 1 file changed, 268 insertions(+), 271 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S > > index 0b99709c6b..2d7da06dfc 100644 > > --- a/sysdeps/x86_64/multiarch/memrchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S > > @@ -19,319 +19,316 @@ > > #if IS_IN (libc) > > > > # include <sysdep.h> > > +# include "evex256-vecs.h" > > +# if VEC_SIZE != 32 > > +# error "VEC_SIZE != 32 unimplemented" > > +# endif > > + > > +# ifndef MEMRCHR > > +# define MEMRCHR __memrchr_evex > > +# endif > > + > > +# define PAGE_SIZE 4096 > > +# define VECMATCH VEC(0) > > + > > + .section SECTION(.text), "ax", @progbits > > +ENTRY_P2ALIGN(MEMRCHR, 6) > > +# ifdef __ILP32__ > > + /* Clear upper bits. */ > > + and %RDX_LP, %RDX_LP > > +# else > > + test %RDX_LP, %RDX_LP > > +# endif > > + jz L(zero_0) > > + > > + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a > > + correct page cross check and 2) it correctly sets up end ptr to be > > + subtract by lzcnt aligned. */ > > + leaq -1(%rdi, %rdx), %rax > > + vpbroadcastb %esi, %VECMATCH > > + > > + /* Check if we can load 1x VEC without cross a page. */ > > + testl $(PAGE_SIZE - VEC_SIZE), %eax > > + jz L(page_cross) > > + > > + /* Don't use rax for pointer here because EVEX has better encoding with > > + offset % VEC_SIZE == 0. */ > > + vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > + > > + /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ > > + cmpq $VEC_SIZE, %rdx > > + ja L(more_1x_vec) > > +L(ret_vec_x0_test): > > + > > + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which > > + will guarantee edx (len) is less than it. */ > > + lzcntl %ecx, %ecx > > + cmpl %ecx, %edx > > + jle L(zero_0) > > + subq %rcx, %rax > > + ret > > > > -# define VMOVA vmovdqa64 > > - > > -# define YMMMATCH ymm16 > > - > > -# define VEC_SIZE 32 > > - > > - .section .text.evex,"ax",@progbits > > -ENTRY (__memrchr_evex) > > - /* Broadcast CHAR to YMMMATCH. 
*/ > > - vpbroadcastb %esi, %YMMMATCH > > - > > - sub $VEC_SIZE, %RDX_LP > > - jbe L(last_vec_or_less) > > - > > - add %RDX_LP, %RDI_LP > > - > > - /* Check the last VEC_SIZE bytes. */ > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > - kmovd %k1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x0) > > - > > - subq $(VEC_SIZE * 4), %rdi > > - movl %edi, %ecx > > - andl $(VEC_SIZE - 1), %ecx > > - jz L(aligned_more) > > - > > - /* Align data for aligned loads in the loop. */ > > - addq $VEC_SIZE, %rdi > > - addq $VEC_SIZE, %rdx > > - andq $-VEC_SIZE, %rdi > > - subq %rcx, %rdx > > - > > - .p2align 4 > > -L(aligned_more): > > - subq $(VEC_SIZE * 4), %rdx > > - jbe L(last_4x_vec_or_less) > > - > > - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time > > - since data is only aligned to VEC_SIZE. */ > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > > - kmovd %k1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3) > > - > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 > > - kmovd %k2, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x2) > > - > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 > > - kmovd %k3, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x1) > > - > > - vpcmpb $0, (%rdi), %YMMMATCH, %k4 > > - kmovd %k4, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x0) > > - > > - /* Align data to 4 * VEC_SIZE for loop with fewer branches. > > - There are some overlaps with above if data isn't aligned > > - to 4 * VEC_SIZE. */ > > - movl %edi, %ecx > > - andl $(VEC_SIZE * 4 - 1), %ecx > > - jz L(loop_4x_vec) > > - > > - addq $(VEC_SIZE * 4), %rdi > > - addq $(VEC_SIZE * 4), %rdx > > - andq $-(VEC_SIZE * 4), %rdi > > - subq %rcx, %rdx > > + /* Fits in aligning bytes of first cache line. */ > > +L(zero_0): > > + xorl %eax, %eax > > + ret > > > > - .p2align 4 > > -L(loop_4x_vec): > > - /* Compare 4 * VEC at a time forward. */ > > - subq $(VEC_SIZE * 4), %rdi > > - subq $(VEC_SIZE * 4), %rdx > > - jbe L(last_4x_vec_or_less) > > - > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 > > - kord %k1, %k2, %k5 > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 > > - > > - kord %k3, %k4, %k6 > > - kortestd %k5, %k6 > > - jz L(loop_4x_vec) > > - > > - /* There is a match. */ > > - kmovd %k4, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3) > > - > > - kmovd %k3, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x2) > > - > > - kmovd %k2, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x1) > > - > > - kmovd %k1, %eax > > - bsrl %eax, %eax > > - addq %rdi, %rax > > + .p2align 4,, 9 > > +L(ret_vec_x0_dec): > > + decq %rax > > +L(ret_vec_x0): > > + lzcntl %ecx, %ecx > > + subq %rcx, %rax > > ret > > > > - .p2align 4 > > -L(last_4x_vec_or_less): > > - addl $(VEC_SIZE * 4), %edx > > - cmpl $(VEC_SIZE * 2), %edx > > - jbe L(last_2x_vec) > > + .p2align 4,, 10 > > +L(more_1x_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0) > > > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > > - kmovd %k1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3) > > + /* Align rax (pointer to string). */ > > + andq $-VEC_SIZE, %rax > > > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 > > - kmovd %k2, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x2) > > + /* Recompute length after aligning. 
*/ > > + movq %rax, %rdx > > > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 > > - kmovd %k3, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x1_check) > > - cmpl $(VEC_SIZE * 3), %edx > > - jbe L(zero) > > + /* Need no matter what. */ > > + vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > > > - vpcmpb $0, (%rdi), %YMMMATCH, %k4 > > - kmovd %k4, %eax > > - testl %eax, %eax > > - jz L(zero) > > - bsrl %eax, %eax > > - subq $(VEC_SIZE * 4), %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addq %rdi, %rax > > - ret > > + subq %rdi, %rdx > > > > - .p2align 4 > > + cmpq $(VEC_SIZE * 2), %rdx > > + ja L(more_2x_vec) > > L(last_2x_vec): > > - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 > > - kmovd %k1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3_check) > > + > > + /* Must dec rax because L(ret_vec_x0_test) expects it. */ > > + decq %rax > > cmpl $VEC_SIZE, %edx > > - jbe L(zero) > > - > > - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 > > - kmovd %k1, %eax > > - testl %eax, %eax > > - jz L(zero) > > - bsrl %eax, %eax > > - subq $(VEC_SIZE * 2), %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addl $(VEC_SIZE * 2), %eax > > - addq %rdi, %rax > > + jbe L(ret_vec_x0_test) > > + > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0) > > + > > + /* Don't use rax for pointer here because EVEX has better encoding with > > + offset % VEC_SIZE == 0. */ > > + vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > + /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ > > + lzcntq %rcx, %rcx > > + cmpl %ecx, %edx > > + jle L(zero_0) > > + subq %rcx, %rax > > ret > > > > - .p2align 4 > > -L(last_vec_x0): > > - bsrl %eax, %eax > > - addq %rdi, %rax > > + /* Inexpensive place to put this regarding code size / target alignments > > + / ICache NLP. Necessary for 2-byte encoding of jump to page cross > > + case which in turn in necessary for hot path (len <= VEC_SIZE) to fit > is necessary? > > > + in first cache line. */ > > +L(page_cross): > > + movq %rax, %rsi > > + andq $-VEC_SIZE, %rsi > > + vpcmpb $0, (%rsi), %VECMATCH, %k0 > > + kmovd %k0, %r8d > > + /* Shift out negative alignment (because we are starting from endptr and > > + working backwards). */ > > + movl %eax, %ecx > > + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ > > + notl %ecx > > + shlxl %ecx, %r8d, %ecx > > + cmpq %rdi, %rsi > > + ja L(more_1x_vec) > > + lzcntl %ecx, %ecx > > + cmpl %ecx, %edx > > + jle L(zero_1) > > + subq %rcx, %rax > > ret > > > > - .p2align 4 > > -L(last_vec_x1): > > - bsrl %eax, %eax > > - addl $VEC_SIZE, %eax > > - addq %rdi, %rax > > + /* Continue creating zero labels that fit in aligning bytes and get > > + 2-byte encoding / are in the same cache line as condition. */ > > +L(zero_1): > > + xorl %eax, %eax > > ret > > > > - .p2align 4 > > -L(last_vec_x2): > > - bsrl %eax, %eax > > - addl $(VEC_SIZE * 2), %eax > > - addq %rdi, %rax > > + .p2align 4,, 8 > > +L(ret_vec_x1): > > + /* This will naturally add 32 to position. 
*/ > > + bsrl %ecx, %ecx > > + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax > > ret > > > > - .p2align 4 > > -L(last_vec_x3): > > - bsrl %eax, %eax > > - addl $(VEC_SIZE * 3), %eax > > - addq %rdi, %rax > > - ret > > + .p2align 4,, 8 > > +L(more_2x_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0_dec) > > > > - .p2align 4 > > -L(last_vec_x1_check): > > - bsrl %eax, %eax > > - subq $(VEC_SIZE * 3), %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addl $VEC_SIZE, %eax > > - addq %rdi, %rax > > - ret > > + vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x1) > > > > - .p2align 4 > > -L(last_vec_x3_check): > > - bsrl %eax, %eax > > - subq $VEC_SIZE, %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addl $(VEC_SIZE * 3), %eax > > - addq %rdi, %rax > > - ret > > + /* Need no matter what. */ > > + vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > > > - .p2align 4 > > -L(zero): > > - xorl %eax, %eax > > + subq $(VEC_SIZE * 4), %rdx > > + ja L(more_4x_vec) > > + > > + cmpl $(VEC_SIZE * -1), %edx > > + jle L(ret_vec_x2_test) > > +L(last_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x2) > > + > > + > > + /* Need no matter what. */ > > + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > + lzcntl %ecx, %ecx > > + subq $(VEC_SIZE * 3 + 1), %rax > > + subq %rcx, %rax > > + cmpq %rax, %rdi > > + ja L(zero_1) > > ret > > > > - .p2align 4 > > -L(last_vec_or_less_aligned): > > - movl %edx, %ecx > > - > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > - > > - movl $1, %edx > > - /* Support rdx << 32. */ > > - salq %cl, %rdx > > - subq $1, %rdx > > - > > - kmovd %k1, %eax > > - > > - /* Remove the trailing bytes. */ > > - andl %edx, %eax > > - testl %eax, %eax > > - jz L(zero) > > - > > - bsrl %eax, %eax > > - addq %rdi, %rax > > + .p2align 4,, 8 > > +L(ret_vec_x2_test): > > + lzcntl %ecx, %ecx > > + subq $(VEC_SIZE * 2 + 1), %rax > > + subq %rcx, %rax > > + cmpq %rax, %rdi > > + ja L(zero_1) > > ret > > > > - .p2align 4 > > -L(last_vec_or_less): > > - addl $VEC_SIZE, %edx > > - > > - /* Check for zero length. */ > > - testl %edx, %edx > > - jz L(zero) > > - > > - movl %edi, %ecx > > - andl $(VEC_SIZE - 1), %ecx > > - jz L(last_vec_or_less_aligned) > > - > > - movl %ecx, %esi > > - movl %ecx, %r8d > > - addl %edx, %esi > > - andq $-VEC_SIZE, %rdi > > + .p2align 4,, 8 > > +L(ret_vec_x2): > > + bsrl %ecx, %ecx > > + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax > > + ret > > > > - subl $VEC_SIZE, %esi > > - ja L(last_vec_2x_aligned) > > + .p2align 4,, 8 > > +L(ret_vec_x3): > > + bsrl %ecx, %ecx > > + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > > + ret > > > > - /* Check the last VEC. */ > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > - kmovd %k1, %eax > > + .p2align 4,, 8 > > +L(more_4x_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x2) > > > > - /* Remove the leading and trailing bytes. */ > > - sarl %cl, %eax > > - movl %edx, %ecx > > + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > > > - movl $1, %edx > > - sall %cl, %edx > > - subl $1, %edx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x3) > > > > - andl %edx, %eax > > - testl %eax, %eax > > - jz L(zero) > > + /* Check if near end before re-aligning (otherwise might do an > > + unnecessary loop iteration). 
*/ > > + addq $-(VEC_SIZE * 4), %rax > > + cmpq $(VEC_SIZE * 4), %rdx > > + jbe L(last_4x_vec) > > > > - bsrl %eax, %eax > > - addq %rdi, %rax > > - addq %r8, %rax > > - ret > > + decq %rax > > + andq $-(VEC_SIZE * 4), %rax > > + movq %rdi, %rdx > > + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because > > + lengths that overflow can be valid and break the comparison. */ > > + andq $-(VEC_SIZE * 4), %rdx > > > > .p2align 4 > > -L(last_vec_2x_aligned): > > - movl %esi, %ecx > > - > > - /* Check the last VEC. */ > > - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 > > +L(loop_4x_vec): > > + /* Store 1 were not-equals and 0 where equals in k1 (used to mask later > > + on). */ > > + vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1 > > + > > + /* VEC(2/3) will have zero-byte where we found a CHAR. */ > > + vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2) > > + vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3) > > + vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4 > > + > > + /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where > > + CHAR is found and VEC(2/3) have zero-byte where CHAR is found. */ > > + vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z} > > + vptestnmb %VEC(3), %VEC(3), %k2 > > + > > + /* Any 1s and we found CHAR. */ > > + kortestd %k2, %k4 > > + jnz L(loop_end) > > + > > + addq $-(VEC_SIZE * 4), %rax > > + cmpq %rdx, %rax > > + jne L(loop_4x_vec) > > + > > + /* Need to re-adjust rdx / rax for L(last_4x_vec). */ > > + subq $-(VEC_SIZE * 4), %rdx > > + movq %rdx, %rax > > + subl %edi, %edx > > +L(last_4x_vec): > > + > > + /* Used no matter what. */ > > + vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > > > - movl $1, %edx > > - sall %cl, %edx > > - subl $1, %edx > > + cmpl $(VEC_SIZE * 2), %edx > > + jbe L(last_2x_vec) > > > > - kmovd %k1, %eax > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0_dec) > > > > - /* Remove the trailing bytes. */ > > - andl %edx, %eax > > > > - testl %eax, %eax > > - jnz L(last_vec_x1) > > + vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > > > - /* Check the second last VEC. */ > > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > > + testl %ecx, %ecx > > + jnz L(ret_vec_x1) > > > > - movl %r8d, %ecx > > + /* Used no matter what. */ > > + vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 > > + kmovd %k0, %ecx > > > > - kmovd %k1, %eax > > + cmpl $(VEC_SIZE * 3), %edx > > + ja L(last_vec) > > > > - /* Remove the leading bytes. Must use unsigned right shift for > > - bsrl below. */ > > - shrl %cl, %eax > > - testl %eax, %eax > > - jz L(zero) > > + lzcntl %ecx, %ecx > > + subq $(VEC_SIZE * 2 + 1), %rax > > + subq %rcx, %rax > > + cmpq %rax, %rdi > > + jbe L(ret_1) > > + xorl %eax, %eax > > +L(ret_1): > > + ret > > > > - bsrl %eax, %eax > > - addq %rdi, %rax > > - addq %r8, %rax > > + .p2align 4,, 6 > > +L(loop_end): > > + kmovd %k1, %ecx > > + notl %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0_end) > > + > > + vptestnmb %VEC(2), %VEC(2), %k0 > > + kmovd %k0, %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x1_end) > > + > > + kmovd %k2, %ecx > > + kmovd %k4, %esi > > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > > + then it won't affect the result in esi (VEC4). If ecx is non-zero > > + then CHAR in VEC3 and bsrq will use that position. 
*/ > > + salq $32, %rcx > > + orq %rsi, %rcx > > + bsrq %rcx, %rcx > > + addq %rcx, %rax > > + ret > > + .p2align 4,, 4 > > +L(ret_vec_x0_end): > > + addq $(VEC_SIZE), %rax > > +L(ret_vec_x1_end): > > + bsrl %ecx, %ecx > > + leaq (VEC_SIZE * 2)(%rax, %rcx), %rax > > ret > > -END (__memrchr_evex) > > + > > +END(MEMRCHR) > > #endif > > -- > > 2.34.1 > > > > OK with the updated comments. > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > Thanks. > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil ^ permalink raw reply [flat|nested] 82+ messages in thread
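[A rough C model of the lzcnt return trick discussed in this thread; it is a sketch with illustrative names, not the glibc source. Because the end pointer is pre-decremented by one, the address of the highest match in a 32-bit compare mask is simply (end - 1) - lzcnt(mask), and because lzcnt of a zero mask is 32, a single length comparison also covers the no-match case:

    #include <stddef.h>
    #include <stdint.h>

    /* lzcnt semantics: the leading-zero count of 0 is the operand
       width, unlike bsr, which leaves its destination undefined.  */
    static inline int
    lzcnt32 (uint32_t x)
    {
      return x ? __builtin_clz (x) : 32;
    }

    /* Sketch of L(ret_vec_x0_test) for len <= 32.  Bit i of mask is
       assumed set iff the byte at s + len - 32 + i equals CHAR (the
       vector compare covers the 32 bytes ending at s + len, so low
       bits may describe bytes before s).  */
    static const char *
    ret_vec_x0_test (const char *s, size_t len, uint32_t mask)
    {
      const char *end_m1 = s + len - 1;
      int lz = lzcnt32 (mask);
      /* lz == 32 (no match) and lz >= len (match before s) both land
         here, mirroring the single cmpl/jle pair in the patch.  */
      if ((int) len <= lz)
        return NULL;
      return end_m1 - lz;
    }

This is why the patch can drop the old bsrl/addq pairs: one lzcnt plus one subtraction handles the position, the bounds check, and the zero case together.]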
* [PATCH v6 6/8] x86: Optimize memrchr-avx2.S 2022-06-07 4:11 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (3 preceding siblings ...) 2022-06-07 4:11 ` [PATCH v6 5/8] x86: Optimize memrchr-evex.S Noah Goldstein @ 2022-06-07 4:11 ` Noah Goldstein 2022-06-07 18:17 ` H.J. Lu 2022-06-07 4:11 ` [PATCH v6 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein ` (2 subsequent siblings) 7 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:11 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller user-arg lengths more. 2. optimizes target placement more carefully 3. reuses logic more 4. fixes up various inefficiencies in the logic. The biggest case here is the `lzcnt` logic for checking returns which saves either a branch or multiple instructions. The total code size saving is: 306 bytes Geometric Mean of all benchmarks New / Old: 0.760 Regressions: There are some regressions. Particularly where the length (user arg length) is large but the position of the match char is near the beginning of the string (in first VEC). This case has roughly a 10-20% regression. This is because the new logic gives the hot path for immediate matches to shorter lengths (the more common input). This case has roughly a 15-45% speedup. Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memrchr-avx2.S | 534 ++++++++++---------- 2 files changed, 257 insertions(+), 278 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S index cea2d2a72d..5e9beeeef2 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMRCHR __memrchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index ba2ce7cb03..bea4528068 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -21,340 +21,318 @@ # include <sysdep.h> # ifndef MEMRCHR -# define MEMRCHR __memrchr_avx2 +# define MEMRCHR __memrchr_avx2 # endif # ifndef VZEROUPPER -# define VZEROUPPER vzeroupper +# define VZEROUPPER vzeroupper # endif # ifndef SECTION # define SECTION(p) p##.avx # endif -# define VEC_SIZE 32 +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + .section SECTION(.text), "ax", @progbits +ENTRY(MEMRCHR) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) - .section SECTION(.text),"ax",@progbits -ENTRY (MEMRCHR) - /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 - vpbroadcastb %xmm0, %ymm0 - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP - - /* Check the last VEC_SIZE bytes. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x0) + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up end ptr to be + subtract by lzcnt aligned. */ + leaq -1(%rdx, %rdi), %rax - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) + vpbroadcastb %xmm0, %ymm0 - /* Align data for aligned loads in the loop. 
*/ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) + +L(ret_vec_x0_test): + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which + will gurantee edx (len) is less than it. */ + lzcntl %ecx, %ecx + + /* Hoist vzeroupper (not great for RTM) to save code size. This allows + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vmovdqa (%rdi), %ymm1 - vmovdqa VEC_SIZE(%rdi), %ymm2 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 - - vpcmpeqb %ymm1, %ymm0, %ymm1 - vpcmpeqb %ymm2, %ymm0, %ymm2 - vpcmpeqb %ymm3, %ymm0, %ymm3 - vpcmpeqb %ymm4, %ymm0, %ymm4 - - vpor %ymm1, %ymm2, %ymm5 - vpor %ymm3, %ymm4, %ymm6 - vpor %ymm5, %ymm6, %ymm5 - - vpmovmskb %ymm5, %eax - testl %eax, %eax - jz L(loop_4x_vec) - - /* There is a match. */ - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpmovmskb %ymm1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax L(return_vzeroupper): ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) - - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Align rax (string pointer). 
*/ + andq $-VEC_SIZE, %rax + + /* Recompute remaining length after aligning. */ + movq %rax, %rdx + /* Need this comparison next no matter what. */ + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 + subq %rdi, %rdx + decq %rax + vpmovmskb %ymm1, %ecx + /* Fall through for short (hotter than length). */ + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which in turn in necessary for hot path (len <= VEC_SIZE) to fit + in first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpeqb (%rsi), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %r8d + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ + notl %r8d + shlxl %r8d, %ecx, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret + .p2align 4,, 11 +L(ret_vec_x1): + /* This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + subq %rcx, %rax VZEROUPPER_RETURN + .p2align 4,, 10 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - VZEROUPPER_RETURN + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - .p2align 4 -L(zero): - xorl %eax, %eax - VZEROUPPER_RETURN + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) + +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - .p2align 4 -L(null): + /* First in aligning bytes. 
*/ +L(zero_2): xorl %eax, %eax ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx + .p2align 4,, 4 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - vpcmpeqb (%rdi), %ymm0, %ymm1 - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx + .p2align 4,, 11 +L(ret_vec_x2): + /* ecx must be non-zero. */ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - vpmovmskb %ymm1, %eax + .p2align 4,, 14 +L(ret_vec_x3): + /* ecx must be non-zero. */ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Check for zero length. */ - testl %edx, %edx - jz L(null) + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) + testl %ecx, %ecx + jnz L(ret_vec_x3) - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + /* Check if near end before re-aligning (otherwise might do an + unnecissary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + /* Align rax to (VEC_SIZE - 1). */ + orq $(VEC_SIZE * 4 - 1), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + orq $(VEC_SIZE * 4 - 1), %rdx - /* Check the last VEC. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + .p2align 4 +L(loop_4x_vec): + /* Need this comparison next no matter what. */ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 - movl $1, %edx - sall %cl, %edx - subl $1, %edx + vpor %ymm1, %ymm2, %ymm2 + vpor %ymm3, %ymm4, %ymm4 + vpor %ymm2, %ymm4, %ymm4 + vpmovmskb %ymm4, %esi - andl %edx, %eax - testl %eax, %eax - jz L(zero) + testl %esi, %esi + jnz L(loop_end) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - VZEROUPPER_RETURN + addq $(VEC_SIZE * -4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) - .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx + subl %edi, %edx + incl %edx - /* Check the last VEC. */ - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 +L(last_4x_vec): + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - vpmovmskb %ymm1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_end) - /* Remove the trailing bytes. */ - andl %edx, %eax + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) - testl %eax, %eax - jnz L(last_vec_x1) + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - /* Check the second last VEC. 
*/ - vpcmpeqb (%rdi), %ymm0, %ymm1 + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) + + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret0) + xorl %eax, %eax +L(ret0): + ret - movl %r8d, %ecx - vpmovmskb %ymm1, %eax + .p2align 4 +L(loop_end): + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vpmovmskb %ymm2, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + vpmovmskb %ymm3, %ecx + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. */ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + .p2align 4,, 4 +L(ret_vec_x1_end): + /* 64-bit version will automatically add 32 (VEC_SIZE). */ + lzcntq %rcx, %rcx + subq %rcx, %rax + VZEROUPPER_RETURN - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 4 +L(ret_vec_x0_end): + lzcntl %ecx, %ecx + subq %rcx, %rax VZEROUPPER_RETURN -END (MEMRCHR) + + /* 2 bytes until next cache line. */ +END(MEMRCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
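[The page-cross path in the patch above leans on two identities that are easier to see in C: -x == ~(x - 1), so with endptr - 1 already in a register a single not produces -endptr, and shlx masks its shift count to the operand width, so only the low five bits matter. A minimal sketch, assuming the aligned block containing end - 1 also covers the start of the string (the asm branches back to L(more_1x_vec) otherwise); names are illustrative:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of L(page_cross): load the aligned 32-byte block holding
       end - 1, then shift the match mask left so the byte at end - 1
       lands in bit 31 and lzcnt again counts backwards from the end.  */
    static const char *
    page_cross_tail (const char *s, size_t len, int ch)
    {
      const char *end_m1 = s + len - 1;
      const unsigned char *base =
        (const unsigned char *) ((uintptr_t) end_m1 & ~(uintptr_t) 31);

      uint32_t mask = 0;        /* stand-in for vpcmpeqb + vpmovmskb */
      for (int i = 0; i < 32; i++)
        mask |= (uint32_t) (base[i] == (unsigned char) ch) << i;

      /* ~end_m1 & 31 == -end & 31: shift out the bytes at end and
         beyond (the "negative alignment" in the patch comment).  */
      mask <<= (unsigned) (~(uintptr_t) end_m1 & 31);

      int lz = mask ? __builtin_clz (mask) : 32;
      if ((int) len <= lz)      /* also rejects matches before s */
        return NULL;
      return end_m1 - lz;
    }]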
* Re: [PATCH v6 6/8] x86: Optimize memrchr-avx2.S 2022-06-07 4:11 ` [PATCH v6 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein @ 2022-06-07 18:17 ` H.J. Lu 2022-07-14 2:26 ` Sunil Pandey 0 siblings, 1 reply; 82+ messages in thread From: H.J. Lu @ 2022-06-07 18:17 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The new code: > 1. prioritizes smaller user-arg lengths more. > 2. optimizes target placement more carefully > 3. reuses logic more > 4. fixes up various inefficiencies in the logic. The biggest > case here is the `lzcnt` logic for checking returns which > saves either a branch or multiple instructions. > > The total code size saving is: 306 bytes > Geometric Mean of all benchmarks New / Old: 0.760 > > Regressions: > There are some regressions. Particularly where the length (user arg > length) is large but the position of the match char is near the > beginning of the string (in first VEC). This case has roughly a > 10-20% regression. > > This is because the new logic gives the hot path for immediate matches > to shorter lengths (the more common input). This case has roughly > a 15-45% speedup. > > Full xcheck passes on x86_64. > --- > sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + > sysdeps/x86_64/multiarch/memrchr-avx2.S | 534 ++++++++++---------- > 2 files changed, 257 insertions(+), 278 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > index cea2d2a72d..5e9beeeef2 100644 > --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > @@ -2,6 +2,7 @@ > # define MEMRCHR __memrchr_avx2_rtm > #endif > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S > index ba2ce7cb03..bea4528068 100644 > --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S > @@ -21,340 +21,318 @@ > # include <sysdep.h> > > # ifndef MEMRCHR > -# define MEMRCHR __memrchr_avx2 > +# define MEMRCHR __memrchr_avx2 > # endif > > # ifndef VZEROUPPER > -# define VZEROUPPER vzeroupper > +# define VZEROUPPER vzeroupper > # endif > > # ifndef SECTION > # define SECTION(p) p##.avx > # endif > > -# define VEC_SIZE 32 > +# define VEC_SIZE 32 > +# define PAGE_SIZE 4096 > + .section SECTION(.text), "ax", @progbits > +ENTRY(MEMRCHR) > +# ifdef __ILP32__ > + /* Clear upper bits. */ > + and %RDX_LP, %RDX_LP > +# else > + test %RDX_LP, %RDX_LP > +# endif > + jz L(zero_0) > > - .section SECTION(.text),"ax",@progbits > -ENTRY (MEMRCHR) > - /* Broadcast CHAR to YMM0. */ > vmovd %esi, %xmm0 > - vpbroadcastb %xmm0, %ymm0 > - > - sub $VEC_SIZE, %RDX_LP > - jbe L(last_vec_or_less) > - > - add %RDX_LP, %RDI_LP > - > - /* Check the last VEC_SIZE bytes. */ > - vpcmpeqb (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x0) > + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a > + correct page cross check and 2) it correctly sets up end ptr to be > + subtract by lzcnt aligned. */ > + leaq -1(%rdx, %rdi), %rax > > - subq $(VEC_SIZE * 4), %rdi > - movl %edi, %ecx > - andl $(VEC_SIZE - 1), %ecx > - jz L(aligned_more) > + vpbroadcastb %xmm0, %ymm0 > > - /* Align data for aligned loads in the loop. 
*/ > - addq $VEC_SIZE, %rdi > - addq $VEC_SIZE, %rdx > - andq $-VEC_SIZE, %rdi > - subq %rcx, %rdx > + /* Check if we can load 1x VEC without cross a page. */ > + testl $(PAGE_SIZE - VEC_SIZE), %eax > + jz L(page_cross) > + > + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + cmpq $VEC_SIZE, %rdx > + ja L(more_1x_vec) > + > +L(ret_vec_x0_test): > + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which > + will gurantee edx (len) is less than it. */ > + lzcntl %ecx, %ecx > + > + /* Hoist vzeroupper (not great for RTM) to save code size. This allows > + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ > + COND_VZEROUPPER > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > > - .p2align 4 > -L(aligned_more): > - subq $(VEC_SIZE * 4), %rdx > - jbe L(last_4x_vec_or_less) > - > - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time > - since data is only aligned to VEC_SIZE. */ > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > - vpmovmskb %ymm2, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > - vpmovmskb %ymm3, %eax > - testl %eax, %eax > - jnz L(last_vec_x1) > - > - vpcmpeqb (%rdi), %ymm0, %ymm4 > - vpmovmskb %ymm4, %eax > - testl %eax, %eax > - jnz L(last_vec_x0) > - > - /* Align data to 4 * VEC_SIZE for loop with fewer branches. > - There are some overlaps with above if data isn't aligned > - to 4 * VEC_SIZE. */ > - movl %edi, %ecx > - andl $(VEC_SIZE * 4 - 1), %ecx > - jz L(loop_4x_vec) > - > - addq $(VEC_SIZE * 4), %rdi > - addq $(VEC_SIZE * 4), %rdx > - andq $-(VEC_SIZE * 4), %rdi > - subq %rcx, %rdx > + /* Fits in aligning bytes of first cache line. */ > +L(zero_0): > + xorl %eax, %eax > + ret > > - .p2align 4 > -L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > - subq $(VEC_SIZE * 4), %rdi > - subq $(VEC_SIZE * 4), %rdx > - jbe L(last_4x_vec_or_less) > - > - vmovdqa (%rdi), %ymm1 > - vmovdqa VEC_SIZE(%rdi), %ymm2 > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > - > - vpcmpeqb %ymm1, %ymm0, %ymm1 > - vpcmpeqb %ymm2, %ymm0, %ymm2 > - vpcmpeqb %ymm3, %ymm0, %ymm3 > - vpcmpeqb %ymm4, %ymm0, %ymm4 > - > - vpor %ymm1, %ymm2, %ymm5 > - vpor %ymm3, %ymm4, %ymm6 > - vpor %ymm5, %ymm6, %ymm5 > - > - vpmovmskb %ymm5, %eax > - testl %eax, %eax > - jz L(loop_4x_vec) > - > - /* There is a match. 
*/ > - vpmovmskb %ymm4, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpmovmskb %ymm3, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpmovmskb %ymm2, %eax > - testl %eax, %eax > - jnz L(last_vec_x1) > - > - vpmovmskb %ymm1, %eax > - bsrl %eax, %eax > - addq %rdi, %rax > + .p2align 4,, 9 > +L(ret_vec_x0): > + lzcntl %ecx, %ecx > + subq %rcx, %rax > L(return_vzeroupper): > ZERO_UPPER_VEC_REGISTERS_RETURN > > - .p2align 4 > -L(last_4x_vec_or_less): > - addl $(VEC_SIZE * 4), %edx > - cmpl $(VEC_SIZE * 2), %edx > - jbe L(last_2x_vec) > - > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > - vpmovmskb %ymm2, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > - vpmovmskb %ymm3, %eax > - testl %eax, %eax > - jnz L(last_vec_x1_check) > - cmpl $(VEC_SIZE * 3), %edx > - jbe L(zero) > - > - vpcmpeqb (%rdi), %ymm0, %ymm4 > - vpmovmskb %ymm4, %eax > - testl %eax, %eax > - jz L(zero) > - bsrl %eax, %eax > - subq $(VEC_SIZE * 4), %rdx > - addq %rax, %rdx > - jl L(zero) > - addq %rdi, %rax > - VZEROUPPER_RETURN > - > - .p2align 4 > + .p2align 4,, 10 > +L(more_1x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > + > + /* Align rax (string pointer). */ > + andq $-VEC_SIZE, %rax > + > + /* Recompute remaining length after aligning. */ > + movq %rax, %rdx > + /* Need this comparison next no matter what. */ > + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 > + subq %rdi, %rdx > + decq %rax > + vpmovmskb %ymm1, %ecx > + /* Fall through for short (hotter than length). */ > + cmpq $(VEC_SIZE * 2), %rdx > + ja L(more_2x_vec) > L(last_2x_vec): > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3_check) > cmpl $VEC_SIZE, %edx > - jbe L(zero) > - > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jz L(zero) > - bsrl %eax, %eax > - subq $(VEC_SIZE * 2), %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $(VEC_SIZE * 2), %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(last_vec_x0): > - bsrl %eax, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > + jbe L(ret_vec_x0_test) > + > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > + > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + /* 64-bit lzcnt. This will naturally add 32 to position. */ > + lzcntq %rcx, %rcx > + COND_VZEROUPPER > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > > - .p2align 4 > -L(last_vec_x1): > - bsrl %eax, %eax > - addl $VEC_SIZE, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > > - .p2align 4 > -L(last_vec_x2): > - bsrl %eax, %eax > - addl $(VEC_SIZE * 2), %eax > - addq %rdi, %rax > + /* Inexpensive place to put this regarding code size / target alignments > + / ICache NLP. Necessary for 2-byte encoding of jump to page cross > + case which in turn in necessary for hot path (len <= VEC_SIZE) to fit is necessary? > + in first cache line. */ > +L(page_cross): > + movq %rax, %rsi > + andq $-VEC_SIZE, %rsi > + vpcmpeqb (%rsi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + /* Shift out negative alignment (because we are starting from endptr and > + working backwards). */ > + movl %eax, %r8d > + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). 
*/ > + notl %r8d > + shlxl %r8d, %ecx, %ecx > + cmpq %rdi, %rsi > + ja L(more_1x_vec) > + lzcntl %ecx, %ecx > + COND_VZEROUPPER > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > + .p2align 4,, 11 > +L(ret_vec_x1): > + /* This will naturally add 32 to position. */ > + lzcntq %rcx, %rcx > + subq %rcx, %rax > VZEROUPPER_RETURN > + .p2align 4,, 10 > +L(more_2x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > > - .p2align 4 > -L(last_vec_x3): > - bsrl %eax, %eax > - addl $(VEC_SIZE * 3), %eax > - addq %rdi, %rax > - ret > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1) > > - .p2align 4 > -L(last_vec_x1_check): > - bsrl %eax, %eax > - subq $(VEC_SIZE * 3), %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $VEC_SIZE, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > > - .p2align 4 > -L(last_vec_x3_check): > - bsrl %eax, %eax > - subq $VEC_SIZE, %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $(VEC_SIZE * 3), %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > + /* Needed no matter what. */ > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - .p2align 4 > -L(zero): > - xorl %eax, %eax > - VZEROUPPER_RETURN > + subq $(VEC_SIZE * 4), %rdx > + ja L(more_4x_vec) > + > + cmpl $(VEC_SIZE * -1), %edx > + jle L(ret_vec_x2_test) > + > +L(last_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > + > + /* Needed no matter what. */ > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 3), %rax > + COND_VZEROUPPER > + subq %rcx, %rax > + cmpq %rax, %rdi > + ja L(zero_2) > + ret > > - .p2align 4 > -L(null): > + /* First in aligning bytes. */ > +L(zero_2): > xorl %eax, %eax > ret > > - .p2align 4 > -L(last_vec_or_less_aligned): > - movl %edx, %ecx > + .p2align 4,, 4 > +L(ret_vec_x2_test): > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 2), %rax > + COND_VZEROUPPER > + subq %rcx, %rax > + cmpq %rax, %rdi > + ja L(zero_2) > + ret > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > - movl $1, %edx > - /* Support rdx << 32. */ > - salq %cl, %rdx > - subq $1, %rdx > + .p2align 4,, 11 > +L(ret_vec_x2): > + /* ecx must be non-zero. */ > + bsrl %ecx, %ecx > + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax > + VZEROUPPER_RETURN > > - vpmovmskb %ymm1, %eax > + .p2align 4,, 14 > +L(ret_vec_x3): > + /* ecx must be non-zero. */ > + bsrl %ecx, %ecx > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > + VZEROUPPER_RETURN > > - /* Remove the trailing bytes. */ > - andl %edx, %eax > - testl %eax, %eax > - jz L(zero) > > - bsrl %eax, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > > .p2align 4 > -L(last_vec_or_less): > - addl $VEC_SIZE, %edx > +L(more_4x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > > - /* Check for zero length. */ > - testl %edx, %edx > - jz L(null) > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - movl %edi, %ecx > - andl $(VEC_SIZE - 1), %ecx > - jz L(last_vec_or_less_aligned) > + testl %ecx, %ecx > + jnz L(ret_vec_x3) > > - movl %ecx, %esi > - movl %ecx, %r8d > - addl %edx, %esi > - andq $-VEC_SIZE, %rdi > + /* Check if near end before re-aligning (otherwise might do an > + unnecissary loop iteration). */ > + addq $-(VEC_SIZE * 4), %rax > + cmpq $(VEC_SIZE * 4), %rdx > + jbe L(last_4x_vec) > > - subl $VEC_SIZE, %esi > - ja L(last_vec_2x_aligned) > + /* Align rax to (VEC_SIZE - 1). */ > + orq $(VEC_SIZE * 4 - 1), %rax > + movq %rdi, %rdx > + /* Get endptr for loop in rdx. 
NB: Can't just do while rax > rdi because > + lengths that overflow can be valid and break the comparison. */ > + orq $(VEC_SIZE * 4 - 1), %rdx > > - /* Check the last VEC. */ > - vpcmpeqb (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - > - /* Remove the leading and trailing bytes. */ > - sarl %cl, %eax > - movl %edx, %ecx > + .p2align 4 > +L(loop_4x_vec): > + /* Need this comparison next no matter what. */ > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 > > - movl $1, %edx > - sall %cl, %edx > - subl $1, %edx > + vpor %ymm1, %ymm2, %ymm2 > + vpor %ymm3, %ymm4, %ymm4 > + vpor %ymm2, %ymm4, %ymm4 > + vpmovmskb %ymm4, %esi > > - andl %edx, %eax > - testl %eax, %eax > - jz L(zero) > + testl %esi, %esi > + jnz L(loop_end) > > - bsrl %eax, %eax > - addq %rdi, %rax > - addq %r8, %rax > - VZEROUPPER_RETURN > + addq $(VEC_SIZE * -4), %rax > + cmpq %rdx, %rax > + jne L(loop_4x_vec) > > - .p2align 4 > -L(last_vec_2x_aligned): > - movl %esi, %ecx > + subl %edi, %edx > + incl %edx > > - /* Check the last VEC. */ > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 > +L(last_4x_vec): > + /* Used no matter what. */ > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - movl $1, %edx > - sall %cl, %edx > - subl $1, %edx > + cmpl $(VEC_SIZE * 2), %edx > + jbe L(last_2x_vec) > > - vpmovmskb %ymm1, %eax > + testl %ecx, %ecx > + jnz L(ret_vec_x0_end) > > - /* Remove the trailing bytes. */ > - andl %edx, %eax > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1_end) > > - testl %eax, %eax > - jnz L(last_vec_x1) > + /* Used no matter what. */ > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - /* Check the second last VEC. */ > - vpcmpeqb (%rdi), %ymm0, %ymm1 > + cmpl $(VEC_SIZE * 3), %edx > + ja L(last_vec) > + > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 2), %rax > + COND_VZEROUPPER > + subq %rcx, %rax > + cmpq %rax, %rdi > + jbe L(ret0) > + xorl %eax, %eax > +L(ret0): > + ret > > - movl %r8d, %ecx > > - vpmovmskb %ymm1, %eax > + .p2align 4 > +L(loop_end): > + vpmovmskb %ymm1, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x0_end) > + > + vpmovmskb %ymm2, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1_end) > + > + vpmovmskb %ymm3, %ecx > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > + then it won't affect the result in esi (VEC4). If ecx is non-zero > + then CHAR in VEC3 and bsrq will use that position. */ > + salq $32, %rcx > + orq %rsi, %rcx > + bsrq %rcx, %rcx > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > + VZEROUPPER_RETURN > > - /* Remove the leading bytes. Must use unsigned right shift for > - bsrl below. */ > - shrl %cl, %eax > - testl %eax, %eax > - jz L(zero) > + .p2align 4,, 4 > +L(ret_vec_x1_end): > + /* 64-bit version will automatically add 32 (VEC_SIZE). */ > + lzcntq %rcx, %rcx > + subq %rcx, %rax > + VZEROUPPER_RETURN > > - bsrl %eax, %eax > - addq %rdi, %rax > - addq %r8, %rax > + .p2align 4,, 4 > +L(ret_vec_x0_end): > + lzcntl %ecx, %ecx > + subq %rcx, %rax > VZEROUPPER_RETURN > -END (MEMRCHR) > + > + /* 2 bytes until next cache line. */ > +END(MEMRCHR) > #endif > -- > 2.34.1 > OK with the updated comments. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
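[The salq/orq/bsrq sequence at L(loop_end) packs the last two match masks into one 64-bit scan. A short C sketch of the idea (illustrative names; the constants are the AVX2 ones, where the loop's vectors are loaded at rax - (VEC_SIZE * n - 1)):

    #include <stdint.h>

    /* VEC3's mask fills the high half and VEC4's the low half, so a
       single bsr finds the highest match across both vectors; if VEC3
       had no match, its half is zero and bsr falls through to VEC4's
       bits.  Reached only after the loop's test proved a match exists,
       so `both' is non-zero here.  */
    static const char *
    combine_last_two (const char *rax, uint32_t mask3, uint32_t mask4)
    {
      uint64_t both = ((uint64_t) mask3 << 32) | mask4;
      int pos = 63 - __builtin_clzll (both);   /* bsrq */
      /* Byte `pos' of the combined mask sits at rax - (4 * 32 - 1) + pos,
         matching the final leaq in the patch.  */
      return rax + (1 - 4 * 32) + pos;
    }]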
* Re: [PATCH v6 6/8] x86: Optimize memrchr-avx2.S 2022-06-07 18:17 ` H.J. Lu @ 2022-07-14 2:26 ` Sunil Pandey 2022-07-14 2:43 ` Noah Goldstein 0 siblings, 1 reply; 82+ messages in thread From: Sunil Pandey @ 2022-07-14 2:26 UTC (permalink / raw) To: H.J. Lu; +Cc: Noah Goldstein, GNU C Library On Tue, Jun 7, 2022 at 11:18 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > The new code: > > 1. prioritizes smaller user-arg lengths more. > > 2. optimizes target placement more carefully > > 3. reuses logic more > > 4. fixes up various inefficiencies in the logic. The biggest > > case here is the `lzcnt` logic for checking returns which > > saves either a branch or multiple instructions. > > > > The total code size saving is: 306 bytes > > Geometric Mean of all benchmarks New / Old: 0.760 > > > > Regressions: > > There are some regressions. Particularly where the length (user arg > > length) is large but the position of the match char is near the > > beginning of the string (in first VEC). This case has roughly a > > 10-20% regression. > > > > This is because the new logic gives the hot path for immediate matches > > to shorter lengths (the more common input). This case has roughly > > a 15-45% speedup. > > > > Full xcheck passes on x86_64. > > --- > > sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + > > sysdeps/x86_64/multiarch/memrchr-avx2.S | 534 ++++++++++---------- > > 2 files changed, 257 insertions(+), 278 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > > index cea2d2a72d..5e9beeeef2 100644 > > --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > > @@ -2,6 +2,7 @@ > > # define MEMRCHR __memrchr_avx2_rtm > > #endif > > > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S > > index ba2ce7cb03..bea4528068 100644 > > --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S > > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S > > @@ -21,340 +21,318 @@ > > # include <sysdep.h> > > > > # ifndef MEMRCHR > > -# define MEMRCHR __memrchr_avx2 > > +# define MEMRCHR __memrchr_avx2 > > # endif > > > > # ifndef VZEROUPPER > > -# define VZEROUPPER vzeroupper > > +# define VZEROUPPER vzeroupper > > # endif > > > > # ifndef SECTION > > # define SECTION(p) p##.avx > > # endif > > > > -# define VEC_SIZE 32 > > +# define VEC_SIZE 32 > > +# define PAGE_SIZE 4096 > > + .section SECTION(.text), "ax", @progbits > > +ENTRY(MEMRCHR) > > +# ifdef __ILP32__ > > + /* Clear upper bits. */ > > + and %RDX_LP, %RDX_LP > > +# else > > + test %RDX_LP, %RDX_LP > > +# endif > > + jz L(zero_0) > > > > - .section SECTION(.text),"ax",@progbits > > -ENTRY (MEMRCHR) > > - /* Broadcast CHAR to YMM0. */ > > vmovd %esi, %xmm0 > > - vpbroadcastb %xmm0, %ymm0 > > - > > - sub $VEC_SIZE, %RDX_LP > > - jbe L(last_vec_or_less) > > - > > - add %RDX_LP, %RDI_LP > > - > > - /* Check the last VEC_SIZE bytes. */ > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x0) > > + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a > > + correct page cross check and 2) it correctly sets up end ptr to be > > + subtract by lzcnt aligned. 
*/ > > + leaq -1(%rdx, %rdi), %rax > > > > - subq $(VEC_SIZE * 4), %rdi > > - movl %edi, %ecx > > - andl $(VEC_SIZE - 1), %ecx > > - jz L(aligned_more) > > + vpbroadcastb %xmm0, %ymm0 > > > > - /* Align data for aligned loads in the loop. */ > > - addq $VEC_SIZE, %rdi > > - addq $VEC_SIZE, %rdx > > - andq $-VEC_SIZE, %rdi > > - subq %rcx, %rdx > > + /* Check if we can load 1x VEC without cross a page. */ > > + testl $(PAGE_SIZE - VEC_SIZE), %eax > > + jz L(page_cross) > > + > > + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > + cmpq $VEC_SIZE, %rdx > > + ja L(more_1x_vec) > > + > > +L(ret_vec_x0_test): > > + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which > > + will gurantee edx (len) is less than it. */ > > + lzcntl %ecx, %ecx > > + > > + /* Hoist vzeroupper (not great for RTM) to save code size. This allows > > + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ > > + COND_VZEROUPPER > > + cmpl %ecx, %edx > > + jle L(zero_0) > > + subq %rcx, %rax > > + ret > > > > - .p2align 4 > > -L(aligned_more): > > - subq $(VEC_SIZE * 4), %rdx > > - jbe L(last_4x_vec_or_less) > > - > > - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time > > - since data is only aligned to VEC_SIZE. */ > > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3) > > - > > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > > - vpmovmskb %ymm2, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x2) > > - > > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > > - vpmovmskb %ymm3, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x1) > > - > > - vpcmpeqb (%rdi), %ymm0, %ymm4 > > - vpmovmskb %ymm4, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x0) > > - > > - /* Align data to 4 * VEC_SIZE for loop with fewer branches. > > - There are some overlaps with above if data isn't aligned > > - to 4 * VEC_SIZE. */ > > - movl %edi, %ecx > > - andl $(VEC_SIZE * 4 - 1), %ecx > > - jz L(loop_4x_vec) > > - > > - addq $(VEC_SIZE * 4), %rdi > > - addq $(VEC_SIZE * 4), %rdx > > - andq $-(VEC_SIZE * 4), %rdi > > - subq %rcx, %rdx > > + /* Fits in aligning bytes of first cache line. */ > > +L(zero_0): > > + xorl %eax, %eax > > + ret > > > > - .p2align 4 > > -L(loop_4x_vec): > > - /* Compare 4 * VEC at a time forward. */ > > - subq $(VEC_SIZE * 4), %rdi > > - subq $(VEC_SIZE * 4), %rdx > > - jbe L(last_4x_vec_or_less) > > - > > - vmovdqa (%rdi), %ymm1 > > - vmovdqa VEC_SIZE(%rdi), %ymm2 > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > > - > > - vpcmpeqb %ymm1, %ymm0, %ymm1 > > - vpcmpeqb %ymm2, %ymm0, %ymm2 > > - vpcmpeqb %ymm3, %ymm0, %ymm3 > > - vpcmpeqb %ymm4, %ymm0, %ymm4 > > - > > - vpor %ymm1, %ymm2, %ymm5 > > - vpor %ymm3, %ymm4, %ymm6 > > - vpor %ymm5, %ymm6, %ymm5 > > - > > - vpmovmskb %ymm5, %eax > > - testl %eax, %eax > > - jz L(loop_4x_vec) > > - > > - /* There is a match. 
*/ > > - vpmovmskb %ymm4, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3) > > - > > - vpmovmskb %ymm3, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x2) > > - > > - vpmovmskb %ymm2, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x1) > > - > > - vpmovmskb %ymm1, %eax > > - bsrl %eax, %eax > > - addq %rdi, %rax > > + .p2align 4,, 9 > > +L(ret_vec_x0): > > + lzcntl %ecx, %ecx > > + subq %rcx, %rax > > L(return_vzeroupper): > > ZERO_UPPER_VEC_REGISTERS_RETURN > > > > - .p2align 4 > > -L(last_4x_vec_or_less): > > - addl $(VEC_SIZE * 4), %edx > > - cmpl $(VEC_SIZE * 2), %edx > > - jbe L(last_2x_vec) > > - > > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3) > > - > > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > > - vpmovmskb %ymm2, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x2) > > - > > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > > - vpmovmskb %ymm3, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x1_check) > > - cmpl $(VEC_SIZE * 3), %edx > > - jbe L(zero) > > - > > - vpcmpeqb (%rdi), %ymm0, %ymm4 > > - vpmovmskb %ymm4, %eax > > - testl %eax, %eax > > - jz L(zero) > > - bsrl %eax, %eax > > - subq $(VEC_SIZE * 4), %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > - > > - .p2align 4 > > + .p2align 4,, 10 > > +L(more_1x_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0) > > + > > + /* Align rax (string pointer). */ > > + andq $-VEC_SIZE, %rax > > + > > + /* Recompute remaining length after aligning. */ > > + movq %rax, %rdx > > + /* Need this comparison next no matter what. */ > > + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 > > + subq %rdi, %rdx > > + decq %rax > > + vpmovmskb %ymm1, %ecx > > + /* Fall through for short (hotter than length). */ > > + cmpq $(VEC_SIZE * 2), %rdx > > + ja L(more_2x_vec) > > L(last_2x_vec): > > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(last_vec_x3_check) > > cmpl $VEC_SIZE, %edx > > - jbe L(zero) > > - > > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jz L(zero) > > - bsrl %eax, %eax > > - subq $(VEC_SIZE * 2), %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addl $(VEC_SIZE * 2), %eax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > - > > - .p2align 4 > > -L(last_vec_x0): > > - bsrl %eax, %eax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > + jbe L(ret_vec_x0_test) > > + > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0) > > + > > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > + /* 64-bit lzcnt. This will naturally add 32 to position. */ > > + lzcntq %rcx, %rcx > > + COND_VZEROUPPER > > + cmpl %ecx, %edx > > + jle L(zero_0) > > + subq %rcx, %rax > > + ret > > > > - .p2align 4 > > -L(last_vec_x1): > > - bsrl %eax, %eax > > - addl $VEC_SIZE, %eax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > > > - .p2align 4 > > -L(last_vec_x2): > > - bsrl %eax, %eax > > - addl $(VEC_SIZE * 2), %eax > > - addq %rdi, %rax > > + /* Inexpensive place to put this regarding code size / target alignments > > + / ICache NLP. Necessary for 2-byte encoding of jump to page cross > > + case which in turn in necessary for hot path (len <= VEC_SIZE) to fit > is necessary? > > + in first cache line. 
*/ > > +L(page_cross): > > + movq %rax, %rsi > > + andq $-VEC_SIZE, %rsi > > + vpcmpeqb (%rsi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > + /* Shift out negative alignment (because we are starting from endptr and > > + working backwards). */ > > + movl %eax, %r8d > > + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ > > + notl %r8d > > + shlxl %r8d, %ecx, %ecx > > + cmpq %rdi, %rsi > > + ja L(more_1x_vec) > > + lzcntl %ecx, %ecx > > + COND_VZEROUPPER > > + cmpl %ecx, %edx > > + jle L(zero_0) > > + subq %rcx, %rax > > + ret > > + .p2align 4,, 11 > > +L(ret_vec_x1): > > + /* This will naturally add 32 to position. */ > > + lzcntq %rcx, %rcx > > + subq %rcx, %rax > > VZEROUPPER_RETURN > > + .p2align 4,, 10 > > +L(more_2x_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0) > > > > - .p2align 4 > > -L(last_vec_x3): > > - bsrl %eax, %eax > > - addl $(VEC_SIZE * 3), %eax > > - addq %rdi, %rax > > - ret > > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x1) > > > > - .p2align 4 > > -L(last_vec_x1_check): > > - bsrl %eax, %eax > > - subq $(VEC_SIZE * 3), %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addl $VEC_SIZE, %eax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > > > - .p2align 4 > > -L(last_vec_x3_check): > > - bsrl %eax, %eax > > - subq $VEC_SIZE, %rdx > > - addq %rax, %rdx > > - jl L(zero) > > - addl $(VEC_SIZE * 3), %eax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > + /* Needed no matter what. */ > > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > > > - .p2align 4 > > -L(zero): > > - xorl %eax, %eax > > - VZEROUPPER_RETURN > > + subq $(VEC_SIZE * 4), %rdx > > + ja L(more_4x_vec) > > + > > + cmpl $(VEC_SIZE * -1), %edx > > + jle L(ret_vec_x2_test) > > + > > +L(last_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x2) > > + > > + /* Needed no matter what. */ > > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > + lzcntl %ecx, %ecx > > + subq $(VEC_SIZE * 3), %rax > > + COND_VZEROUPPER > > + subq %rcx, %rax > > + cmpq %rax, %rdi > > + ja L(zero_2) > > + ret > > > > - .p2align 4 > > -L(null): > > + /* First in aligning bytes. */ > > +L(zero_2): > > xorl %eax, %eax > > ret > > > > - .p2align 4 > > -L(last_vec_or_less_aligned): > > - movl %edx, %ecx > > + .p2align 4,, 4 > > +L(ret_vec_x2_test): > > + lzcntl %ecx, %ecx > > + subq $(VEC_SIZE * 2), %rax > > + COND_VZEROUPPER > > + subq %rcx, %rax > > + cmpq %rax, %rdi > > + ja L(zero_2) > > + ret > > > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > > > - movl $1, %edx > > - /* Support rdx << 32. */ > > - salq %cl, %rdx > > - subq $1, %rdx > > + .p2align 4,, 11 > > +L(ret_vec_x2): > > + /* ecx must be non-zero. */ > > + bsrl %ecx, %ecx > > + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax > > + VZEROUPPER_RETURN > > > > - vpmovmskb %ymm1, %eax > > + .p2align 4,, 14 > > +L(ret_vec_x3): > > + /* ecx must be non-zero. */ > > + bsrl %ecx, %ecx > > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > > + VZEROUPPER_RETURN > > > > - /* Remove the trailing bytes. */ > > - andl %edx, %eax > > - testl %eax, %eax > > - jz L(zero) > > > > - bsrl %eax, %eax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > > > .p2align 4 > > -L(last_vec_or_less): > > - addl $VEC_SIZE, %edx > > +L(more_4x_vec): > > + testl %ecx, %ecx > > + jnz L(ret_vec_x2) > > > > - /* Check for zero length. 
*/ > > - testl %edx, %edx > > - jz L(null) > > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > > > - movl %edi, %ecx > > - andl $(VEC_SIZE - 1), %ecx > > - jz L(last_vec_or_less_aligned) > > + testl %ecx, %ecx > > + jnz L(ret_vec_x3) > > > > - movl %ecx, %esi > > - movl %ecx, %r8d > > - addl %edx, %esi > > - andq $-VEC_SIZE, %rdi > > + /* Check if near end before re-aligning (otherwise might do an > > + unnecissary loop iteration). */ > > + addq $-(VEC_SIZE * 4), %rax > > + cmpq $(VEC_SIZE * 4), %rdx > > + jbe L(last_4x_vec) > > > > - subl $VEC_SIZE, %esi > > - ja L(last_vec_2x_aligned) > > + /* Align rax to (VEC_SIZE - 1). */ > > + orq $(VEC_SIZE * 4 - 1), %rax > > + movq %rdi, %rdx > > + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because > > + lengths that overflow can be valid and break the comparison. */ > > + orq $(VEC_SIZE * 4 - 1), %rdx > > > > - /* Check the last VEC. */ > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - > > - /* Remove the leading and trailing bytes. */ > > - sarl %cl, %eax > > - movl %edx, %ecx > > + .p2align 4 > > +L(loop_4x_vec): > > + /* Need this comparison next no matter what. */ > > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 > > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 > > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 > > > > - movl $1, %edx > > - sall %cl, %edx > > - subl $1, %edx > > + vpor %ymm1, %ymm2, %ymm2 > > + vpor %ymm3, %ymm4, %ymm4 > > + vpor %ymm2, %ymm4, %ymm4 > > + vpmovmskb %ymm4, %esi > > > > - andl %edx, %eax > > - testl %eax, %eax > > - jz L(zero) > > + testl %esi, %esi > > + jnz L(loop_end) > > > > - bsrl %eax, %eax > > - addq %rdi, %rax > > - addq %r8, %rax > > - VZEROUPPER_RETURN > > + addq $(VEC_SIZE * -4), %rax > > + cmpq %rdx, %rax > > + jne L(loop_4x_vec) > > > > - .p2align 4 > > -L(last_vec_2x_aligned): > > - movl %esi, %ecx > > + subl %edi, %edx > > + incl %edx > > > > - /* Check the last VEC. */ > > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 > > +L(last_4x_vec): > > + /* Used no matter what. */ > > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > > > - movl $1, %edx > > - sall %cl, %edx > > - subl $1, %edx > > + cmpl $(VEC_SIZE * 2), %edx > > + jbe L(last_2x_vec) > > > > - vpmovmskb %ymm1, %eax > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0_end) > > > > - /* Remove the trailing bytes. */ > > - andl %edx, %eax > > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x1_end) > > > > - testl %eax, %eax > > - jnz L(last_vec_x1) > > + /* Used no matter what. */ > > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %ecx > > > > - /* Check the second last VEC. */ > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > + cmpl $(VEC_SIZE * 3), %edx > > + ja L(last_vec) > > + > > + lzcntl %ecx, %ecx > > + subq $(VEC_SIZE * 2), %rax > > + COND_VZEROUPPER > > + subq %rcx, %rax > > + cmpq %rax, %rdi > > + jbe L(ret0) > > + xorl %eax, %eax > > +L(ret0): > > + ret > > > > - movl %r8d, %ecx > > > > - vpmovmskb %ymm1, %eax > > + .p2align 4 > > +L(loop_end): > > + vpmovmskb %ymm1, %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x0_end) > > + > > + vpmovmskb %ymm2, %ecx > > + testl %ecx, %ecx > > + jnz L(ret_vec_x1_end) > > + > > + vpmovmskb %ymm3, %ecx > > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > > + then it won't affect the result in esi (VEC4). 
If ecx is non-zero > > + then CHAR in VEC3 and bsrq will use that position. */ > > + salq $32, %rcx > > + orq %rsi, %rcx > > + bsrq %rcx, %rcx > > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > > + VZEROUPPER_RETURN > > > > - /* Remove the leading bytes. Must use unsigned right shift for > > - bsrl below. */ > > - shrl %cl, %eax > > - testl %eax, %eax > > - jz L(zero) > > + .p2align 4,, 4 > > +L(ret_vec_x1_end): > > + /* 64-bit version will automatically add 32 (VEC_SIZE). */ > > + lzcntq %rcx, %rcx > > + subq %rcx, %rax > > + VZEROUPPER_RETURN > > > > - bsrl %eax, %eax > > - addq %rdi, %rax > > - addq %r8, %rax > > + .p2align 4,, 4 > > +L(ret_vec_x0_end): > > + lzcntl %ecx, %ecx > > + subq %rcx, %rax > > VZEROUPPER_RETURN > > -END (MEMRCHR) > > + > > + /* 2 bytes until next cache line. */ > > +END(MEMRCHR) > > #endif > > -- > > 2.34.1 > > > > OK with the updated comments. > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > Thanks. > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil ^ permalink raw reply [flat|nested] 82+ messages in thread
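The lzcnt trick this rewrite is built around can be stated compactly in C. A minimal sketch follows, assuming an in-bounds 32-byte window; the helper name and the scalar mask loop are illustrative stand-ins for the vpcmpeqb/vpmovmskb pair, not glibc code, and the real routine handles page crossing separately:

#include <stdint.h>
#include <stddef.h>

/* Bit i of the mask corresponds to end[i - 31], so bit 31 is *end
   itself -- the layout vpcmpeqb/vpmovmskb produce for the 32 bytes
   ending at `end'.  */
static uint32_t
match_mask_ending_at (const unsigned char *end, unsigned char c)
{
  uint32_t mask = 0;
  for (int i = 0; i < 32; i++)
    mask |= (uint32_t) (end[i - 31] == c) << i;
  return mask;
}

/* memrchr limited to the 32 bytes ending at `end': lzcnt of the mask
   is the distance of the last match below `end'.  lzcnt of 0 is 32,
   which the assembly exploits by comparing against the remaining
   length (cmpl %ecx, %edx; jle) instead of testing the mask for
   zero.  */
static const unsigned char *
last_match (const unsigned char *end, unsigned char c, uint32_t len)
{
  uint32_t mask = match_mask_ending_at (end, c);
  uint32_t dist = mask ? (uint32_t) __builtin_clz (mask) : 32; /* lzcntl */
  if (len <= dist)
    return NULL;
  return end - dist;                             /* subq %rcx, %rax */
}

This is also why the end pointer is set up as s + len - 1 at entry: subtracting the leading-zero count then lands directly on the match with no further adjustment.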
* Re: [PATCH v6 6/8] x86: Optimize memrchr-avx2.S 2022-07-14 2:26 ` Sunil Pandey @ 2022-07-14 2:43 ` Noah Goldstein 0 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-07-14 2:43 UTC (permalink / raw) To: Sunil Pandey; +Cc: H.J. Lu, GNU C Library On Wed, Jul 13, 2022 at 7:26 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Tue, Jun 7, 2022 at 11:18 AM H.J. Lu via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > The new code: > > > 1. prioritizes smaller user-arg lengths more. > > > 2. optimizes target placement more carefully > > > 3. reuses logic more > > > 4. fixes up various inefficiencies in the logic. The biggest > > > case here is the `lzcnt` logic for checking returns which > > > saves either a branch or multiple instructions. > > > > > > The total code size saving is: 306 bytes > > > Geometric Mean of all benchmarks New / Old: 0.760 > > > > > > Regressions: > > > There are some regressions. Particularly where the length (user arg > > > length) is large but the position of the match char is near the > > > beginning of the string (in first VEC). This case has roughly a > > > 10-20% regression. > > > > > > This is because the new logic gives the hot path for immediate matches > > > to shorter lengths (the more common input). This case has roughly > > > a 15-45% speedup. > > > > > > Full xcheck passes on x86_64. > > > --- > > > sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + > > > sysdeps/x86_64/multiarch/memrchr-avx2.S | 534 ++++++++++---------- > > > 2 files changed, 257 insertions(+), 278 deletions(-) > > > > > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > > > index cea2d2a72d..5e9beeeef2 100644 > > > --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > > > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > > > @@ -2,6 +2,7 @@ > > > # define MEMRCHR __memrchr_avx2_rtm > > > #endif > > > > > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > > > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > > > > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S > > > index ba2ce7cb03..bea4528068 100644 > > > --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S > > > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S > > > @@ -21,340 +21,318 @@ > > > # include <sysdep.h> > > > > > > # ifndef MEMRCHR > > > -# define MEMRCHR __memrchr_avx2 > > > +# define MEMRCHR __memrchr_avx2 > > > # endif > > > > > > # ifndef VZEROUPPER > > > -# define VZEROUPPER vzeroupper > > > +# define VZEROUPPER vzeroupper > > > # endif > > > > > > # ifndef SECTION > > > # define SECTION(p) p##.avx > > > # endif > > > > > > -# define VEC_SIZE 32 > > > +# define VEC_SIZE 32 > > > +# define PAGE_SIZE 4096 > > > + .section SECTION(.text), "ax", @progbits > > > +ENTRY(MEMRCHR) > > > +# ifdef __ILP32__ > > > + /* Clear upper bits. */ > > > + and %RDX_LP, %RDX_LP > > > +# else > > > + test %RDX_LP, %RDX_LP > > > +# endif > > > + jz L(zero_0) > > > > > > - .section SECTION(.text),"ax",@progbits > > > -ENTRY (MEMRCHR) > > > - /* Broadcast CHAR to YMM0. */ > > > vmovd %esi, %xmm0 > > > - vpbroadcastb %xmm0, %ymm0 > > > - > > > - sub $VEC_SIZE, %RDX_LP > > > - jbe L(last_vec_or_less) > > > - > > > - add %RDX_LP, %RDI_LP > > > - > > > - /* Check the last VEC_SIZE bytes. 
*/ > > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x0) > > > + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a > > > + correct page cross check and 2) it correctly sets up end ptr to be > > > + subtract by lzcnt aligned. */ > > > + leaq -1(%rdx, %rdi), %rax > > > > > > - subq $(VEC_SIZE * 4), %rdi > > > - movl %edi, %ecx > > > - andl $(VEC_SIZE - 1), %ecx > > > - jz L(aligned_more) > > > + vpbroadcastb %xmm0, %ymm0 > > > > > > - /* Align data for aligned loads in the loop. */ > > > - addq $VEC_SIZE, %rdi > > > - addq $VEC_SIZE, %rdx > > > - andq $-VEC_SIZE, %rdi > > > - subq %rcx, %rdx > > > + /* Check if we can load 1x VEC without cross a page. */ > > > + testl $(PAGE_SIZE - VEC_SIZE), %eax > > > + jz L(page_cross) > > > + > > > + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %ecx > > > + cmpq $VEC_SIZE, %rdx > > > + ja L(more_1x_vec) > > > + > > > +L(ret_vec_x0_test): > > > + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which > > > + will gurantee edx (len) is less than it. */ > > > + lzcntl %ecx, %ecx > > > + > > > + /* Hoist vzeroupper (not great for RTM) to save code size. This allows > > > + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ > > > + COND_VZEROUPPER > > > + cmpl %ecx, %edx > > > + jle L(zero_0) > > > + subq %rcx, %rax > > > + ret > > > > > > - .p2align 4 > > > -L(aligned_more): > > > - subq $(VEC_SIZE * 4), %rdx > > > - jbe L(last_4x_vec_or_less) > > > - > > > - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time > > > - since data is only aligned to VEC_SIZE. */ > > > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x3) > > > - > > > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > > > - vpmovmskb %ymm2, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x2) > > > - > > > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > > > - vpmovmskb %ymm3, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x1) > > > - > > > - vpcmpeqb (%rdi), %ymm0, %ymm4 > > > - vpmovmskb %ymm4, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x0) > > > - > > > - /* Align data to 4 * VEC_SIZE for loop with fewer branches. > > > - There are some overlaps with above if data isn't aligned > > > - to 4 * VEC_SIZE. */ > > > - movl %edi, %ecx > > > - andl $(VEC_SIZE * 4 - 1), %ecx > > > - jz L(loop_4x_vec) > > > - > > > - addq $(VEC_SIZE * 4), %rdi > > > - addq $(VEC_SIZE * 4), %rdx > > > - andq $-(VEC_SIZE * 4), %rdi > > > - subq %rcx, %rdx > > > + /* Fits in aligning bytes of first cache line. */ > > > +L(zero_0): > > > + xorl %eax, %eax > > > + ret > > > > > > - .p2align 4 > > > -L(loop_4x_vec): > > > - /* Compare 4 * VEC at a time forward. */ > > > - subq $(VEC_SIZE * 4), %rdi > > > - subq $(VEC_SIZE * 4), %rdx > > > - jbe L(last_4x_vec_or_less) > > > - > > > - vmovdqa (%rdi), %ymm1 > > > - vmovdqa VEC_SIZE(%rdi), %ymm2 > > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > > > - > > > - vpcmpeqb %ymm1, %ymm0, %ymm1 > > > - vpcmpeqb %ymm2, %ymm0, %ymm2 > > > - vpcmpeqb %ymm3, %ymm0, %ymm3 > > > - vpcmpeqb %ymm4, %ymm0, %ymm4 > > > - > > > - vpor %ymm1, %ymm2, %ymm5 > > > - vpor %ymm3, %ymm4, %ymm6 > > > - vpor %ymm5, %ymm6, %ymm5 > > > - > > > - vpmovmskb %ymm5, %eax > > > - testl %eax, %eax > > > - jz L(loop_4x_vec) > > > - > > > - /* There is a match. 
*/ > > > - vpmovmskb %ymm4, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x3) > > > - > > > - vpmovmskb %ymm3, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x2) > > > - > > > - vpmovmskb %ymm2, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x1) > > > - > > > - vpmovmskb %ymm1, %eax > > > - bsrl %eax, %eax > > > - addq %rdi, %rax > > > + .p2align 4,, 9 > > > +L(ret_vec_x0): > > > + lzcntl %ecx, %ecx > > > + subq %rcx, %rax > > > L(return_vzeroupper): > > > ZERO_UPPER_VEC_REGISTERS_RETURN > > > > > > - .p2align 4 > > > -L(last_4x_vec_or_less): > > > - addl $(VEC_SIZE * 4), %edx > > > - cmpl $(VEC_SIZE * 2), %edx > > > - jbe L(last_2x_vec) > > > - > > > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x3) > > > - > > > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > > > - vpmovmskb %ymm2, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x2) > > > - > > > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > > > - vpmovmskb %ymm3, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x1_check) > > > - cmpl $(VEC_SIZE * 3), %edx > > > - jbe L(zero) > > > - > > > - vpcmpeqb (%rdi), %ymm0, %ymm4 > > > - vpmovmskb %ymm4, %eax > > > - testl %eax, %eax > > > - jz L(zero) > > > - bsrl %eax, %eax > > > - subq $(VEC_SIZE * 4), %rdx > > > - addq %rax, %rdx > > > - jl L(zero) > > > - addq %rdi, %rax > > > - VZEROUPPER_RETURN > > > - > > > - .p2align 4 > > > + .p2align 4,, 10 > > > +L(more_1x_vec): > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x0) > > > + > > > + /* Align rax (string pointer). */ > > > + andq $-VEC_SIZE, %rax > > > + > > > + /* Recompute remaining length after aligning. */ > > > + movq %rax, %rdx > > > + /* Need this comparison next no matter what. */ > > > + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 > > > + subq %rdi, %rdx > > > + decq %rax > > > + vpmovmskb %ymm1, %ecx > > > + /* Fall through for short (hotter than length). */ > > > + cmpq $(VEC_SIZE * 2), %rdx > > > + ja L(more_2x_vec) > > > L(last_2x_vec): > > > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(last_vec_x3_check) > > > cmpl $VEC_SIZE, %edx > > > - jbe L(zero) > > > - > > > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jz L(zero) > > > - bsrl %eax, %eax > > > - subq $(VEC_SIZE * 2), %rdx > > > - addq %rax, %rdx > > > - jl L(zero) > > > - addl $(VEC_SIZE * 2), %eax > > > - addq %rdi, %rax > > > - VZEROUPPER_RETURN > > > - > > > - .p2align 4 > > > -L(last_vec_x0): > > > - bsrl %eax, %eax > > > - addq %rdi, %rax > > > - VZEROUPPER_RETURN > > > + jbe L(ret_vec_x0_test) > > > + > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x0) > > > + > > > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %ecx > > > + /* 64-bit lzcnt. This will naturally add 32 to position. */ > > > + lzcntq %rcx, %rcx > > > + COND_VZEROUPPER > > > + cmpl %ecx, %edx > > > + jle L(zero_0) > > > + subq %rcx, %rax > > > + ret > > > > > > - .p2align 4 > > > -L(last_vec_x1): > > > - bsrl %eax, %eax > > > - addl $VEC_SIZE, %eax > > > - addq %rdi, %rax > > > - VZEROUPPER_RETURN > > > > > > - .p2align 4 > > > -L(last_vec_x2): > > > - bsrl %eax, %eax > > > - addl $(VEC_SIZE * 2), %eax > > > - addq %rdi, %rax > > > + /* Inexpensive place to put this regarding code size / target alignments > > > + / ICache NLP. 
Necessary for 2-byte encoding of jump to page cross > > > + case which in turn in necessary for hot path (len <= VEC_SIZE) to fit > > is necessary? > > > + in first cache line. */ > > > +L(page_cross): > > > + movq %rax, %rsi > > > + andq $-VEC_SIZE, %rsi > > > + vpcmpeqb (%rsi), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %ecx > > > + /* Shift out negative alignment (because we are starting from endptr and > > > + working backwards). */ > > > + movl %eax, %r8d > > > + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ > > > + notl %r8d > > > + shlxl %r8d, %ecx, %ecx > > > + cmpq %rdi, %rsi > > > + ja L(more_1x_vec) > > > + lzcntl %ecx, %ecx > > > + COND_VZEROUPPER > > > + cmpl %ecx, %edx > > > + jle L(zero_0) > > > + subq %rcx, %rax > > > + ret > > > + .p2align 4,, 11 > > > +L(ret_vec_x1): > > > + /* This will naturally add 32 to position. */ > > > + lzcntq %rcx, %rcx > > > + subq %rcx, %rax > > > VZEROUPPER_RETURN > > > + .p2align 4,, 10 > > > +L(more_2x_vec): > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x0) > > > > > > - .p2align 4 > > > -L(last_vec_x3): > > > - bsrl %eax, %eax > > > - addl $(VEC_SIZE * 3), %eax > > > - addq %rdi, %rax > > > - ret > > > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %ecx > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x1) > > > > > > - .p2align 4 > > > -L(last_vec_x1_check): > > > - bsrl %eax, %eax > > > - subq $(VEC_SIZE * 3), %rdx > > > - addq %rax, %rdx > > > - jl L(zero) > > > - addl $VEC_SIZE, %eax > > > - addq %rdi, %rax > > > - VZEROUPPER_RETURN > > > > > > - .p2align 4 > > > -L(last_vec_x3_check): > > > - bsrl %eax, %eax > > > - subq $VEC_SIZE, %rdx > > > - addq %rax, %rdx > > > - jl L(zero) > > > - addl $(VEC_SIZE * 3), %eax > > > - addq %rdi, %rax > > > - VZEROUPPER_RETURN > > > + /* Needed no matter what. */ > > > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %ecx > > > > > > - .p2align 4 > > > -L(zero): > > > - xorl %eax, %eax > > > - VZEROUPPER_RETURN > > > + subq $(VEC_SIZE * 4), %rdx > > > + ja L(more_4x_vec) > > > + > > > + cmpl $(VEC_SIZE * -1), %edx > > > + jle L(ret_vec_x2_test) > > > + > > > +L(last_vec): > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x2) > > > + > > > + /* Needed no matter what. */ > > > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %ecx > > > + lzcntl %ecx, %ecx > > > + subq $(VEC_SIZE * 3), %rax > > > + COND_VZEROUPPER > > > + subq %rcx, %rax > > > + cmpq %rax, %rdi > > > + ja L(zero_2) > > > + ret > > > > > > - .p2align 4 > > > -L(null): > > > + /* First in aligning bytes. */ > > > +L(zero_2): > > > xorl %eax, %eax > > > ret > > > > > > - .p2align 4 > > > -L(last_vec_or_less_aligned): > > > - movl %edx, %ecx > > > + .p2align 4,, 4 > > > +L(ret_vec_x2_test): > > > + lzcntl %ecx, %ecx > > > + subq $(VEC_SIZE * 2), %rax > > > + COND_VZEROUPPER > > > + subq %rcx, %rax > > > + cmpq %rax, %rdi > > > + ja L(zero_2) > > > + ret > > > > > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > > > > > - movl $1, %edx > > > - /* Support rdx << 32. */ > > > - salq %cl, %rdx > > > - subq $1, %rdx > > > + .p2align 4,, 11 > > > +L(ret_vec_x2): > > > + /* ecx must be non-zero. */ > > > + bsrl %ecx, %ecx > > > + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax > > > + VZEROUPPER_RETURN > > > > > > - vpmovmskb %ymm1, %eax > > > + .p2align 4,, 14 > > > +L(ret_vec_x3): > > > + /* ecx must be non-zero. */ > > > + bsrl %ecx, %ecx > > > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > > > + VZEROUPPER_RETURN > > > > > > - /* Remove the trailing bytes. 
*/ > > > - andl %edx, %eax > > > - testl %eax, %eax > > > - jz L(zero) > > > > > > - bsrl %eax, %eax > > > - addq %rdi, %rax > > > - VZEROUPPER_RETURN > > > > > > .p2align 4 > > > -L(last_vec_or_less): > > > - addl $VEC_SIZE, %edx > > > +L(more_4x_vec): > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x2) > > > > > > - /* Check for zero length. */ > > > - testl %edx, %edx > > > - jz L(null) > > > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %ecx > > > > > > - movl %edi, %ecx > > > - andl $(VEC_SIZE - 1), %ecx > > > - jz L(last_vec_or_less_aligned) > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x3) > > > > > > - movl %ecx, %esi > > > - movl %ecx, %r8d > > > - addl %edx, %esi > > > - andq $-VEC_SIZE, %rdi > > > + /* Check if near end before re-aligning (otherwise might do an > > > + unnecissary loop iteration). */ > > > + addq $-(VEC_SIZE * 4), %rax > > > + cmpq $(VEC_SIZE * 4), %rdx > > > + jbe L(last_4x_vec) > > > > > > - subl $VEC_SIZE, %esi > > > - ja L(last_vec_2x_aligned) > > > + /* Align rax to (VEC_SIZE - 1). */ > > > + orq $(VEC_SIZE * 4 - 1), %rax > > > + movq %rdi, %rdx > > > + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because > > > + lengths that overflow can be valid and break the comparison. */ > > > + orq $(VEC_SIZE * 4 - 1), %rdx > > > > > > - /* Check the last VEC. */ > > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - > > > - /* Remove the leading and trailing bytes. */ > > > - sarl %cl, %eax > > > - movl %edx, %ecx > > > + .p2align 4 > > > +L(loop_4x_vec): > > > + /* Need this comparison next no matter what. */ > > > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > > > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 > > > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 > > > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 > > > > > > - movl $1, %edx > > > - sall %cl, %edx > > > - subl $1, %edx > > > + vpor %ymm1, %ymm2, %ymm2 > > > + vpor %ymm3, %ymm4, %ymm4 > > > + vpor %ymm2, %ymm4, %ymm4 > > > + vpmovmskb %ymm4, %esi > > > > > > - andl %edx, %eax > > > - testl %eax, %eax > > > - jz L(zero) > > > + testl %esi, %esi > > > + jnz L(loop_end) > > > > > > - bsrl %eax, %eax > > > - addq %rdi, %rax > > > - addq %r8, %rax > > > - VZEROUPPER_RETURN > > > + addq $(VEC_SIZE * -4), %rax > > > + cmpq %rdx, %rax > > > + jne L(loop_4x_vec) > > > > > > - .p2align 4 > > > -L(last_vec_2x_aligned): > > > - movl %esi, %ecx > > > + subl %edi, %edx > > > + incl %edx > > > > > > - /* Check the last VEC. */ > > > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 > > > +L(last_4x_vec): > > > + /* Used no matter what. */ > > > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %ecx > > > > > > - movl $1, %edx > > > - sall %cl, %edx > > > - subl $1, %edx > > > + cmpl $(VEC_SIZE * 2), %edx > > > + jbe L(last_2x_vec) > > > > > > - vpmovmskb %ymm1, %eax > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x0_end) > > > > > > - /* Remove the trailing bytes. */ > > > - andl %edx, %eax > > > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %ecx > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x1_end) > > > > > > - testl %eax, %eax > > > - jnz L(last_vec_x1) > > > + /* Used no matter what. */ > > > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %ecx > > > > > > - /* Check the second last VEC. 
*/ > > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > > + cmpl $(VEC_SIZE * 3), %edx > > > + ja L(last_vec) > > > + > > > + lzcntl %ecx, %ecx > > > + subq $(VEC_SIZE * 2), %rax > > > + COND_VZEROUPPER > > > + subq %rcx, %rax > > > + cmpq %rax, %rdi > > > + jbe L(ret0) > > > + xorl %eax, %eax > > > +L(ret0): > > > + ret > > > > > > - movl %r8d, %ecx > > > > > > - vpmovmskb %ymm1, %eax > > > + .p2align 4 > > > +L(loop_end): > > > + vpmovmskb %ymm1, %ecx > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x0_end) > > > + > > > + vpmovmskb %ymm2, %ecx > > > + testl %ecx, %ecx > > > + jnz L(ret_vec_x1_end) > > > + > > > + vpmovmskb %ymm3, %ecx > > > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > > > + then it won't affect the result in esi (VEC4). If ecx is non-zero > > > + then CHAR in VEC3 and bsrq will use that position. */ > > > + salq $32, %rcx > > > + orq %rsi, %rcx > > > + bsrq %rcx, %rcx > > > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > > > + VZEROUPPER_RETURN > > > > > > - /* Remove the leading bytes. Must use unsigned right shift for > > > - bsrl below. */ > > > - shrl %cl, %eax > > > - testl %eax, %eax > > > - jz L(zero) > > > + .p2align 4,, 4 > > > +L(ret_vec_x1_end): > > > + /* 64-bit version will automatically add 32 (VEC_SIZE). */ > > > + lzcntq %rcx, %rcx > > > + subq %rcx, %rax > > > + VZEROUPPER_RETURN > > > > > > - bsrl %eax, %eax > > > - addq %rdi, %rax > > > - addq %r8, %rax > > > + .p2align 4,, 4 > > > +L(ret_vec_x0_end): > > > + lzcntl %ecx, %ecx > > > + subq %rcx, %rax > > > VZEROUPPER_RETURN > > > -END (MEMRCHR) > > > + > > > + /* 2 bytes until next cache line. */ > > > +END(MEMRCHR) > > > #endif > > > -- > > > 2.34.1 > > > > > > > OK with the updated comments. > > > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > > > Thanks. > > > > -- > > H.J. > > I would like to backport this patch to release branches. > Any comments or objections? Probably should also do: https://sourceware.org/git/?p=glibc.git;a=commit;h=227afaa67213efcdce6a870ef5086200f1076438 > > --Sunil ^ permalink raw reply [flat|nested] 82+ messages in thread
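The L(loop_end) merge quoted in the exchange above is the one place the patch widens to 64 bits, and it reads more easily in C. Sketch only; mask4 and mask3 stand for the vpmovmskb results for the lower and upper 32-byte chunks of a 64-byte window starting at `base':

#include <stdint.h>
#include <stddef.h>

/* mask4 covers base[0..31] and mask3 covers base[32..63].  Shifting
   mask3 into the high half lets a single bsr pick the last match over
   all 64 bytes; if mask3 is zero, the or leaves the mask4 result
   untouched, exactly as the comment in the patch says.  */
static const unsigned char *
last_match_2x32 (const unsigned char *base, uint32_t mask4, uint32_t mask3)
{
  uint64_t combined = ((uint64_t) mask3 << 32) | mask4; /* salq $32; orq */
  if (combined == 0)
    return NULL;
  return base + (63 - __builtin_clzll (combined));      /* bsrq */
}

In the assembly, base is rax - (VEC_SIZE * 4 - 1), which is why the final lea uses the displacement (VEC_SIZE * -4 + 1).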
* [PATCH v6 7/8] x86: Shrink code size of memchr-avx2.S 2022-06-07 4:11 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (4 preceding siblings ...) 2022-06-07 4:11 ` [PATCH v6 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein @ 2022-06-07 4:11 ` Noah Goldstein 2022-06-07 18:18 ` H.J. Lu 2022-06-07 4:11 ` [PATCH v6 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 2022-06-07 18:04 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library H.J. Lu 7 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:11 UTC (permalink / raw) To: libc-alpha This is not meant as a performance optimization. The previous code was far to liberal in aligning targets and wasted code size unnecissarily. The total code size saving is: 59 bytes There are no major changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 0.967 Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memchr-avx2.S | 109 +++++++++++---------- 2 files changed, 60 insertions(+), 50 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S index 87b076c7c4..c4d71938c5 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMCHR __memchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index 75bd7262e0..28a01280ec 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -57,7 +57,7 @@ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 5) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ # ifdef __ILP32__ @@ -87,12 +87,14 @@ ENTRY (MEMCHR) # endif testl %eax, %eax jz L(aligned_more) - tzcntl %eax, %eax + bsfl %eax, %eax addq %rdi, %rax - VZEROUPPER_RETURN +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + # ifndef USE_AS_RAWMEMCHR - .p2align 5 + .p2align 4 L(first_vec_x0): /* Check if first match was before length. */ tzcntl %eax, %eax @@ -100,58 +102,31 @@ L(first_vec_x0): /* NB: Multiply length by 4 to get byte count. */ sall $2, %edx # endif - xorl %ecx, %ecx + COND_VZEROUPPER + /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch + block. branch here as opposed to cmovcc is not that costly. Common + usage of memchr is to check if the return was NULL (if string was + known to contain CHAR user would use rawmemchr). This branch will be + highly correlated with the user branch and can be used by most + modern branch predictors to predict the user branch. */ cmpl %eax, %edx - leaq (%rdi, %rax), %rax - cmovle %rcx, %rax - VZEROUPPER_RETURN - -L(null): - xorl %eax, %eax - ret -# endif - .p2align 4 -L(cross_page_boundary): - /* Save pointer before aligning as its original value is - necessary for computer return address if byte is found or - adjusting length if it is not and this is memchr. */ - movq %rdi, %rcx - /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr - and rdi for rawmemchr. */ - orq $(VEC_SIZE - 1), %ALGN_PTR_REG - VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax -# ifndef USE_AS_RAWMEMCHR - /* Calculate length until end of page (length checked for a - match). 
*/ - leaq 1(%ALGN_PTR_REG), %rsi - subq %RRAW_PTR_REG, %rsi -# ifdef USE_AS_WMEMCHR - /* NB: Divide bytes by 4 to get wchar_t count. */ - shrl $2, %esi -# endif -# endif - /* Remove the leading bytes. */ - sarxl %ERAW_PTR_REG, %eax, %eax -# ifndef USE_AS_RAWMEMCHR - /* Check the end of data. */ - cmpq %rsi, %rdx - jbe L(first_vec_x0) + jle L(null) + addq %rdi, %rax + ret # endif - testl %eax, %eax - jz L(cross_page_continue) - tzcntl %eax, %eax - addq %RRAW_PTR_REG, %rax -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 + .p2align 4,, 10 L(first_vec_x1): - tzcntl %eax, %eax + bsfl %eax, %eax incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - +# ifndef USE_AS_RAWMEMCHR + /* First in aligning bytes here. */ +L(null): + xorl %eax, %eax + ret +# endif .p2align 4 L(first_vec_x2): tzcntl %eax, %eax @@ -340,7 +315,7 @@ L(first_vec_x1_check): incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - .p2align 4 + .p2align 4,, 6 L(set_zero_end): xorl %eax, %eax VZEROUPPER_RETURN @@ -428,5 +403,39 @@ L(last_vec_x3): VZEROUPPER_RETURN # endif + .p2align 4 +L(cross_page_boundary): + /* Save pointer before aligning as its original value is necessary for + computer return address if byte is found or adjusting length if it + is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for + rawmemchr. */ + andq $-VEC_SIZE, %ALGN_PTR_REG + VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Calculate length until end of page (length checked for a match). */ + leal VEC_SIZE(%ALGN_PTR_REG), %esi + subl %ERAW_PTR_REG, %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %esi +# endif +# endif + /* Remove the leading bytes. */ + sarxl %ERAW_PTR_REG, %eax, %eax +# ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ + cmpq %rsi, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(cross_page_continue) + bsfl %eax, %eax + addq %RRAW_PTR_REG, %rax + VZEROUPPER_RETURN + + END (MEMCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
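The rewritten L(cross_page_boundary) is the main structural move in this patch and is easy to mis-read in diff form. A C sketch of the idea, with mask32_at standing in (as an assumption of this sketch) for the VPCMPEQ/vpmovmskb pair; a logical shift is used where the assembly uses sarxl, which behaves the same for every bit the caller consumes:

#include <stdint.h>
#include <stddef.h>

static const unsigned char *
first_vec_page_safe (const unsigned char *p, unsigned char c,
                     uint32_t (*mask32_at) (const unsigned char *,
                                            unsigned char))
{
  /* One aligned 32-byte load can never fault past the page, so load
     from the aligned base (andq $-VEC_SIZE)...  */
  const unsigned char *base
    = (const unsigned char *) ((uintptr_t) p & ~(uintptr_t) 31);
  uint32_t mask = mask32_at (base, c);
  /* ... then discard the bits for bytes below p (sarxl).  */
  mask >>= (uintptr_t) p & 31;
  if (mask == 0)
    return NULL;        /* the real code jumps to L(cross_page_continue) */
  return p + __builtin_ctz (mask);                 /* bsfl */
}

The length handling (leal VEC_SIZE(...); subl) is omitted here; it computes how many of the surviving bytes actually lie within the caller's buffer before the mask is trusted.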
* Re: [PATCH v6 7/8] x86: Shrink code size of memchr-avx2.S 2022-06-07 4:11 ` [PATCH v6 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein @ 2022-06-07 18:18 ` H.J. Lu 2022-07-14 2:31 ` Sunil Pandey 0 siblings, 1 reply; 82+ messages in thread From: H.J. Lu @ 2022-06-07 18:18 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > This is not meant as a performance optimization. The previous code was > far to liberal in aligning targets and wasted code size unnecissarily. > > The total code size saving is: 59 bytes > > There are no major changes in the benchmarks. > Geometric Mean of all benchmarks New / Old: 0.967 > > Full xcheck passes on x86_64. > --- > sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 1 + > sysdeps/x86_64/multiarch/memchr-avx2.S | 109 +++++++++++---------- > 2 files changed, 60 insertions(+), 50 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S > index 87b076c7c4..c4d71938c5 100644 > --- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S > @@ -2,6 +2,7 @@ > # define MEMCHR __memchr_avx2_rtm > #endif > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S > index 75bd7262e0..28a01280ec 100644 > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S > @@ -57,7 +57,7 @@ > # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > .section SECTION(.text),"ax",@progbits > -ENTRY (MEMCHR) > +ENTRY_P2ALIGN (MEMCHR, 5) > # ifndef USE_AS_RAWMEMCHR > /* Check for zero length. */ > # ifdef __ILP32__ > @@ -87,12 +87,14 @@ ENTRY (MEMCHR) > # endif > testl %eax, %eax > jz L(aligned_more) > - tzcntl %eax, %eax > + bsfl %eax, %eax > addq %rdi, %rax > - VZEROUPPER_RETURN > +L(return_vzeroupper): > + ZERO_UPPER_VEC_REGISTERS_RETURN > + > > # ifndef USE_AS_RAWMEMCHR > - .p2align 5 > + .p2align 4 > L(first_vec_x0): > /* Check if first match was before length. */ > tzcntl %eax, %eax > @@ -100,58 +102,31 @@ L(first_vec_x0): > /* NB: Multiply length by 4 to get byte count. */ > sall $2, %edx > # endif > - xorl %ecx, %ecx > + COND_VZEROUPPER > + /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch > + block. branch here as opposed to cmovcc is not that costly. Common > + usage of memchr is to check if the return was NULL (if string was > + known to contain CHAR user would use rawmemchr). This branch will be > + highly correlated with the user branch and can be used by most > + modern branch predictors to predict the user branch. */ > cmpl %eax, %edx > - leaq (%rdi, %rax), %rax > - cmovle %rcx, %rax > - VZEROUPPER_RETURN > - > -L(null): > - xorl %eax, %eax > - ret > -# endif > - .p2align 4 > -L(cross_page_boundary): > - /* Save pointer before aligning as its original value is > - necessary for computer return address if byte is found or > - adjusting length if it is not and this is memchr. */ > - movq %rdi, %rcx > - /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr > - and rdi for rawmemchr. */ > - orq $(VEC_SIZE - 1), %ALGN_PTR_REG > - VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > -# ifndef USE_AS_RAWMEMCHR > - /* Calculate length until end of page (length checked for a > - match). 
*/ > - leaq 1(%ALGN_PTR_REG), %rsi > - subq %RRAW_PTR_REG, %rsi > -# ifdef USE_AS_WMEMCHR > - /* NB: Divide bytes by 4 to get wchar_t count. */ > - shrl $2, %esi > -# endif > -# endif > - /* Remove the leading bytes. */ > - sarxl %ERAW_PTR_REG, %eax, %eax > -# ifndef USE_AS_RAWMEMCHR > - /* Check the end of data. */ > - cmpq %rsi, %rdx > - jbe L(first_vec_x0) > + jle L(null) > + addq %rdi, %rax > + ret > # endif > - testl %eax, %eax > - jz L(cross_page_continue) > - tzcntl %eax, %eax > - addq %RRAW_PTR_REG, %rax > -L(return_vzeroupper): > - ZERO_UPPER_VEC_REGISTERS_RETURN > > - .p2align 4 > + .p2align 4,, 10 > L(first_vec_x1): > - tzcntl %eax, %eax > + bsfl %eax, %eax > incq %rdi > addq %rdi, %rax > VZEROUPPER_RETURN > - > +# ifndef USE_AS_RAWMEMCHR > + /* First in aligning bytes here. */ > +L(null): > + xorl %eax, %eax > + ret > +# endif > .p2align 4 > L(first_vec_x2): > tzcntl %eax, %eax > @@ -340,7 +315,7 @@ L(first_vec_x1_check): > incq %rdi > addq %rdi, %rax > VZEROUPPER_RETURN > - .p2align 4 > + .p2align 4,, 6 > L(set_zero_end): > xorl %eax, %eax > VZEROUPPER_RETURN > @@ -428,5 +403,39 @@ L(last_vec_x3): > VZEROUPPER_RETURN > # endif > > + .p2align 4 > +L(cross_page_boundary): > + /* Save pointer before aligning as its original value is necessary for > + computer return address if byte is found or adjusting length if it > + is not and this is memchr. */ > + movq %rdi, %rcx > + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for > + rawmemchr. */ > + andq $-VEC_SIZE, %ALGN_PTR_REG > + VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > +# ifndef USE_AS_RAWMEMCHR > + /* Calculate length until end of page (length checked for a match). */ > + leal VEC_SIZE(%ALGN_PTR_REG), %esi > + subl %ERAW_PTR_REG, %esi > +# ifdef USE_AS_WMEMCHR > + /* NB: Divide bytes by 4 to get wchar_t count. */ > + shrl $2, %esi > +# endif > +# endif > + /* Remove the leading bytes. */ > + sarxl %ERAW_PTR_REG, %eax, %eax > +# ifndef USE_AS_RAWMEMCHR > + /* Check the end of data. */ > + cmpq %rsi, %rdx > + jbe L(first_vec_x0) > +# endif > + testl %eax, %eax > + jz L(cross_page_continue) > + bsfl %eax, %eax > + addq %RRAW_PTR_REG, %rax > + VZEROUPPER_RETURN > + > + > END (MEMCHR) > #endif > -- > 2.34.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
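One remark on the hunk H.J. approves above: the cmovcc-to-branch change in L(first_vec_x0) does not alter the result, only how it is selected. In C terms both the old and new code implement the function below; the comments show the two instruction shapes. Sketch only -- a compiler is free to pick either form:

#include <stdint.h>
#include <stddef.h>

static const unsigned char *
first_match_in_bounds (const unsigned char *p, uint32_t idx, uint32_t len)
{
  /* Old shape: xorl %ecx,%ecx; cmpl %eax,%edx; leaq (%rdi,%rax),%rax;
     cmovle %rcx,%rax -- branch-free, but the return value always
     carries a data dependency on the compare.
     New shape: cmpl %eax,%edx; jle L(null); addq %rdi,%rax -- a branch
     that tends to be correlated with the caller's own NULL test, as
     the comment in the patch argues.  */
  if ((int) idx >= (int) len)
    return NULL;
  return p + idx;
}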
* Re: [PATCH v6 7/8] x86: Shrink code size of memchr-avx2.S 2022-06-07 18:18 ` H.J. Lu @ 2022-07-14 2:31 ` Sunil Pandey 2022-07-14 2:41 ` Noah Goldstein 0 siblings, 1 reply; 82+ messages in thread From: Sunil Pandey @ 2022-07-14 2:31 UTC (permalink / raw) To: H.J. Lu; +Cc: Noah Goldstein, GNU C Library On Tue, Jun 7, 2022 at 11:19 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > This is not meant as a performance optimization. The previous code was > > far to liberal in aligning targets and wasted code size unnecissarily. > > > > The total code size saving is: 59 bytes > > > > There are no major changes in the benchmarks. > > Geometric Mean of all benchmarks New / Old: 0.967 > > > > Full xcheck passes on x86_64. > > --- > > sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 1 + > > sysdeps/x86_64/multiarch/memchr-avx2.S | 109 +++++++++++---------- > > 2 files changed, 60 insertions(+), 50 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S > > index 87b076c7c4..c4d71938c5 100644 > > --- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S > > +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S > > @@ -2,6 +2,7 @@ > > # define MEMCHR __memchr_avx2_rtm > > #endif > > > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S > > index 75bd7262e0..28a01280ec 100644 > > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S > > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S > > @@ -57,7 +57,7 @@ > > # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > .section SECTION(.text),"ax",@progbits > > -ENTRY (MEMCHR) > > +ENTRY_P2ALIGN (MEMCHR, 5) > > # ifndef USE_AS_RAWMEMCHR > > /* Check for zero length. */ > > # ifdef __ILP32__ > > @@ -87,12 +87,14 @@ ENTRY (MEMCHR) > > # endif > > testl %eax, %eax > > jz L(aligned_more) > > - tzcntl %eax, %eax > > + bsfl %eax, %eax > > addq %rdi, %rax > > - VZEROUPPER_RETURN > > +L(return_vzeroupper): > > + ZERO_UPPER_VEC_REGISTERS_RETURN > > + > > > > # ifndef USE_AS_RAWMEMCHR > > - .p2align 5 > > + .p2align 4 > > L(first_vec_x0): > > /* Check if first match was before length. */ > > tzcntl %eax, %eax > > @@ -100,58 +102,31 @@ L(first_vec_x0): > > /* NB: Multiply length by 4 to get byte count. */ > > sall $2, %edx > > # endif > > - xorl %ecx, %ecx > > + COND_VZEROUPPER > > + /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch > > + block. branch here as opposed to cmovcc is not that costly. Common > > + usage of memchr is to check if the return was NULL (if string was > > + known to contain CHAR user would use rawmemchr). This branch will be > > + highly correlated with the user branch and can be used by most > > + modern branch predictors to predict the user branch. */ > > cmpl %eax, %edx > > - leaq (%rdi, %rax), %rax > > - cmovle %rcx, %rax > > - VZEROUPPER_RETURN > > - > > -L(null): > > - xorl %eax, %eax > > - ret > > -# endif > > - .p2align 4 > > -L(cross_page_boundary): > > - /* Save pointer before aligning as its original value is > > - necessary for computer return address if byte is found or > > - adjusting length if it is not and this is memchr. */ > > - movq %rdi, %rcx > > - /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr > > - and rdi for rawmemchr. 
*/ > > - orq $(VEC_SIZE - 1), %ALGN_PTR_REG > > - VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > -# ifndef USE_AS_RAWMEMCHR > > - /* Calculate length until end of page (length checked for a > > - match). */ > > - leaq 1(%ALGN_PTR_REG), %rsi > > - subq %RRAW_PTR_REG, %rsi > > -# ifdef USE_AS_WMEMCHR > > - /* NB: Divide bytes by 4 to get wchar_t count. */ > > - shrl $2, %esi > > -# endif > > -# endif > > - /* Remove the leading bytes. */ > > - sarxl %ERAW_PTR_REG, %eax, %eax > > -# ifndef USE_AS_RAWMEMCHR > > - /* Check the end of data. */ > > - cmpq %rsi, %rdx > > - jbe L(first_vec_x0) > > + jle L(null) > > + addq %rdi, %rax > > + ret > > # endif > > - testl %eax, %eax > > - jz L(cross_page_continue) > > - tzcntl %eax, %eax > > - addq %RRAW_PTR_REG, %rax > > -L(return_vzeroupper): > > - ZERO_UPPER_VEC_REGISTERS_RETURN > > > > - .p2align 4 > > + .p2align 4,, 10 > > L(first_vec_x1): > > - tzcntl %eax, %eax > > + bsfl %eax, %eax > > incq %rdi > > addq %rdi, %rax > > VZEROUPPER_RETURN > > - > > +# ifndef USE_AS_RAWMEMCHR > > + /* First in aligning bytes here. */ > > +L(null): > > + xorl %eax, %eax > > + ret > > +# endif > > .p2align 4 > > L(first_vec_x2): > > tzcntl %eax, %eax > > @@ -340,7 +315,7 @@ L(first_vec_x1_check): > > incq %rdi > > addq %rdi, %rax > > VZEROUPPER_RETURN > > - .p2align 4 > > + .p2align 4,, 6 > > L(set_zero_end): > > xorl %eax, %eax > > VZEROUPPER_RETURN > > @@ -428,5 +403,39 @@ L(last_vec_x3): > > VZEROUPPER_RETURN > > # endif > > > > + .p2align 4 > > +L(cross_page_boundary): > > + /* Save pointer before aligning as its original value is necessary for > > + computer return address if byte is found or adjusting length if it > > + is not and this is memchr. */ > > + movq %rdi, %rcx > > + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for > > + rawmemchr. */ > > + andq $-VEC_SIZE, %ALGN_PTR_REG > > + VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > +# ifndef USE_AS_RAWMEMCHR > > + /* Calculate length until end of page (length checked for a match). */ > > + leal VEC_SIZE(%ALGN_PTR_REG), %esi > > + subl %ERAW_PTR_REG, %esi > > +# ifdef USE_AS_WMEMCHR > > + /* NB: Divide bytes by 4 to get wchar_t count. */ > > + shrl $2, %esi > > +# endif > > +# endif > > + /* Remove the leading bytes. */ > > + sarxl %ERAW_PTR_REG, %eax, %eax > > +# ifndef USE_AS_RAWMEMCHR > > + /* Check the end of data. */ > > + cmpq %rsi, %rdx > > + jbe L(first_vec_x0) > > +# endif > > + testl %eax, %eax > > + jz L(cross_page_continue) > > + bsfl %eax, %eax > > + addq %RRAW_PTR_REG, %rax > > + VZEROUPPER_RETURN > > + > > + > > END (MEMCHR) > > #endif > > -- > > 2.34.1 > > > > LGTM. > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > Thanks. > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH v6 7/8] x86: Shrink code size of memchr-avx2.S 2022-07-14 2:31 ` Sunil Pandey @ 2022-07-14 2:41 ` Noah Goldstein 0 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-07-14 2:41 UTC (permalink / raw) To: Sunil Pandey; +Cc: H.J. Lu, GNU C Library On Wed, Jul 13, 2022 at 7:32 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Tue, Jun 7, 2022 at 11:19 AM H.J. Lu via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > This is not meant as a performance optimization. The previous code was > > > far to liberal in aligning targets and wasted code size unnecissarily. > > > > > > The total code size saving is: 59 bytes > > > > > > There are no major changes in the benchmarks. > > > Geometric Mean of all benchmarks New / Old: 0.967 > > > > > > Full xcheck passes on x86_64. > > > --- > > > sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 1 + > > > sysdeps/x86_64/multiarch/memchr-avx2.S | 109 +++++++++++---------- > > > 2 files changed, 60 insertions(+), 50 deletions(-) > > > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S > > > index 87b076c7c4..c4d71938c5 100644 > > > --- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S > > > +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S > > > @@ -2,6 +2,7 @@ > > > # define MEMCHR __memchr_avx2_rtm > > > #endif > > > > > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > > > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S > > > index 75bd7262e0..28a01280ec 100644 > > > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S > > > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S > > > @@ -57,7 +57,7 @@ > > > # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > > > .section SECTION(.text),"ax",@progbits > > > -ENTRY (MEMCHR) > > > +ENTRY_P2ALIGN (MEMCHR, 5) > > > # ifndef USE_AS_RAWMEMCHR > > > /* Check for zero length. */ > > > # ifdef __ILP32__ > > > @@ -87,12 +87,14 @@ ENTRY (MEMCHR) > > > # endif > > > testl %eax, %eax > > > jz L(aligned_more) > > > - tzcntl %eax, %eax > > > + bsfl %eax, %eax > > > addq %rdi, %rax > > > - VZEROUPPER_RETURN > > > +L(return_vzeroupper): > > > + ZERO_UPPER_VEC_REGISTERS_RETURN > > > + > > > > > > # ifndef USE_AS_RAWMEMCHR > > > - .p2align 5 > > > + .p2align 4 > > > L(first_vec_x0): > > > /* Check if first match was before length. */ > > > tzcntl %eax, %eax > > > @@ -100,58 +102,31 @@ L(first_vec_x0): > > > /* NB: Multiply length by 4 to get byte count. */ > > > sall $2, %edx > > > # endif > > > - xorl %ecx, %ecx > > > + COND_VZEROUPPER > > > + /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch > > > + block. branch here as opposed to cmovcc is not that costly. Common > > > + usage of memchr is to check if the return was NULL (if string was > > > + known to contain CHAR user would use rawmemchr). This branch will be > > > + highly correlated with the user branch and can be used by most > > > + modern branch predictors to predict the user branch. 
*/ > > > cmpl %eax, %edx > > > - leaq (%rdi, %rax), %rax > > > - cmovle %rcx, %rax > > > - VZEROUPPER_RETURN > > > - > > > -L(null): > > > - xorl %eax, %eax > > > - ret > > > -# endif > > > - .p2align 4 > > > -L(cross_page_boundary): > > > - /* Save pointer before aligning as its original value is > > > - necessary for computer return address if byte is found or > > > - adjusting length if it is not and this is memchr. */ > > > - movq %rdi, %rcx > > > - /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr > > > - and rdi for rawmemchr. */ > > > - orq $(VEC_SIZE - 1), %ALGN_PTR_REG > > > - VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > -# ifndef USE_AS_RAWMEMCHR > > > - /* Calculate length until end of page (length checked for a > > > - match). */ > > > - leaq 1(%ALGN_PTR_REG), %rsi > > > - subq %RRAW_PTR_REG, %rsi > > > -# ifdef USE_AS_WMEMCHR > > > - /* NB: Divide bytes by 4 to get wchar_t count. */ > > > - shrl $2, %esi > > > -# endif > > > -# endif > > > - /* Remove the leading bytes. */ > > > - sarxl %ERAW_PTR_REG, %eax, %eax > > > -# ifndef USE_AS_RAWMEMCHR > > > - /* Check the end of data. */ > > > - cmpq %rsi, %rdx > > > - jbe L(first_vec_x0) > > > + jle L(null) > > > + addq %rdi, %rax > > > + ret > > > # endif > > > - testl %eax, %eax > > > - jz L(cross_page_continue) > > > - tzcntl %eax, %eax > > > - addq %RRAW_PTR_REG, %rax > > > -L(return_vzeroupper): > > > - ZERO_UPPER_VEC_REGISTERS_RETURN > > > > > > - .p2align 4 > > > + .p2align 4,, 10 > > > L(first_vec_x1): > > > - tzcntl %eax, %eax > > > + bsfl %eax, %eax > > > incq %rdi > > > addq %rdi, %rax > > > VZEROUPPER_RETURN > > > - > > > +# ifndef USE_AS_RAWMEMCHR > > > + /* First in aligning bytes here. */ > > > +L(null): > > > + xorl %eax, %eax > > > + ret > > > +# endif > > > .p2align 4 > > > L(first_vec_x2): > > > tzcntl %eax, %eax > > > @@ -340,7 +315,7 @@ L(first_vec_x1_check): > > > incq %rdi > > > addq %rdi, %rax > > > VZEROUPPER_RETURN > > > - .p2align 4 > > > + .p2align 4,, 6 > > > L(set_zero_end): > > > xorl %eax, %eax > > > VZEROUPPER_RETURN > > > @@ -428,5 +403,39 @@ L(last_vec_x3): > > > VZEROUPPER_RETURN > > > # endif > > > > > > + .p2align 4 > > > +L(cross_page_boundary): > > > + /* Save pointer before aligning as its original value is necessary for > > > + computer return address if byte is found or adjusting length if it > > > + is not and this is memchr. */ > > > + movq %rdi, %rcx > > > + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for > > > + rawmemchr. */ > > > + andq $-VEC_SIZE, %ALGN_PTR_REG > > > + VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > +# ifndef USE_AS_RAWMEMCHR > > > + /* Calculate length until end of page (length checked for a match). */ > > > + leal VEC_SIZE(%ALGN_PTR_REG), %esi > > > + subl %ERAW_PTR_REG, %esi > > > +# ifdef USE_AS_WMEMCHR > > > + /* NB: Divide bytes by 4 to get wchar_t count. */ > > > + shrl $2, %esi > > > +# endif > > > +# endif > > > + /* Remove the leading bytes. */ > > > + sarxl %ERAW_PTR_REG, %eax, %eax > > > +# ifndef USE_AS_RAWMEMCHR > > > + /* Check the end of data. */ > > > + cmpq %rsi, %rdx > > > + jbe L(first_vec_x0) > > > +# endif > > > + testl %eax, %eax > > > + jz L(cross_page_continue) > > > + bsfl %eax, %eax > > > + addq %RRAW_PTR_REG, %rax > > > + VZEROUPPER_RETURN > > > + > > > + > > > END (MEMCHR) > > > #endif > > > -- > > > 2.34.1 > > > > > > > LGTM. > > > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > > > Thanks. > > > > -- > > H.J. 
> > I would like to backport this patch to release branches. > Any comments or objections? Probably best to squash with: https://sourceware.org/git/?p=glibc.git;a=commit;h=2c9af8421d2b4a7fcce163e7bc81a118d22fd346 > > --Sunil ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH v6 8/8] x86: Shrink code size of memchr-evex.S 2022-06-07 4:11 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (5 preceding siblings ...) 2022-06-07 4:11 ` [PATCH v6 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein @ 2022-06-07 4:11 ` Noah Goldstein 2022-06-07 18:19 ` H.J. Lu 2022-06-07 18:04 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library H.J. Lu 7 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-07 4:11 UTC (permalink / raw) To: libc-alpha This is not meant as a performance optimization. The previous code was far to liberal in aligning targets and wasted code size unnecissarily. The total code size saving is: 64 bytes There are no non-negligible changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 1.000 Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memchr-evex.S | 46 ++++++++++++++------------ 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S index cfaf02907d..0fd11b7632 100644 --- a/sysdeps/x86_64/multiarch/memchr-evex.S +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -88,7 +88,7 @@ # define PAGE_SIZE 4096 .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 6) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ test %RDX_LP, %RDX_LP @@ -131,22 +131,24 @@ L(zero): xorl %eax, %eax ret - .p2align 5 + .p2align 4 L(first_vec_x0): - /* Check if first match was before length. */ - tzcntl %eax, %eax - xorl %ecx, %ecx - cmpl %eax, %edx - leaq (%rdi, %rax, CHAR_SIZE), %rax - cmovle %rcx, %rax + /* Check if first match was before length. NB: tzcnt has false data- + dependency on destination. eax already had a data-dependency on esi + so this should have no affect here. */ + tzcntl %eax, %esi +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rsi, CHAR_SIZE), %rdi +# else + addq %rsi, %rdi +# endif + xorl %eax, %eax + cmpl %esi, %edx + cmovg %rdi, %rax ret -# else - /* NB: first_vec_x0 is 17 bytes which will leave - cross_page_boundary (which is relatively cold) close enough - to ideal alignment. So only realign L(cross_page_boundary) if - rawmemchr. */ - .p2align 4 # endif + + .p2align 4 L(cross_page_boundary): /* Save pointer before aligning as its original value is necessary for computer return address if byte is found or @@ -400,10 +402,14 @@ L(last_2x_vec): L(zero_end): ret +L(set_zero_end): + xorl %eax, %eax + ret .p2align 4 L(first_vec_x1_check): - tzcntl %eax, %eax + /* eax must be non-zero. Use bsfl to save code size. */ + bsfl %eax, %eax /* Adjust length. */ subl $-(CHAR_PER_VEC * 4), %edx /* Check if match within remaining length. */ @@ -412,9 +418,6 @@ L(first_vec_x1_check): /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. 
*/ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ret -L(set_zero_end): - xorl %eax, %eax - ret .p2align 4 L(loop_4x_vec_end): @@ -464,7 +467,7 @@ L(loop_4x_vec_end): # endif ret - .p2align 4 + .p2align 4,, 10 L(last_vec_x1_return): tzcntl %eax, %eax # if defined USE_AS_WMEMCHR || RET_OFFSET != 0 @@ -496,6 +499,7 @@ L(last_vec_x3_return): # endif # ifndef USE_AS_RAWMEMCHR + .p2align 4,, 5 L(last_4x_vec_or_less_cmpeq): VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 kmovd %k0, %eax @@ -546,7 +550,7 @@ L(last_4x_vec): # endif andl %ecx, %eax jz L(zero_end2) - tzcntl %eax, %eax + bsfl %eax, %eax leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax L(zero_end2): ret @@ -562,6 +566,6 @@ L(last_vec_x3): leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ret # endif - + /* 7 bytes from next cache line. */ END (MEMCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
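The reworked L(first_vec_x0) in this patch packs two ideas into five instructions: counting into a scratch register (esi) to sidestep tzcnt's false output dependency, and selecting between the candidate pointer and NULL with cmovg. Restated as C, as a sketch only; for wmemchr the index is additionally scaled by CHAR_SIZE:

#include <stdint.h>
#include <stddef.h>

static const unsigned char *
first_vec_x0 (const unsigned char *p, uint32_t mask, uint32_t len)
{
  /* Hardware tzcnt defines the zero case as the operand width, so a
     zero mask yields idx == 32, which the length compare then rejects.
     __builtin_ctz leaves zero undefined, hence the explicit guard in
     this sketch.  */
  uint32_t idx = mask ? (uint32_t) __builtin_ctz (mask) : 32; /* tzcntl */
  const unsigned char *cand = p + idx;         /* addq %rsi, %rdi */
  /* xorl %eax,%eax; cmpl %esi,%edx; cmovg %rdi,%rax  */
  return (int32_t) len > (int32_t) idx ? cand : NULL;
}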
* Re: [PATCH v6 8/8] x86: Shrink code size of memchr-evex.S 2022-06-07 4:11 ` [PATCH v6 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein @ 2022-06-07 18:19 ` H.J. Lu 2022-07-14 2:32 ` Sunil Pandey 0 siblings, 1 reply; 82+ messages in thread From: H.J. Lu @ 2022-06-07 18:19 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > This is not meant as a performance optimization. The previous code was > far to liberal in aligning targets and wasted code size unnecissarily. > > The total code size saving is: 64 bytes > > There are no non-negligible changes in the benchmarks. > Geometric Mean of all benchmarks New / Old: 1.000 > > Full xcheck passes on x86_64. > --- > sysdeps/x86_64/multiarch/memchr-evex.S | 46 ++++++++++++++------------ > 1 file changed, 25 insertions(+), 21 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S > index cfaf02907d..0fd11b7632 100644 > --- a/sysdeps/x86_64/multiarch/memchr-evex.S > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S > @@ -88,7 +88,7 @@ > # define PAGE_SIZE 4096 > > .section SECTION(.text),"ax",@progbits > -ENTRY (MEMCHR) > +ENTRY_P2ALIGN (MEMCHR, 6) > # ifndef USE_AS_RAWMEMCHR > /* Check for zero length. */ > test %RDX_LP, %RDX_LP > @@ -131,22 +131,24 @@ L(zero): > xorl %eax, %eax > ret > > - .p2align 5 > + .p2align 4 > L(first_vec_x0): > - /* Check if first match was before length. */ > - tzcntl %eax, %eax > - xorl %ecx, %ecx > - cmpl %eax, %edx > - leaq (%rdi, %rax, CHAR_SIZE), %rax > - cmovle %rcx, %rax > + /* Check if first match was before length. NB: tzcnt has false data- > + dependency on destination. eax already had a data-dependency on esi > + so this should have no affect here. */ > + tzcntl %eax, %esi > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rsi, CHAR_SIZE), %rdi > +# else > + addq %rsi, %rdi > +# endif > + xorl %eax, %eax > + cmpl %esi, %edx > + cmovg %rdi, %rax > ret > -# else > - /* NB: first_vec_x0 is 17 bytes which will leave > - cross_page_boundary (which is relatively cold) close enough > - to ideal alignment. So only realign L(cross_page_boundary) if > - rawmemchr. */ > - .p2align 4 > # endif > + > + .p2align 4 > L(cross_page_boundary): > /* Save pointer before aligning as its original value is > necessary for computer return address if byte is found or > @@ -400,10 +402,14 @@ L(last_2x_vec): > L(zero_end): > ret > > +L(set_zero_end): > + xorl %eax, %eax > + ret > > .p2align 4 > L(first_vec_x1_check): > - tzcntl %eax, %eax > + /* eax must be non-zero. Use bsfl to save code size. */ > + bsfl %eax, %eax > /* Adjust length. */ > subl $-(CHAR_PER_VEC * 4), %edx > /* Check if match within remaining length. */ > @@ -412,9 +418,6 @@ L(first_vec_x1_check): > /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. 
*/ > leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax > ret > -L(set_zero_end): > - xorl %eax, %eax > - ret > > .p2align 4 > L(loop_4x_vec_end): > @@ -464,7 +467,7 @@ L(loop_4x_vec_end): > # endif > ret > > - .p2align 4 > + .p2align 4,, 10 > L(last_vec_x1_return): > tzcntl %eax, %eax > # if defined USE_AS_WMEMCHR || RET_OFFSET != 0 > @@ -496,6 +499,7 @@ L(last_vec_x3_return): > # endif > > # ifndef USE_AS_RAWMEMCHR > + .p2align 4,, 5 > L(last_4x_vec_or_less_cmpeq): > VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 > kmovd %k0, %eax > @@ -546,7 +550,7 @@ L(last_4x_vec): > # endif > andl %ecx, %eax > jz L(zero_end2) > - tzcntl %eax, %eax > + bsfl %eax, %eax > leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > L(zero_end2): > ret > @@ -562,6 +566,6 @@ L(last_vec_x3): > leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > ret > # endif > - > + /* 7 bytes from next cache line. */ > END (MEMCHR) > #endif > -- > 2.34.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
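A footnote on the bsfl-for-tzcntl substitutions that this patch and 7/8 both make: the two instructions agree for any non-zero source; only tzcnt defines the zero case (it returns the operand width), while bsf has a one-byte-shorter encoding (no F3 prefix). The swap is therefore valid exactly on the paths where the mask is proven non-zero, which is what the "eax must be non-zero" comments assert. Modeled in C as a sketch; __builtin_ctz mirrors bsf in leaving zero undefined:

#include <stdint.h>
#include <assert.h>

/* bsf side: caller must guarantee a non-zero mask.  */
static uint32_t
lowest_set_bit (uint32_t x)
{
  assert (x != 0);                   /* mirrors "eax must be non-zero" */
  return (uint32_t) __builtin_ctz (x);           /* bsfl */
}

/* tzcnt side: zero is defined as the operand width.  */
static uint32_t
tzcnt32 (uint32_t x)
{
  return x ? (uint32_t) __builtin_ctz (x) : 32;  /* tzcntl */
}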
* Re: [PATCH v6 8/8] x86: Shrink code size of memchr-evex.S 2022-06-07 18:19 ` H.J. Lu @ 2022-07-14 2:32 ` Sunil Pandey 0 siblings, 0 replies; 82+ messages in thread From: Sunil Pandey @ 2022-07-14 2:32 UTC (permalink / raw) To: H.J. Lu; +Cc: Noah Goldstein, GNU C Library On Tue, Jun 7, 2022 at 11:20 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > This is not meant as a performance optimization. The previous code was > > far to liberal in aligning targets and wasted code size unnecissarily. > > > > The total code size saving is: 64 bytes > > > > There are no non-negligible changes in the benchmarks. > > Geometric Mean of all benchmarks New / Old: 1.000 > > > > Full xcheck passes on x86_64. > > --- > > sysdeps/x86_64/multiarch/memchr-evex.S | 46 ++++++++++++++------------ > > 1 file changed, 25 insertions(+), 21 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S > > index cfaf02907d..0fd11b7632 100644 > > --- a/sysdeps/x86_64/multiarch/memchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S > > @@ -88,7 +88,7 @@ > > # define PAGE_SIZE 4096 > > > > .section SECTION(.text),"ax",@progbits > > -ENTRY (MEMCHR) > > +ENTRY_P2ALIGN (MEMCHR, 6) > > # ifndef USE_AS_RAWMEMCHR > > /* Check for zero length. */ > > test %RDX_LP, %RDX_LP > > @@ -131,22 +131,24 @@ L(zero): > > xorl %eax, %eax > > ret > > > > - .p2align 5 > > + .p2align 4 > > L(first_vec_x0): > > - /* Check if first match was before length. */ > > - tzcntl %eax, %eax > > - xorl %ecx, %ecx > > - cmpl %eax, %edx > > - leaq (%rdi, %rax, CHAR_SIZE), %rax > > - cmovle %rcx, %rax > > + /* Check if first match was before length. NB: tzcnt has false data- > > + dependency on destination. eax already had a data-dependency on esi > > + so this should have no affect here. */ > > + tzcntl %eax, %esi > > +# ifdef USE_AS_WMEMCHR > > + leaq (%rdi, %rsi, CHAR_SIZE), %rdi > > +# else > > + addq %rsi, %rdi > > +# endif > > + xorl %eax, %eax > > + cmpl %esi, %edx > > + cmovg %rdi, %rax > > ret > > -# else > > - /* NB: first_vec_x0 is 17 bytes which will leave > > - cross_page_boundary (which is relatively cold) close enough > > - to ideal alignment. So only realign L(cross_page_boundary) if > > - rawmemchr. */ > > - .p2align 4 > > # endif > > + > > + .p2align 4 > > L(cross_page_boundary): > > /* Save pointer before aligning as its original value is > > necessary for computer return address if byte is found or > > @@ -400,10 +402,14 @@ L(last_2x_vec): > > L(zero_end): > > ret > > > > +L(set_zero_end): > > + xorl %eax, %eax > > + ret > > > > .p2align 4 > > L(first_vec_x1_check): > > - tzcntl %eax, %eax > > + /* eax must be non-zero. Use bsfl to save code size. */ > > + bsfl %eax, %eax > > /* Adjust length. */ > > subl $-(CHAR_PER_VEC * 4), %edx > > /* Check if match within remaining length. */ > > @@ -412,9 +418,6 @@ L(first_vec_x1_check): > > /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. 
*/ > > leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax > > ret > > -L(set_zero_end): > > - xorl %eax, %eax > > - ret > > > > .p2align 4 > > L(loop_4x_vec_end): > > @@ -464,7 +467,7 @@ L(loop_4x_vec_end): > > # endif > > ret > > > > - .p2align 4 > > + .p2align 4,, 10 > > L(last_vec_x1_return): > > tzcntl %eax, %eax > > # if defined USE_AS_WMEMCHR || RET_OFFSET != 0 > > @@ -496,6 +499,7 @@ L(last_vec_x3_return): > > # endif > > > > # ifndef USE_AS_RAWMEMCHR > > + .p2align 4,, 5 > > L(last_4x_vec_or_less_cmpeq): > > VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 > > kmovd %k0, %eax > > @@ -546,7 +550,7 @@ L(last_4x_vec): > > # endif > > andl %ecx, %eax > > jz L(zero_end2) > > - tzcntl %eax, %eax > > + bsfl %eax, %eax > > leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > > L(zero_end2): > > ret > > @@ -562,6 +566,6 @@ L(last_vec_x3): > > leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > ret > > # endif > > - > > + /* 7 bytes from next cache line. */ > > END (MEMCHR) > > #endif > > -- > > 2.34.1 > > > > LGTM. > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > Thanks. > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library 2022-06-07 4:11 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (6 preceding siblings ...) 2022-06-07 4:11 ` [PATCH v6 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein @ 2022-06-07 18:04 ` H.J. Lu 2022-07-14 2:07 ` Sunil Pandey 7 siblings, 1 reply; 82+ messages in thread From: H.J. Lu @ 2022-06-07 18:04 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > This patch does not touch any existing code and is only meant to be a > tool for future patches so that simple source files can more easily be > maintained to target multiple VEC classes. > > There is no difference in the objdump of libc.so before and after this > patch. > --- > sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 34 ++++++++ > sysdeps/x86_64/multiarch/avx-vecs.h | 47 +++++++++++ > sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 +++++++++ > sysdeps/x86_64/multiarch/evex256-vecs.h | 35 ++++++++ > sysdeps/x86_64/multiarch/evex512-vecs.h | 35 ++++++++ > sysdeps/x86_64/multiarch/sse2-vecs.h | 47 +++++++++++ > sysdeps/x86_64/multiarch/vec-macros.h | 90 +++++++++++++++++++++ > 7 files changed, 327 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h > create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h > > diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > new file mode 100644 > index 0000000000..3f531dd47f > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > @@ -0,0 +1,34 @@ > +/* Common config for AVX-RTM VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _AVX_RTM_VECS_H > +#define _AVX_RTM_VECS_H 1 > + > +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ > + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > + > +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) > + > +#define USE_WITH_RTM 1 > +#include "avx-vecs.h" > + > +#undef SECTION > +#define SECTION(p) p##.avx.rtm > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h > new file mode 100644 > index 0000000000..89680f5db8 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/avx-vecs.h > @@ -0,0 +1,47 @@ > +/* Common config for AVX VECs > + All versions must be listed in ifunc-impl-list.c. 
> + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _AVX_VECS_H > +#define _AVX_VECS_H 1 > + > +#ifdef VEC_SIZE > +# error "Multiple VEC configs included!" > +#endif > + > +#define VEC_SIZE 32 > +#include "vec-macros.h" > + > +#define USE_WITH_AVX 1 > +#define SECTION(p) p##.avx > + > +/* 4-byte mov instructions with AVX2. */ > +#define MOV_SIZE 4 > +/* 1 (ret) + 3 (vzeroupper). */ > +#define RET_SIZE 4 > +#define VZEROUPPER vzeroupper > + > +#define VMOVU vmovdqu > +#define VMOVA vmovdqa > +#define VMOVNT vmovntdq > + > +/* Often need to access xmm portion. */ > +#define VEC_xmm VEC_any_xmm > +#define VEC VEC_any_ymm > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h > new file mode 100644 > index 0000000000..99806ebcd7 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/evex-vecs-common.h > @@ -0,0 +1,39 @@ > +/* Common config for EVEX256 and EVEX512 VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _EVEX_VECS_COMMON_H > +#define _EVEX_VECS_COMMON_H 1 > + > +#include "vec-macros.h" > + > +/* 6-byte mov instructions with EVEX. */ > +#define MOV_SIZE 6 > +/* No vzeroupper needed. */ > +#define RET_SIZE 1 > +#define VZEROUPPER > + > +#define VMOVU vmovdqu64 > +#define VMOVA vmovdqa64 > +#define VMOVNT vmovntdq > + > +#define VEC_xmm VEC_hi_xmm > +#define VEC_ymm VEC_hi_ymm > +#define VEC_zmm VEC_hi_zmm > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h > new file mode 100644 > index 0000000000..222ba46dc7 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h > @@ -0,0 +1,35 @@ > +/* Common config for EVEX256 VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _EVEX256_VECS_H > +#define _EVEX256_VECS_H 1 > + > +#ifdef VEC_SIZE > +# error "Multiple VEC configs included!" > +#endif > + > +#define VEC_SIZE 32 > +#include "evex-vecs-common.h" > + > +#define USE_WITH_EVEX256 1 > +#define SECTION(p) p##.evex > + > +#define VEC VEC_ymm > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h > new file mode 100644 > index 0000000000..d1784d5368 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h > @@ -0,0 +1,35 @@ > +/* Common config for EVEX512 VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _EVEX512_VECS_H > +#define _EVEX512_VECS_H 1 > + > +#ifdef VEC_SIZE > +# error "Multiple VEC configs included!" > +#endif > + > +#define VEC_SIZE 64 > +#include "evex-vecs-common.h" > + > +#define USE_WITH_EVEX512 1 > +#define SECTION(p) p##.evex512 > + > +#define VEC VEC_zmm > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h > new file mode 100644 > index 0000000000..2b77a59d56 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/sse2-vecs.h > @@ -0,0 +1,47 @@ > +/* Common config for SSE2 VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. 
> + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _SSE2_VECS_H > +#define _SSE2_VECS_H 1 > + > +#ifdef VEC_SIZE > +# error "Multiple VEC configs included!" > +#endif > + > +#define VEC_SIZE 16 > +#include "vec-macros.h" > + > +#define USE_WITH_SSE2 1 > +#define SECTION(p) p > + > +/* 3-byte mov instructions with SSE2. */ > +#define MOV_SIZE 3 > +/* No vzeroupper needed. */ > +#define RET_SIZE 1 > +#define VZEROUPPER > + > +#define VMOVU movups > +#define VMOVA movaps > +#define VMOVNT movntdq > + > +#define VEC_xmm VEC_any_xmm > +#define VEC VEC_any_xmm > + > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h > new file mode 100644 > index 0000000000..9f3ffecede > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/vec-macros.h > @@ -0,0 +1,90 @@ > +/* Macro helpers for VEC_{type}({vec_num}) > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _VEC_MACROS_H > +#define _VEC_MACROS_H 1 > + > +#ifndef VEC_SIZE > +# error "Never include this file directly. Always include a vector config." > +#endif > + > +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same > + VEC(N) values. 
*/ > +#define VEC_hi_xmm0 xmm16 > +#define VEC_hi_xmm1 xmm17 > +#define VEC_hi_xmm2 xmm18 > +#define VEC_hi_xmm3 xmm19 > +#define VEC_hi_xmm4 xmm20 > +#define VEC_hi_xmm5 xmm21 > +#define VEC_hi_xmm6 xmm22 > +#define VEC_hi_xmm7 xmm23 > +#define VEC_hi_xmm8 xmm24 > +#define VEC_hi_xmm9 xmm25 > +#define VEC_hi_xmm10 xmm26 > +#define VEC_hi_xmm11 xmm27 > +#define VEC_hi_xmm12 xmm28 > +#define VEC_hi_xmm13 xmm29 > +#define VEC_hi_xmm14 xmm30 > +#define VEC_hi_xmm15 xmm31 > + > +#define VEC_hi_ymm0 ymm16 > +#define VEC_hi_ymm1 ymm17 > +#define VEC_hi_ymm2 ymm18 > +#define VEC_hi_ymm3 ymm19 > +#define VEC_hi_ymm4 ymm20 > +#define VEC_hi_ymm5 ymm21 > +#define VEC_hi_ymm6 ymm22 > +#define VEC_hi_ymm7 ymm23 > +#define VEC_hi_ymm8 ymm24 > +#define VEC_hi_ymm9 ymm25 > +#define VEC_hi_ymm10 ymm26 > +#define VEC_hi_ymm11 ymm27 > +#define VEC_hi_ymm12 ymm28 > +#define VEC_hi_ymm13 ymm29 > +#define VEC_hi_ymm14 ymm30 > +#define VEC_hi_ymm15 ymm31 > + > +#define VEC_hi_zmm0 zmm16 > +#define VEC_hi_zmm1 zmm17 > +#define VEC_hi_zmm2 zmm18 > +#define VEC_hi_zmm3 zmm19 > +#define VEC_hi_zmm4 zmm20 > +#define VEC_hi_zmm5 zmm21 > +#define VEC_hi_zmm6 zmm22 > +#define VEC_hi_zmm7 zmm23 > +#define VEC_hi_zmm8 zmm24 > +#define VEC_hi_zmm9 zmm25 > +#define VEC_hi_zmm10 zmm26 > +#define VEC_hi_zmm11 zmm27 > +#define VEC_hi_zmm12 zmm28 > +#define VEC_hi_zmm13 zmm29 > +#define VEC_hi_zmm14 zmm30 > +#define VEC_hi_zmm15 zmm31 > + > +#define PRIMITIVE_VEC(vec, num) vec##num > + > +#define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) > +#define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) > +#define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) > + > +#define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) > +#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) > +#define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) > + > +#endif > -- > 2.34.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 82+ messages in thread
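For anyone reading vec-macros.h for the first time: everything above is plain C-preprocessor token pasting, so a single VEC(N) spelling in shared source can resolve to xmmN/ymmN for the SSE2/AVX configs or to the EVEX-only high registers (xmm16 and up) for the EVEX configs. A reduced, compilable sketch of the two-step expansion (the macro subset is copied from the patch; the stringize-and-check harness is just a hypothetical way to demonstrate it):

    #define PRIMITIVE_VEC(vec, num) vec##num

    #define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i)
    #define VEC_hi_zmm5 zmm21
    #define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i)

    #define STR_(x) #x
    #define STR(x) STR_(x)

    /* VEC_any_ymm(3) pastes directly to the token ymm3.  VEC_hi_zmm(5)
       first pastes to the macro name VEC_hi_zmm5, which then rescans and
       expands to zmm21, an EVEX-only register.  */
    _Static_assert (sizeof (STR (VEC_any_ymm (3))) == sizeof ("ymm3"),
                    "direct paste");
    _Static_assert (sizeof (STR (VEC_hi_zmm (5))) == sizeof ("zmm21"),
                    "two-step paste");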
* Re: [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library 2022-06-07 18:04 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library H.J. Lu @ 2022-07-14 2:07 ` Sunil Pandey 0 siblings, 0 replies; 82+ messages in thread From: Sunil Pandey @ 2022-07-14 2:07 UTC (permalink / raw) To: H.J. Lu; +Cc: Noah Goldstein, GNU C Library On Tue, Jun 7, 2022 at 11:05 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > This patch does not touch any existing code and is only meant to be a > > tool for future patches so that simple source files can more easily be > > maintained to target multiple VEC classes. > > > > There is no difference in the objdump of libc.so before and after this > > patch. > > --- > > sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 34 ++++++++ > > sysdeps/x86_64/multiarch/avx-vecs.h | 47 +++++++++++ > > sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 +++++++++ > > sysdeps/x86_64/multiarch/evex256-vecs.h | 35 ++++++++ > > sysdeps/x86_64/multiarch/evex512-vecs.h | 35 ++++++++ > > sysdeps/x86_64/multiarch/sse2-vecs.h | 47 +++++++++++ > > sysdeps/x86_64/multiarch/vec-macros.h | 90 +++++++++++++++++++++ > > 7 files changed, 327 insertions(+) > > create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h > > create mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h > > create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h > > create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h > > create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h > > create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h > > > > diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > new file mode 100644 > > index 0000000000..3f531dd47f > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > > @@ -0,0 +1,34 @@ > > +/* Common config for AVX-RTM VECs > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. 
*/ > > + > > +#ifndef _AVX_RTM_VECS_H > > +#define _AVX_RTM_VECS_H 1 > > + > > +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ > > + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > + > > +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) > > + > > +#define USE_WITH_RTM 1 > > +#include "avx-vecs.h" > > + > > +#undef SECTION > > +#define SECTION(p) p##.avx.rtm > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h > > new file mode 100644 > > index 0000000000..89680f5db8 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/avx-vecs.h > > @@ -0,0 +1,47 @@ > > +/* Common config for AVX VECs > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _AVX_VECS_H > > +#define _AVX_VECS_H 1 > > + > > +#ifdef VEC_SIZE > > +# error "Multiple VEC configs included!" > > +#endif > > + > > +#define VEC_SIZE 32 > > +#include "vec-macros.h" > > + > > +#define USE_WITH_AVX 1 > > +#define SECTION(p) p##.avx > > + > > +/* 4-byte mov instructions with AVX2. */ > > +#define MOV_SIZE 4 > > +/* 1 (ret) + 3 (vzeroupper). */ > > +#define RET_SIZE 4 > > +#define VZEROUPPER vzeroupper > > + > > +#define VMOVU vmovdqu > > +#define VMOVA vmovdqa > > +#define VMOVNT vmovntdq > > + > > +/* Often need to access xmm portion. */ > > +#define VEC_xmm VEC_any_xmm > > +#define VEC VEC_any_ymm > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h > > new file mode 100644 > > index 0000000000..99806ebcd7 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/evex-vecs-common.h > > @@ -0,0 +1,39 @@ > > +/* Common config for EVEX256 and EVEX512 VECs > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _EVEX_VECS_COMMON_H > > +#define _EVEX_VECS_COMMON_H 1 > > + > > +#include "vec-macros.h" > > + > > +/* 6-byte mov instructions with EVEX. 
*/ > > +#define MOV_SIZE 6 > > +/* No vzeroupper needed. */ > > +#define RET_SIZE 1 > > +#define VZEROUPPER > > + > > +#define VMOVU vmovdqu64 > > +#define VMOVA vmovdqa64 > > +#define VMOVNT vmovntdq > > + > > +#define VEC_xmm VEC_hi_xmm > > +#define VEC_ymm VEC_hi_ymm > > +#define VEC_zmm VEC_hi_zmm > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h > > new file mode 100644 > > index 0000000000..222ba46dc7 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h > > @@ -0,0 +1,35 @@ > > +/* Common config for EVEX256 VECs > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _EVEX256_VECS_H > > +#define _EVEX256_VECS_H 1 > > + > > +#ifdef VEC_SIZE > > +# error "Multiple VEC configs included!" > > +#endif > > + > > +#define VEC_SIZE 32 > > +#include "evex-vecs-common.h" > > + > > +#define USE_WITH_EVEX256 1 > > +#define SECTION(p) p##.evex > > + > > +#define VEC VEC_ymm > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h > > new file mode 100644 > > index 0000000000..d1784d5368 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h > > @@ -0,0 +1,35 @@ > > +/* Common config for EVEX512 VECs > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _EVEX512_VECS_H > > +#define _EVEX512_VECS_H 1 > > + > > +#ifdef VEC_SIZE > > +# error "Multiple VEC configs included!" 
> > +#endif > > + > > +#define VEC_SIZE 64 > > +#include "evex-vecs-common.h" > > + > > +#define USE_WITH_EVEX512 1 > > +#define SECTION(p) p##.evex512 > > + > > +#define VEC VEC_zmm > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h > > new file mode 100644 > > index 0000000000..2b77a59d56 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/sse2-vecs.h > > @@ -0,0 +1,47 @@ > > +/* Common config for SSE2 VECs > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _SSE2_VECS_H > > +#define _SSE2_VECS_H 1 > > + > > +#ifdef VEC_SIZE > > +# error "Multiple VEC configs included!" > > +#endif > > + > > +#define VEC_SIZE 16 > > +#include "vec-macros.h" > > + > > +#define USE_WITH_SSE2 1 > > +#define SECTION(p) p > > + > > +/* 3-byte mov instructions with SSE2. */ > > +#define MOV_SIZE 3 > > +/* No vzeroupper needed. */ > > +#define RET_SIZE 1 > > +#define VZEROUPPER > > + > > +#define VMOVU movups > > +#define VMOVA movaps > > +#define VMOVNT movntdq > > + > > +#define VEC_xmm VEC_any_xmm > > +#define VEC VEC_any_xmm > > + > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h > > new file mode 100644 > > index 0000000000..9f3ffecede > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/vec-macros.h > > @@ -0,0 +1,90 @@ > > +/* Macro helpers for VEC_{type}({vec_num}) > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _VEC_MACROS_H > > +#define _VEC_MACROS_H 1 > > + > > +#ifndef VEC_SIZE > > +# error "Never include this file directly. Always include a vector config." > > +#endif > > + > > +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same > > + VEC(N) values. 
*/ > > +#define VEC_hi_xmm0 xmm16 > > +#define VEC_hi_xmm1 xmm17 > > +#define VEC_hi_xmm2 xmm18 > > +#define VEC_hi_xmm3 xmm19 > > +#define VEC_hi_xmm4 xmm20 > > +#define VEC_hi_xmm5 xmm21 > > +#define VEC_hi_xmm6 xmm22 > > +#define VEC_hi_xmm7 xmm23 > > +#define VEC_hi_xmm8 xmm24 > > +#define VEC_hi_xmm9 xmm25 > > +#define VEC_hi_xmm10 xmm26 > > +#define VEC_hi_xmm11 xmm27 > > +#define VEC_hi_xmm12 xmm28 > > +#define VEC_hi_xmm13 xmm29 > > +#define VEC_hi_xmm14 xmm30 > > +#define VEC_hi_xmm15 xmm31 > > + > > +#define VEC_hi_ymm0 ymm16 > > +#define VEC_hi_ymm1 ymm17 > > +#define VEC_hi_ymm2 ymm18 > > +#define VEC_hi_ymm3 ymm19 > > +#define VEC_hi_ymm4 ymm20 > > +#define VEC_hi_ymm5 ymm21 > > +#define VEC_hi_ymm6 ymm22 > > +#define VEC_hi_ymm7 ymm23 > > +#define VEC_hi_ymm8 ymm24 > > +#define VEC_hi_ymm9 ymm25 > > +#define VEC_hi_ymm10 ymm26 > > +#define VEC_hi_ymm11 ymm27 > > +#define VEC_hi_ymm12 ymm28 > > +#define VEC_hi_ymm13 ymm29 > > +#define VEC_hi_ymm14 ymm30 > > +#define VEC_hi_ymm15 ymm31 > > + > > +#define VEC_hi_zmm0 zmm16 > > +#define VEC_hi_zmm1 zmm17 > > +#define VEC_hi_zmm2 zmm18 > > +#define VEC_hi_zmm3 zmm19 > > +#define VEC_hi_zmm4 zmm20 > > +#define VEC_hi_zmm5 zmm21 > > +#define VEC_hi_zmm6 zmm22 > > +#define VEC_hi_zmm7 zmm23 > > +#define VEC_hi_zmm8 zmm24 > > +#define VEC_hi_zmm9 zmm25 > > +#define VEC_hi_zmm10 zmm26 > > +#define VEC_hi_zmm11 zmm27 > > +#define VEC_hi_zmm12 zmm28 > > +#define VEC_hi_zmm13 zmm29 > > +#define VEC_hi_zmm14 zmm30 > > +#define VEC_hi_zmm15 zmm31 > > + > > +#define PRIMITIVE_VEC(vec, num) vec##num > > + > > +#define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) > > +#define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) > > +#define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) > > + > > +#define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) > > +#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) > > +#define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) > > + > > +#endif > > -- > > 2.34.1 > > > > LGTM. > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > Thanks. > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil ^ permalink raw reply [flat|nested] 82+ messages in thread
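One more orientation note, since the same headers are quoted twice in this subthread: the "#ifdef VEC_SIZE / #error" guard enforces that each translation unit picks exactly one config, after which shared code is written once against VEC_SIZE and friends. A reduced C analogue (hypothetical file, not part of glibc) of how a consumer is expected to look:

    /* Pretend this file is the evex512 flavor of some routine; the
       define below stands in for #include "evex512-vecs.h".  A second
       config include would now trip the #error at preprocess time.  */
    #ifdef VEC_SIZE
    # error "Multiple VEC configs included!"
    #endif
    #define VEC_SIZE 64

    #include <stddef.h>

    /* Shared logic written once against the config, the way the asm
       bodies round with `andq $-(VEC_SIZE * 4), ...`.  */
    static size_t
    round_down_to_4x_vec (size_t len)
    {
      return len & ~(size_t) (4 * VEC_SIZE - 1);
    }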
* [PATCH v1 3/8] Benchtests: Improve memrchr benchmarks 2022-06-03 4:42 [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein @ 2022-06-03 4:42 ` Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein ` (5 subsequent siblings) 7 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 4:42 UTC (permalink / raw) To: libc-alpha Add a second iteration for memrchr to set `pos` starting from the end of the buffer. Previously `pos` was only set relative to the beginning of the buffer. This isn't really useful for memrchr because the beginning of the search space is (buf + len). --- benchtests/bench-memchr.c | 110 ++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 45 deletions(-) diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c index 4d7212332f..0facda2fa0 100644 --- a/benchtests/bench-memchr.c +++ b/benchtests/bench-memchr.c @@ -76,7 +76,7 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c, static void do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, - int seek_char) + int seek_char, int invert_pos) { size_t i; @@ -96,7 +96,10 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, if (pos < len) { - buf[align + pos] = seek_char; + if (invert_pos) + buf[align + len - pos] = seek_char; + else + buf[align + pos] = seek_char; buf[align + len] = -seek_char; } else @@ -109,6 +112,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, json_attr_uint (json_ctx, "pos", pos); json_attr_uint (json_ctx, "len", len); json_attr_uint (json_ctx, "seek_char", seek_char); + json_attr_uint (json_ctx, "invert_pos", invert_pos); json_array_begin (json_ctx, "timings"); @@ -123,6 +127,7 @@ int test_main (void) { size_t i; + int repeats; json_ctx_t json_ctx; test_init (); @@ -142,53 +147,68 @@ test_main (void) json_array_begin (&json_ctx, "results"); - for (i = 1; i < 8; ++i) + for (repeats = 0; repeats < 2; ++repeats) { - do_test (&json_ctx, 0, 16 << i, 2048, 23); - do_test (&json_ctx, i, 64, 256, 23); - do_test (&json_ctx, 0, 16 << i, 2048, 0); - do_test (&json_ctx, i, 64, 256, 0); - - do_test (&json_ctx, getpagesize () - 15, 64, 256, 0); + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats); + do_test (&json_ctx, i, 64, 256, 23, repeats); + do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats); + do_test (&json_ctx, i, 64, 256, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats); #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. */ - do_test (&json_ctx, 0, i, 256, 23); - do_test (&json_ctx, 0, i, 256, 0); - do_test (&json_ctx, i, i, 256, 23); - do_test (&json_ctx, i, i, 256, 0); + /* Also test the position close to the beginning for memrchr.
*/ + do_test (&json_ctx, 0, i, 256, 23, repeats); + do_test (&json_ctx, 0, i, 256, 0, repeats); + do_test (&json_ctx, i, i, 256, 23, repeats); + do_test (&json_ctx, i, i, 256, 0, repeats); #endif - } - for (i = 1; i < 8; ++i) - { - do_test (&json_ctx, i, i << 5, 192, 23); - do_test (&json_ctx, i, i << 5, 192, 0); - do_test (&json_ctx, i, i << 5, 256, 23); - do_test (&json_ctx, i, i << 5, 256, 0); - do_test (&json_ctx, i, i << 5, 512, 23); - do_test (&json_ctx, i, i << 5, 512, 0); - - do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23); - } - for (i = 1; i < 32; ++i) - { - do_test (&json_ctx, 0, i, i + 1, 23); - do_test (&json_ctx, 0, i, i + 1, 0); - do_test (&json_ctx, i, i, i + 1, 23); - do_test (&json_ctx, i, i, i + 1, 0); - do_test (&json_ctx, 0, i, i - 1, 23); - do_test (&json_ctx, 0, i, i - 1, 0); - do_test (&json_ctx, i, i, i - 1, 23); - do_test (&json_ctx, i, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0); + } + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, i, i << 5, 192, 23, repeats); + do_test (&json_ctx, i, i << 5, 192, 0, repeats); + do_test (&json_ctx, i, i << 5, 256, 23, repeats); + do_test (&json_ctx, i, i << 5, 256, 0, repeats); + do_test (&json_ctx, i, i << 5, 512, 23, repeats); + do_test (&json_ctx, i, i << 5, 512, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats); + } + for (i = 1; i < 32; ++i) + { + do_test (&json_ctx, 0, i, i + 1, 23, repeats); + do_test (&json_ctx, 0, i, i + 1, 0, repeats); + do_test (&json_ctx, i, i, i + 1, 23, repeats); + do_test (&json_ctx, i, i, i + 1, 0, repeats); + do_test (&json_ctx, 0, i, i - 1, 23, repeats); + do_test (&json_ctx, 0, i, i - 1, 0, repeats); + do_test (&json_ctx, i, i, i - 1, 23, repeats); + do_test (&json_ctx, i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0, repeats); + #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. */ - do_test (&json_ctx, 0, 1, i + 1, 23); - do_test (&json_ctx, 0, 2, i + 1, 0); + do_test (&json_ctx, 0, 1, i + 1, 23, repeats); + do_test (&json_ctx, 0, 2, i + 1, 0, repeats); +#endif + } +#ifndef USE_AS_MEMRCHR + break; #endif } -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
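To make the new invert_pos knob concrete: it flips the placement of the match byte so that `pos` is measured from the end of the buffer rather than the start, which is the distance that actually matters for memrchr since it scans backwards from buf + len. A simplified C sketch of the setup (variable names follow the patch; the real harness also plants a -seek_char sentinel at buf[align + len]):

    #include <string.h>

    static void
    place_match (unsigned char *buf, size_t align, size_t len, size_t pos,
                 unsigned char seek_char, int invert_pos)
    {
      memset (buf + align, 1, len);   /* non-matching filler */
      if (pos < len)
        buf[align + (invert_pos ? len - pos : pos)] = seek_char;
    }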
* [PATCH v1 4/8] x86: Optimize memrchr-sse2.S 2022-06-03 4:42 [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein @ 2022-06-03 4:42 ` Noah Goldstein 2022-06-03 4:47 ` Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 5/8] x86: Optimize memrchr-evex.S Noah Goldstein ` (4 subsequent siblings) 7 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 4:42 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller lengths more. 2. optimizes target placement more carefully. 3. reuses logic more. 4. fixes up various inefficiencies in the logic. The total code size saving is: 394 bytes Geometric Mean of all benchmarks New / Old: 0.874 Regressions: 1. The page cross case is now colder, especially re-entry from the page cross case if a match is not found in the first VEC (roughly 50%). My general opinion with this patch is this is acceptable given the "coldness" of this case (less than 4%) and generally performance improvement in the other far more common cases. 2. There are some regressions 5-15% for medium/large user-arg lengths that have a match in the first VEC. This is because the logic was rewritten to optimize finds in the first VEC if the user-arg length is shorter (where we see roughly 20-50% performance improvements). It is not always the case this is a regression. My intuition is some frontend quirk is partially explaining the data although I haven't been able to find the root cause. Full xcheck passes on x86_64. --- sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++-------------------- 1 file changed, 292 insertions(+), 321 deletions(-) diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index d1a9f47911..b0dffd2ae2 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -18,362 +18,333 @@ <https://www.gnu.org/licenses/>. */ #include <sysdep.h> +#define VEC_SIZE 16 +#define PAGE_SIZE 4096 .text -ENTRY (__memrchr) - movd %esi, %xmm1 - - sub $16, %RDX_LP - jbe L(length_less16) - - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add %RDX_LP, %RDI_LP - pshufd $0, %xmm1, %xmm1 - - movdqu (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - -/* Check if there is a match. 
*/ - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - mov %edi, %ecx - and $15, %ecx - jz L(loop_prolog) - - add $16, %rdi - add $16, %rdx - and $-16, %rdi - sub %rcx, %rdx - - .p2align 4 -L(loop_prolog): - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches0) - - mov %edi, %ecx - and $63, %ecx - jz L(align64_loop) - - add $64, %rdi - add $64, %rdx - and $-64, %rdi - sub %rcx, %rdx - - .p2align 4 -L(align64_loop): - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm3, %xmm0 - pmaxub %xmm4, %xmm2 - pmaxub %xmm0, %xmm2 - pmovmskb %xmm2, %eax - - test %eax, %eax - jz L(align64_loop) - - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches48) - - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm2 - - pcmpeqb %xmm1, %xmm2 - pcmpeqb (%rdi), %xmm1 - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - pmovmskb %xmm1, %eax - bsr %eax, %eax - - add %rdi, %rax +ENTRY_P2ALIGN(__memrchr, 6) +#ifdef __ILP32__ + /* Clear upper bits. */ + mov %RDX_LP, %RDX_LP +#endif + movd %esi, %xmm0 + + /* Get end pointer. */ + leaq (%rdx, %rdi), %rcx + + punpcklbw %xmm0, %xmm0 + punpcklwd %xmm0, %xmm0 + pshufd $0, %xmm0, %xmm0 + + /* Check if we can load 1x VEC without crossing a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %ecx + jz L(page_cross) + + /* NB: This load happens regardless of whether rdx (len) is zero. Since + it doesn't cross a page and the standard guarantees any pointer has + at least one valid byte this load must be safe. For the entire + history of the x86 memrchr implementation this has been possible so + no code "should" be relying on a zero-length check before this load. + The zero-length check is moved to the page cross case because it is + 1) pretty cold and including it pushes the hot case len <= VEC_SIZE + into 2-cache lines. */ + movups -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is + zero. */ + bsrl %eax, %eax + jz L(ret_0) + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here + if out of bounds. */ + addl %edx, %eax + jl L(zero_0) + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base + ptr.
*/ + addq %rdi, %rax +L(ret_0): ret - .p2align 4 -L(exit_loop): - add $64, %edx - cmp $32, %edx - jbe L(exit_loop_32) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16_1) - cmp $48, %edx - jbe L(return_null) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches0_1) - xor %eax, %eax + .p2align 4,, 5 +L(ret_vec_x0): + bsrl %eax, %eax + leaq -(VEC_SIZE)(%rcx, %rax), %rax ret - .p2align 4 -L(exit_loop_32): - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48_1) - cmp $16, %edx - jbe L(return_null) - - pcmpeqb 32(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches32_1) - xor %eax, %eax + .p2align 4,, 2 +L(zero_0): + xorl %eax, %eax ret - .p2align 4 -L(matches0): - bsr %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax - ret - .p2align 4 -L(matches32): - bsr %eax, %eax - lea 32(%rax, %rdi), %rax + .p2align 4,, 8 +L(more_1x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) + + /* Align rcx (pointer to string). */ + decq %rcx + andq $-VEC_SIZE, %rcx + + movq %rcx, %rdx + /* NB: We could consistently save 1-byte in this pattern with `movaps + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is + it adds more frontend uops (even if the moves can be eliminated) and + some percentage of the time actual backend uops. */ + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + subq %rdi, %rdx + pmovmskb %xmm1, %eax + + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +L(last_2x_vec): + subl $VEC_SIZE, %edx + jbe L(ret_vec_x0_test) + + testl %eax, %eax + jnz L(ret_vec_x0) + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_1) + addl %edx, %eax + jl L(zero_0) + addq %rdi, %rax +L(ret_1): ret + /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross) + causes the hot path (length <= VEC_SIZE) to span multiple cache + lines. Naturally aligned % 16 to 8-bytes. */ +L(page_cross): + /* Zero length check. */ + testq %rdx, %rdx + jz L(zero_0) + + leaq -1(%rcx), %r8 + andq $-(VEC_SIZE), %r8 + + movaps (%r8), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + negl %ecx + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count + explicitly. */ + andl $(VEC_SIZE - 1), %ecx + shl %cl, %esi + movzwl %si, %eax + leaq (%rdi, %rdx), %rcx + cmpq %rdi, %r8 + ja L(more_1x_vec) + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_2) + addl %edx, %eax + jl L(zero_1) + addq %rdi, %rax +L(ret_2): ret + /* Fits in aligning bytes.
*/ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(matches16_1): - bsr %eax, %eax - sub $48, %rdx - add %rax, %rdx - jl L(return_null) - lea 16(%rdi, %rax), %rax + .p2align 4,, 5 +L(ret_vec_x1): + bsrl %eax, %eax + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(matches32_1): - bsr %eax, %eax - sub $32, %rdx - add %rax, %rdx - jl L(return_null) - lea 32(%rdi, %rax), %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) - .p2align 4 -L(matches48_1): - bsr %eax, %eax - sub $16, %rdx - add %rax, %rdx - jl L(return_null) - lea 48(%rdi, %rax), %rax - ret + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + testl %eax, %eax + jnz L(ret_vec_x1) - .p2align 4 -L(return_null): - xor %eax, %eax - ret - .p2align 4 -L(length_less16_offset0): - test %edx, %edx - jz L(return_null) + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - mov %dl, %cl - pcmpeqb (%rdi), %xmm1 + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) - mov $1, %edx - sal %cl, %edx - sub $1, %edx + addl $(VEC_SIZE), %edx + jle L(ret_vec_x2_test) - pmovmskb %xmm1, %eax +L(last_vec): + testl %eax, %eax + jnz L(ret_vec_x2) - and %edx, %eax - test %eax, %eax - jz L(return_null) + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - bsr %eax, %eax - add %rdi, %rax + subl $(VEC_SIZE), %edx + bsrl %eax, %eax + jz L(ret_3) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax +L(ret_3): ret - .p2align 4 -L(length_less16): - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add $16, %edx - - pshufd $0, %xmm1, %xmm1 - - mov %edi, %ecx - and $15, %ecx - jz L(length_less16_offset0) - - mov %cl, %dh - mov %ecx, %esi - add %dl, %dh - and $-16, %rdi - - sub $16, %dh - ja L(length_less16_part2) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - - sar %cl, %eax - mov %dl, %cl - - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax - test %eax, %eax - jz L(return_null) - - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 6 +L(ret_vec_x2_test): + bsrl %eax, %eax + jz L(zero_2) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax ret - .p2align 4 -L(length_less16_part2): - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - - mov %dh, %cl - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax +L(zero_2): + xorl %eax, %eax + ret - test %eax, %eax - jnz L(length_less16_part2_return) - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax + .p2align 4,, 5 +L(ret_vec_x2): + bsrl %eax, %eax + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - mov %esi, %ecx - sar %cl, %eax - test %eax, %eax - jz L(return_null) + .p2align 4,, 5 +L(ret_vec_x3): + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 8 +L(more_4x_vec): + testl %eax, %eax + jnz L(ret_vec_x2) + + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_x3) + + addq $-(VEC_SIZE * 4), %rcx + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) + + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end + keeping the code from spilling to the next cache line. 
*/ + addq $(VEC_SIZE * 4 - 1), %rcx + andq $-(VEC_SIZE * 4), %rcx + leaq (VEC_SIZE * 4)(%rdi), %rdx + andq $-(VEC_SIZE * 4), %rdx + + .p2align 4,, 11 +L(loop_4x_vec): + movaps (VEC_SIZE * -1)(%rcx), %xmm1 + movaps (VEC_SIZE * -2)(%rcx), %xmm2 + movaps (VEC_SIZE * -3)(%rcx), %xmm3 + movaps (VEC_SIZE * -4)(%rcx), %xmm4 + pcmpeqb %xmm0, %xmm1 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm0, %xmm4 + + por %xmm1, %xmm2 + por %xmm3, %xmm4 + por %xmm2, %xmm4 + + pmovmskb %xmm4, %esi + testl %esi, %esi + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rcx + cmpq %rdx, %rcx + jne L(loop_4x_vec) + + subl %edi, %edx + + /* Ends up being 1-byte nop. */ + .p2align 4,, 2 +L(last_4x_vec): + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + testl %eax, %eax + jnz L(ret_vec_x0) + + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_end) + + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $(VEC_SIZE * 3), %edx + ja L(last_vec) + bsrl %eax, %eax + jz L(ret_4) + addl %edx, %eax + jl L(zero_3) + addq %rdi, %rax +L(ret_4): ret + /* Ends up being 1-byte nop. */ + .p2align 4,, 3 +L(loop_end): + pmovmskb %xmm1, %eax + sall $16, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm2, %eax + testl %eax, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm3, %eax + /* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If eax is non-zero + then CHAR is in VEC3 and bsrq will use that position. */ + sall $16, %eax + orl %esi, %eax + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax ret -END (__memrchr) +L(ret_vec_end): + bsrl %eax, %eax + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax + ret + /* Used in L(last_4x_vec). In the same cache line. These are just spare + aligning bytes. */ +L(zero_3): + xorl %eax, %eax + ret + /* 2-bytes from next cache line. */ +END(__memrchr) weak_alias (__memrchr, memrchr) -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
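For orientation, the shape of the rewritten fast path is easier to see in C with intrinsics than in the flattened diff. A rough sketch of just the first check (not the asm itself; it ignores the page-cross entry and the short-length handling the real code wraps around this):

    #include <emmintrin.h>
    #include <stddef.h>

    /* Compare the last 16 bytes ending at buf + len against the broadcast
       char.  The HIGHEST set bit of the pcmpeqb/pmovmskb mask -- bsr in
       the asm, 31 - clz here -- selects the last match, which is what
       memrchr wants.  Assumes len >= 16 so the unaligned load stays in
       bounds.  */
    static const void *
    last_vec_check (const void *buf, int c, size_t len)
    {
      const unsigned char *end = (const unsigned char *) buf + len;
      __m128i needle = _mm_set1_epi8 ((char) c);
      __m128i chunk = _mm_loadu_si128 ((const __m128i *) (end - 16));
      unsigned int mask
        = (unsigned int) _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, needle));
      if (mask == 0)
        return NULL;    /* the real code falls through to more vectors */
      return end - 16 + (31 - __builtin_clz (mask));
    }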
* Re: [PATCH v1 4/8] x86: Optimize memrchr-sse2.S 2022-06-03 4:42 ` [PATCH v1 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein @ 2022-06-03 4:47 ` Noah Goldstein 0 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 4:47 UTC (permalink / raw) To: GNU C Library On Thu, Jun 2, 2022 at 11:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The new code: > 1. prioritizes smaller lengths more. > 2. optimizes target placement more carefully. > 3. reuses logic more. > 4. fixes up various inefficiencies in the logic. > > The total code size saving is: 394 bytes > Geometric Mean of all benchmarks New / Old: 0.874 > > Regressions: > 1. The page cross case is now colder, especially re-entry from the > page cross case if a match is not found in the first VEC > (roughly 50%). My general opinion with this patch is this is > acceptable given the "coldness" of this case (less than 4%) and > generally performance improvement in the other far more common > cases. > > 2. There are some regressions 5-15% for medium/large user-arg > lengths that have a match in the first VEC. This is because the > logic was rewritten to optimize finds in the first VEC if the > user-arg length is shorter (where we see roughly 20-50% > performance improvements). It is not always the case this is a > regression. My intuition is some frontend quirk is partially > explaining the data although I haven't been able to find the > root cause. > > Full xcheck passes on x86_64. > --- Least confident with numbers in this patch. Geometric mean of N = 30 runs. Benchmarked on Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html Aggregate Geometric Mean of New / Old: 0.8743388468654057 Results For: memrchr len, align, pos, seek_char, invert_pos, New / Old 2048, 0, 32, 23, 0, 0.993 256, 1, 64, 23, 0, 0.903 2048, 0, 32, 0, 0, 0.89 256, 1, 64, 0, 0, 0.904 256, 4081, 64, 0, 0, 0.907 256, 0, 1, 23, 0, 0.95 256, 0, 1, 0, 0, 0.95 256, 1, 1, 23, 0, 0.885 256, 1, 1, 0, 0, 0.883 2048, 0, 64, 23, 0, 0.8 256, 2, 64, 23, 0, 0.905 2048, 0, 64, 0, 0, 0.795 256, 2, 64, 0, 0, 0.905 256, 0, 2, 23, 0, 0.949 256, 0, 2, 0, 0, 0.949 256, 2, 2, 23, 0, 0.885 256, 2, 2, 0, 0, 0.886 2048, 0, 128, 23, 0, 0.781 256, 3, 64, 23, 0, 0.904 2048, 0, 128, 0, 0, 0.804 256, 3, 64, 0, 0, 0.904 256, 0, 3, 23, 0, 0.948 256, 0, 3, 0, 0, 0.948 256, 3, 3, 23, 0, 0.886 256, 3, 3, 0, 0, 0.881 2048, 0, 256, 23, 0, 0.715 256, 4, 64, 23, 0, 0.896 2048, 0, 256, 0, 0, 0.747 256, 4, 64, 0, 0, 0.897 256, 0, 4, 23, 0, 0.948 256, 0, 4, 0, 0, 0.95 256, 4, 4, 23, 0, 0.884 256, 4, 4, 0, 0, 0.885 2048, 0, 512, 23, 0, 0.66 256, 5, 64, 23, 0, 0.905 2048, 0, 512, 0, 0, 0.674 256, 5, 64, 0, 0, 0.905 256, 0, 5, 23, 0, 0.951 256, 0, 5, 0, 0, 0.95 256, 5, 5, 23, 0, 0.885 256, 5, 5, 0, 0, 0.883 2048, 0, 1024, 23, 0, 0.952 256, 6, 64, 23, 0, 0.905 2048, 0, 1024, 0, 0, 0.952 256, 6, 64, 0, 0, 0.904 256, 0, 6, 23, 0, 0.95 256, 0, 6, 0, 0, 0.95 256, 6, 6, 23, 0, 0.884 256, 6, 6, 0, 0, 0.884 2048, 0, 2048, 23, 0, 0.843 256, 7, 64, 23, 0, 0.904 2048, 0, 2048, 0, 0, 0.839 256, 7, 64, 0, 0, 0.906 256, 0, 7, 23, 0, 0.951 256, 0, 7, 0, 0, 0.951 256, 7, 7, 23, 0, 0.887 256, 7, 7, 0, 0, 0.885 192, 1, 32, 23, 0, 0.867 192, 1, 32, 0, 0, 0.866 256, 1, 32, 23, 0, 0.888 256, 1, 32, 0, 0, 0.888 512, 1, 32, 23, 0, 1.103 512, 1, 32, 0, 0, 1.102 256, 4081, 32, 23, 0, 0.924 192, 2, 64, 23, 0, 1.081 192, 2, 64, 0, 0, 1.081 512, 2, 64, 23, 0, 1.131 512, 2, 64, 0, 0, 1.129 256, 4081, 64, 23, 0, 0.905 192, 3, 96, 
  [... several hundred more rows of raw memrchr benchmark output, in the
   same format as above -- each row appears to be: len, align, pos,
   seek_char, invert_pos, New Time / Old Time ...]
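The summary statistics quoted in this series (e.g. "Geometric Mean of
all benchmarks New / Old") can be recomputed from rows like the ones
above.  A minimal sketch, assuming the six-field row layout inferred
from the data (len, align, pos, seek_char, invert_pos, New/Old ratio);
this is illustrative, not a tool shipped with glibc:

    /* Recompute the geometric mean of the New/Old ratio column from
       rows piped in on stdin.  Build with: cc geomean.c -lm  */
    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double v[6], log_sum = 0.0;
      size_t n = 0;

      /* Each row: len, align, pos, seek_char, invert_pos, ratio.  */
      while (scanf ("%lf, %lf, %lf, %lf, %lf, %lf",
                    &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) == 6)
        {
          log_sum += log (v[5]);   /* Sum logs; avoids product overflow.  */
          n++;
        }
      if (n)
        printf ("geomean New/Old: %.3f over %zu rows\n",
                exp (log_sum / n), n);
      return 0;
    }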
^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH v1 5/8] x86: Optimize memrchr-evex.S 2022-06-03 4:42 [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (2 preceding siblings ...) 2022-06-03 4:42 ` [PATCH v1 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein @ 2022-06-03 4:42 ` Noah Goldstein 2022-06-03 4:49 ` Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein ` (3 subsequent siblings) 7 siblings, 1 reply; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 4:42 UTC (permalink / raw) To: libc-alpha The new code: 1. prioritizes smaller user-arg lengths more. 2. optimizes target placement more carefully 3. reuses logic more 4. fixes up various inefficiencies in the logic. The biggest case here is the `lzcnt` logic for checking returns which saves either a branch or multiple instructions. The total code size saving is: 263 bytes Geometric Mean of all benchmarks New / Old: 0.755 Regressions: There are some regressions. Particularly where the length (user arg length) is large but the position of the match char is near the begining of the string (in first VEC). This case has roughly a 20% regression. This is because the new logic gives the hot path for immediate matches to shorter lengths (the more common input). This case has roughly a 35% speedup. Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++------------ 1 file changed, 268 insertions(+), 271 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S index 0b99709c6b..ad541c0e50 100644 --- a/sysdeps/x86_64/multiarch/memrchr-evex.S +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S @@ -19,319 +19,316 @@ #if IS_IN (libc) # include <sysdep.h> +# include "evex256-vecs.h" +# if VEC_SIZE != 32 +# error "VEC_SIZE != 32 unimplemented" +# endif + +# ifndef MEMRCHR +# define MEMRCHR __memrchr_evex +# endif + +# define PAGE_SIZE 4096 +# define VECMATCH VEC(0) + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN(MEMRCHR, 6) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) + + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up end ptr to be + subtract by lzcnt aligned. */ + leaq -1(%rdi, %rdx), %rax + vpbroadcastb %esi, %VECMATCH + + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + + /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which + will gurantee edx (len) is less than it. */ + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret -# define VMOVA vmovdqa64 - -# define YMMMATCH ymm16 - -# define VEC_SIZE 32 - - .section .text.evex,"ax",@progbits -ENTRY (__memrchr_evex) - /* Broadcast CHAR to YMMMATCH. */ - vpbroadcastb %esi, %YMMMATCH - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP - - /* Check the last VEC_SIZE bytes. 
*/
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
-
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret

-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
-	kord	%k1, %k2, %k5
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
-	kord	%k3, %k4, %k6
-	kortestd %k5, %k6
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	kmovd	%k1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0_dec):
+	decq	%rax
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	ret

-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)

-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
+	/* Align rax (pointer to string).  */
+	andq	$-VEC_SIZE, %rax

-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
+	/* Recompute length after aligning.  */
+	movq	%rax, %rdx

-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx

-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	ret
+	subq	%rdi, %rdx

-	.p2align 4
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
+
+	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
+	decq	%rax
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Don't use rax for pointer here because EVEX has better encoding with
+	   offset % VEC_SIZE == 0.  */
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	/* NB: 64-bit lzcnt.  This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
 	ret

-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target alignments
+	   / ICache NLP.  Necessary for 2-byte encoding of jump to page cross
+	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+	   in first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpb	$0, (%rsi), %VECMATCH, %k0
+	kmovd	%k0, %r8d
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %ecx
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%ecx
+	shlxl	%ecx, %r8d, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jle	L(zero_1)
+	subq	%rcx, %rax
 	ret

-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
+	/* Continue creating zero labels that fit in aligning bytes and get
+	   2-byte encoding / are in the same cache line as condition.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret

-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret

-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)

-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)

-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx

-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret

-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
-
-	kmovd	%k1, %eax
-
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret

-	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
-
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(zero)
-
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
-
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	.p2align 4,, 8
+L(ret_vec_x2):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret

-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret

-	/* Check the last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)

-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx

-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)

-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecessary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)

-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	ret
+	decq	%rax
+	andq	$-(VEC_SIZE * 4), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx.  NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	andq	$-(VEC_SIZE * 4), %rdx

 	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
-
-	/* Check the last VEC.  */
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+L(loop_4x_vec):
+	/* Store 1 where not-equal and 0 where equal in k1 (used to mask later
+	   on).  */
+	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+
+	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
+	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
+	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
+	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+
+	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
+	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found).  */
+	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
+	vptestnmb %VEC(3), %VEC(3), %k2
+
+	/* Any 1s and we found CHAR.  */
+	kortestd %k2, %k4
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
+
+	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
+	subq	$-(VEC_SIZE * 4), %rdx
+	movq	%rdx, %rax
+	subl	%edi, %edx
+L(last_4x_vec):
+
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx

-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)

-	kmovd	%k1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)

-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx

-	/* Check the second last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)

-	movl	%r8d, %ecx
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx

-	kmovd	%k1, %eax
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)

-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret_1)
+	xorl	%eax, %eax
+L(ret_1):
+	ret

-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 6
+L(loop_end):
+	kmovd	%k1, %ecx
+	notl	%ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vptestnmb %VEC(2), %VEC(2), %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	kmovd	%k2, %ecx
+	kmovd	%k4, %esi
+	/* Combine last 2 VEC matches.  If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4).  If ecx is non-zero
+	   then CHAR is in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	addq	%rcx, %rax
+	ret
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	addq	$(VEC_SIZE), %rax
+L(ret_vec_x1_end):
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
 	ret
-END (__memrchr_evex)
+
+END(MEMRCHR)
 #endif
-- 
2.34.1

^ permalink raw reply	[flat|nested] 82+ messages in thread
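The lzcnt return trick named in the commit message above is the heart of the rewrite, and it is easier to follow in C. The sketch below is a hypothetical model of the L(ret_vec_x0_test) path only (function and variable names are invented for illustration, not taken from the patch): `end` points at the last byte of the buffer (s + len - 1), and bit i of `mask` is set when byte end - 31 + i equals the search char, which is exactly what `kmovd` leaves in ecx after the `vpcmpb`.

#include <stddef.h>
#include <stdint.h>

static const char *
ret_vec_x0_model (const char *end, uint32_t mask, size_t len)
{
  /* lzcnt of the mask is the distance from `end' back to the last
     matching byte.  The hardware instruction returns 32 for an all-zero
     mask; __builtin_clz is undefined at zero, so model that explicitly.  */
  unsigned int dist = mask ? (unsigned int) __builtin_clz (mask) : 32;
  /* Mirrors `cmpl %ecx, %edx; jle L(zero_0)': a zero mask gives
     dist == 32, which no length on this path (len <= 32) can beat.  */
  if (len <= dist)
    return NULL;
  /* Mirrors `subq %rcx, %rax'.  */
  return end - dist;
}

Setting the end pointer to s + len - 1 at entry is what makes this work: a single subtract produces the final return value, with no separate bit-index-to-pointer conversion and no extra branch for the not-found case.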
* Re: [PATCH v1 5/8] x86: Optimize memrchr-evex.S
  2022-06-03 4:42 ` [PATCH v1 5/8] x86: Optimize memrchr-evex.S Noah Goldstein
@ 2022-06-03 4:49 ` Noah Goldstein
  0 siblings, 0 replies; 82+ messages in thread
From: Noah Goldstein @ 2022-06-03 4:49 UTC (permalink / raw)
To: GNU C Library

On Thu, Jun 2, 2022 at 11:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code:
>     1. prioritizes smaller user-arg lengths more.
>     2. optimizes target placement more carefully
>     3. reuses logic more
>     4. fixes up various inefficiencies in the logic. The biggest
>        case here is the `lzcnt` logic for checking returns which
>        saves either a branch or multiple instructions.
>
> The total code size saving is: 263 bytes
> Geometric Mean of all benchmarks New / Old: 0.755
>
> Regressions:
> There are some regressions. Particularly where the length (user arg
> length) is large but the position of the match char is near the
> beginning of the string (in first VEC). This case has roughly a
> 20% regression.
>
> This is because the new logic gives the hot path for immediate matches
> to shorter lengths (the more common input). This case has roughly
> a 35% speedup.
>
> Full xcheck passes on x86_64.

Geometric mean of N = 30 runs.

Benchmarked on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

Aggregate Geometric Mean of New / Old: 0.7552309594785345

Results For: memrchr
len, align, pos, seek_char, invert_pos, New / Old
2048, 0, 32, 23, 0, 0.897 256, 1, 64, 23, 0, 0.773 2048, 0, 32, 0, 0, 0.897 256, 1, 64, 0, 0, 0.772 256, 4081, 64, 0, 0, 0.647 256, 0, 1, 23, 0, 0.773 256, 0, 1, 0, 0, 0.773 256, 1, 1, 23, 0, 0.799 256, 1, 1, 0, 0, 0.8 2048, 0, 64, 23, 0, 0.905 256, 2, 64, 23, 0, 0.773 2048, 0, 64, 0, 0, 0.904 256, 2, 64, 0, 0, 0.772 256, 0, 2, 23, 0, 0.772 256, 0, 2, 0, 0, 0.773 256, 2, 2, 23, 0, 0.799 256, 2, 2, 0, 0, 0.796 2048, 0, 128, 23, 0, 0.926 256, 3, 64, 23, 0, 0.772 2048, 0, 128, 0, 0, 0.925 256, 3, 64, 0, 0, 0.772 256, 0, 3, 23, 0, 0.772 256, 0, 3, 0, 0, 0.772 256, 3, 3, 23, 0, 0.796 256, 3, 3, 0, 0, 0.797 2048, 0, 256, 23, 0, 0.927 256, 4, 64, 23, 0, 0.764 2048, 0, 256, 0, 0, 0.929 256, 4, 64, 0, 0, 0.767 256, 0, 4, 23, 0, 0.772 256, 0, 4, 0, 0, 0.773 256, 4, 4, 23, 0, 0.8 256, 4, 4, 0, 0, 0.798 2048, 0, 512, 23, 0, 0.957 256, 5, 64, 23, 0, 0.773 2048, 0, 512, 0, 0, 0.956 256, 5, 64, 0, 0, 0.773 256, 0, 5, 23, 0, 0.774 256, 0, 5, 0, 0, 0.773 256, 5, 5, 23, 0, 0.798 256, 5, 5, 0, 0, 0.797 2048, 0, 1024, 23, 0, 1.034 256, 6, 64, 23, 0, 0.773 2048, 0, 1024, 0, 0, 1.034 256, 6, 64, 0, 0, 0.773 256, 0, 6, 23, 0, 0.774 256, 0, 6, 0, 0, 0.773 256, 6, 6, 23, 0, 0.799 256, 6, 6, 0, 0, 0.798 2048, 0, 2048, 23, 0, 0.902 256, 7, 64, 23, 0, 0.773 2048, 0, 2048, 0, 0, 0.901 256, 7, 64, 0, 0, 0.773 256, 0, 7, 23, 0, 0.774 256, 0, 7, 0, 0, 0.774 256, 7, 7, 23, 0, 0.802 256, 7, 7, 0, 0, 0.798 192, 1, 32, 23, 0, 0.62 192, 1, 32, 0, 0, 0.62 256, 1, 32, 23, 0, 0.787 256, 1, 32, 0, 0, 0.786 512, 1, 32, 23, 0, 0.819 512, 1, 32, 0, 0, 0.822 256, 4081, 32, 23, 0, 0.731 192, 2, 64, 23, 0, 0.852 192, 2, 64, 0, 0, 0.852 512, 2, 64, 23, 0, 0.883 512, 2, 64, 0, 0, 0.883 256, 4081, 64, 23, 0, 0.646 192, 3, 96, 23, 0, 0.847 192, 3, 96, 0, 0, 0.847 256, 3, 96, 23, 0, 0.782 256, 3, 96, 0, 0, 0.782 512, 3, 96, 23, 0, 0.933 512, 3, 96, 0, 0, 0.932 256, 4081, 96, 23, 0, 0.615 192, 4, 128, 23, 0, 0.836 192, 4, 128, 0, 0, 0.836 256, 4, 128, 23, 0, 0.852 256, 4, 128, 0, 0, 0.853 512, 4, 128, 23, 0, 0.96 512, 4, 128, 0, 0, 0.961 256, 4081, 128,
23, 0, 0.863 192, 5, 160, 23, 0, 1.166 192, 5, 160, 0, 0, 1.167 256, 5, 160, 23, 0, 0.847 256, 5, 160, 0, 0, 0.847 512, 5, 160, 23, 0, 0.949 512, 5, 160, 0, 0, 0.95 256, 4081, 160, 23, 0, 0.879 192, 6, 192, 23, 0, 0.696 192, 6, 192, 0, 0, 0.695 256, 6, 192, 23, 0, 0.836 256, 6, 192, 0, 0, 0.836 512, 6, 192, 23, 0, 0.936 512, 6, 192, 0, 0, 0.935 256, 4081, 192, 23, 0, 0.874 192, 7, 224, 23, 0, 0.697 192, 7, 224, 0, 0, 0.696 256, 7, 224, 23, 0, 1.167 256, 7, 224, 0, 0, 1.167 512, 7, 224, 23, 0, 0.95 512, 7, 224, 0, 0, 0.952 256, 4081, 224, 23, 0, 1.167 2, 0, 1, 23, 0, 0.874 2, 0, 1, 0, 0, 0.875 2, 1, 1, 23, 0, 0.796 2, 1, 1, 0, 0, 0.796 0, 0, 1, 23, 0, 0.857 0, 0, 1, 0, 0, 0.857 0, 1, 1, 23, 0, 0.857 0, 1, 1, 0, 0, 0.857 2, 2048, 1, 23, 0, 0.64 2, 2048, 1, 0, 0, 0.64 2, 2049, 1, 23, 0, 0.582 2, 2049, 1, 0, 0, 0.582 0, 2048, 1, 23, 0, 0.856 0, 2048, 1, 0, 0, 0.856 0, 2049, 1, 23, 0, 0.857 0, 2049, 1, 0, 0, 0.857 0, 4081, 1, 23, 0, 0.857 0, 4081, 1, 0, 0, 0.857 2, 4081, 1, 23, 0, 0.568 2, 4081, 1, 0, 0, 0.568 2, 0, 2, 0, 0, 0.874 3, 0, 2, 23, 0, 0.875 3, 0, 2, 0, 0, 0.875 3, 2, 2, 23, 0, 0.796 3, 2, 2, 0, 0, 0.796 1, 0, 2, 23, 0, 0.875 1, 0, 2, 0, 0, 0.875 1, 2, 2, 23, 0, 0.875 1, 2, 2, 0, 0, 0.875 3, 2048, 2, 23, 0, 0.64 3, 2048, 2, 0, 0, 0.64 3, 2050, 2, 23, 0, 0.582 3, 2050, 2, 0, 0, 0.582 1, 2048, 2, 23, 0, 0.75 1, 2048, 2, 0, 0, 0.751 1, 2050, 2, 23, 0, 0.688 1, 2050, 2, 0, 0, 0.71 1, 4081, 2, 23, 0, 0.714 1, 4081, 2, 0, 0, 0.726 3, 4081, 2, 23, 0, 0.567 3, 4081, 2, 0, 0, 0.567 3, 0, 1, 23, 0, 0.874 4, 0, 3, 23, 0, 0.875 4, 0, 3, 0, 0, 0.875 4, 3, 3, 23, 0, 0.793 4, 3, 3, 0, 0, 0.795 2, 0, 3, 23, 0, 0.875 2, 0, 3, 0, 0, 0.875 2, 3, 3, 23, 0, 0.779 2, 3, 3, 0, 0, 0.822 4, 2048, 3, 23, 0, 0.641 4, 2048, 3, 0, 0, 0.639 4, 2051, 3, 23, 0, 0.58 4, 2051, 3, 0, 0, 0.581 2, 2048, 3, 23, 0, 0.753 2, 2048, 3, 0, 0, 0.752 2, 2051, 3, 23, 0, 0.693 2, 2051, 3, 0, 0, 0.72 2, 4081, 3, 23, 0, 0.715 2, 4081, 3, 0, 0, 0.731 4, 4081, 3, 23, 0, 0.565 4, 4081, 3, 0, 0, 0.565 4, 0, 1, 23, 0, 0.878 4, 0, 2, 0, 0, 0.876 5, 0, 4, 23, 0, 0.877 5, 0, 4, 0, 0, 0.88 5, 4, 4, 23, 0, 0.794 5, 4, 4, 0, 0, 0.796 3, 0, 4, 23, 0, 0.877 3, 0, 4, 0, 0, 0.876 3, 4, 4, 23, 0, 0.785 3, 4, 4, 0, 0, 0.796 5, 2048, 4, 23, 0, 0.639 5, 2048, 4, 0, 0, 0.641 5, 2052, 4, 23, 0, 0.582 5, 2052, 4, 0, 0, 0.579 3, 2048, 4, 23, 0, 0.749 3, 2048, 4, 0, 0, 0.751 3, 2052, 4, 23, 0, 0.689 3, 2052, 4, 0, 0, 0.706 3, 4081, 4, 23, 0, 0.732 3, 4081, 4, 0, 0, 0.724 5, 4081, 4, 23, 0, 0.566 5, 4081, 4, 0, 0, 0.566 5, 0, 1, 23, 0, 0.876 5, 0, 2, 0, 0, 0.876 6, 0, 5, 23, 0, 0.877 6, 0, 5, 0, 0, 0.881 6, 5, 5, 23, 0, 0.797 6, 5, 5, 0, 0, 0.795 4, 0, 5, 23, 0, 0.876 4, 0, 5, 0, 0, 0.877 4, 5, 5, 23, 0, 0.769 4, 5, 5, 0, 0, 0.787 6, 2048, 5, 23, 0, 0.642 6, 2048, 5, 0, 0, 0.641 6, 2053, 5, 23, 0, 0.579 6, 2053, 5, 0, 0, 0.578 4, 2048, 5, 23, 0, 0.75 4, 2048, 5, 0, 0, 0.75 4, 2053, 5, 23, 0, 0.684 4, 2053, 5, 0, 0, 0.703 4, 4081, 5, 23, 0, 0.725 4, 4081, 5, 0, 0, 0.733 6, 4081, 5, 23, 0, 0.565 6, 4081, 5, 0, 0, 0.566 6, 0, 1, 23, 0, 0.876 6, 0, 2, 0, 0, 0.877 7, 0, 6, 23, 0, 0.876 7, 0, 6, 0, 0, 0.88 7, 6, 6, 23, 0, 0.792 7, 6, 6, 0, 0, 0.79 5, 0, 6, 23, 0, 0.875 5, 0, 6, 0, 0, 0.875 5, 6, 6, 23, 0, 0.806 5, 6, 6, 0, 0, 0.833 7, 2048, 6, 23, 0, 0.64 7, 2048, 6, 0, 0, 0.638 7, 2054, 6, 23, 0, 0.578 7, 2054, 6, 0, 0, 0.579 5, 2048, 6, 23, 0, 0.75 5, 2048, 6, 0, 0, 0.75 5, 2054, 6, 23, 0, 0.68 5, 2054, 6, 0, 0, 0.706 5, 4081, 6, 23, 0, 0.71 5, 4081, 6, 0, 0, 0.708 7, 4081, 6, 23, 0, 0.565 7, 4081, 6, 0, 0, 0.562 7, 0, 1, 23, 0, 0.872 7, 0, 2, 0, 0, 0.875 8, 
0, 7, 23, 0, 0.875 8, 0, 7, 0, 0, 0.877 8, 7, 7, 23, 0, 0.79 8, 7, 7, 0, 0, 0.791 6, 0, 7, 23, 0, 0.875 6, 0, 7, 0, 0, 0.88 6, 7, 7, 23, 0, 0.77 6, 7, 7, 0, 0, 0.79 8, 2048, 7, 23, 0, 0.642 8, 2048, 7, 0, 0, 0.643 8, 2055, 7, 23, 0, 0.577 8, 2055, 7, 0, 0, 0.578 6, 2048, 7, 23, 0, 0.75 6, 2048, 7, 0, 0, 0.753 6, 2055, 7, 23, 0, 0.668 6, 2055, 7, 0, 0, 0.674 6, 4081, 7, 23, 0, 0.724 6, 4081, 7, 0, 0, 0.714 8, 4081, 7, 23, 0, 0.565 8, 4081, 7, 0, 0, 0.567 8, 0, 1, 23, 0, 0.876 8, 0, 2, 0, 0, 0.877 9, 0, 8, 23, 0, 0.875 9, 0, 8, 0, 0, 0.877 9, 8, 8, 23, 0, 0.792 9, 8, 8, 0, 0, 0.79 7, 0, 8, 23, 0, 0.875 7, 0, 8, 0, 0, 0.875 7, 8, 8, 23, 0, 0.788 7, 8, 8, 0, 0, 0.795 9, 2048, 8, 23, 0, 0.639 9, 2048, 8, 0, 0, 0.641 9, 2056, 8, 23, 0, 0.58 9, 2056, 8, 0, 0, 0.581 7, 2048, 8, 23, 0, 0.751 7, 2048, 8, 0, 0, 0.754 7, 2056, 8, 23, 0, 0.668 7, 2056, 8, 0, 0, 0.682 7, 4081, 8, 23, 0, 0.691 7, 4081, 8, 0, 0, 0.684 9, 4081, 8, 23, 0, 0.562 9, 4081, 8, 0, 0, 0.564 9, 0, 1, 23, 0, 0.874 9, 0, 2, 0, 0, 0.875 10, 0, 9, 23, 0, 0.878 10, 0, 9, 0, 0, 0.878 10, 9, 9, 23, 0, 0.793 10, 9, 9, 0, 0, 0.795 8, 0, 9, 23, 0, 0.875 8, 0, 9, 0, 0, 0.876 8, 9, 9, 23, 0, 0.788 8, 9, 9, 0, 0, 0.792 10, 2048, 9, 23, 0, 0.641 10, 2048, 9, 0, 0, 0.639 10, 2057, 9, 23, 0, 0.579 10, 2057, 9, 0, 0, 0.582 8, 2048, 9, 23, 0, 0.75 8, 2048, 9, 0, 0, 0.751 8, 2057, 9, 23, 0, 0.693 8, 2057, 9, 0, 0, 0.701 8, 4081, 9, 23, 0, 0.726 8, 4081, 9, 0, 0, 0.725 10, 4081, 9, 23, 0, 0.564 10, 4081, 9, 0, 0, 0.567 10, 0, 1, 23, 0, 0.874 10, 0, 2, 0, 0, 0.877 11, 0, 10, 23, 0, 0.877 11, 0, 10, 0, 0, 0.876 11, 10, 10, 23, 0, 0.795 11, 10, 10, 0, 0, 0.796 9, 0, 10, 23, 0, 0.876 9, 0, 10, 0, 0, 0.875 9, 10, 10, 23, 0, 0.792 9, 10, 10, 0, 0, 0.827 11, 2048, 10, 23, 0, 0.64 11, 2048, 10, 0, 0, 0.639 11, 2058, 10, 23, 0, 0.581 11, 2058, 10, 0, 0, 0.577 9, 2048, 10, 23, 0, 0.751 9, 2048, 10, 0, 0, 0.749 9, 2058, 10, 23, 0, 0.703 9, 2058, 10, 0, 0, 0.71 9, 4081, 10, 23, 0, 0.709 9, 4081, 10, 0, 0, 0.725 11, 4081, 10, 23, 0, 0.565 11, 4081, 10, 0, 0, 0.566 11, 0, 1, 23, 0, 0.875 11, 0, 2, 0, 0, 0.876 12, 0, 11, 23, 0, 0.874 12, 0, 11, 0, 0, 0.875 12, 11, 11, 23, 0, 0.793 12, 11, 11, 0, 0, 0.791 10, 0, 11, 23, 0, 0.876 10, 0, 11, 0, 0, 0.875 10, 11, 11, 23, 0, 0.789 10, 11, 11, 0, 0, 0.792 12, 2048, 11, 23, 0, 0.639 12, 2048, 11, 0, 0, 0.639 12, 2059, 11, 23, 0, 0.578 12, 2059, 11, 0, 0, 0.58 10, 2048, 11, 23, 0, 0.75 10, 2048, 11, 0, 0, 0.75 10, 2059, 11, 23, 0, 0.685 10, 2059, 11, 0, 0, 0.707 10, 4081, 11, 23, 0, 0.7 10, 4081, 11, 0, 0, 0.716 12, 4081, 11, 23, 0, 0.565 12, 4081, 11, 0, 0, 0.562 12, 0, 1, 23, 0, 0.875 12, 0, 2, 0, 0, 0.876 13, 0, 12, 23, 0, 0.875 13, 0, 12, 0, 0, 0.875 13, 12, 12, 23, 0, 0.794 13, 12, 12, 0, 0, 0.794 11, 0, 12, 23, 0, 0.875 11, 0, 12, 0, 0, 0.874 11, 12, 12, 23, 0, 0.81 11, 12, 12, 0, 0, 0.801 13, 2048, 12, 23, 0, 0.64 13, 2048, 12, 0, 0, 0.639 13, 2060, 12, 23, 0, 0.578 13, 2060, 12, 0, 0, 0.578 11, 2048, 12, 23, 0, 0.75 11, 2048, 12, 0, 0, 0.75 11, 2060, 12, 23, 0, 0.694 11, 2060, 12, 0, 0, 0.701 11, 4081, 12, 23, 0, 0.702 11, 4081, 12, 0, 0, 0.714 13, 4081, 12, 23, 0, 0.563 13, 4081, 12, 0, 0, 0.566 13, 0, 1, 23, 0, 0.875 13, 0, 2, 0, 0, 0.874 14, 0, 13, 23, 0, 0.874 14, 0, 13, 0, 0, 0.876 14, 13, 13, 23, 0, 0.794 14, 13, 13, 0, 0, 0.792 12, 0, 13, 23, 0, 0.875 12, 0, 13, 0, 0, 0.875 12, 13, 13, 23, 0, 0.801 12, 13, 13, 0, 0, 0.817 14, 2048, 13, 23, 0, 0.639 14, 2048, 13, 0, 0, 0.639 14, 2061, 13, 23, 0, 0.579 14, 2061, 13, 0, 0, 0.577 12, 2048, 13, 23, 0, 0.75 12, 2048, 13, 0, 0, 0.751 12, 2061, 13, 23, 0, 0.663 12, 
2061, 13, 0, 0, 0.677 12, 4081, 13, 23, 0, 0.703 12, 4081, 13, 0, 0, 0.724 14, 4081, 13, 23, 0, 0.565 14, 4081, 13, 0, 0, 0.564 14, 0, 1, 23, 0, 0.876 14, 0, 2, 0, 0, 0.876 15, 0, 14, 23, 0, 0.875 15, 0, 14, 0, 0, 0.875 15, 14, 14, 23, 0, 0.789 15, 14, 14, 0, 0, 0.792 13, 0, 14, 23, 0, 0.876 13, 0, 14, 0, 0, 0.876 13, 14, 14, 23, 0, 0.768 13, 14, 14, 0, 0, 0.789 15, 2048, 14, 23, 0, 0.64 15, 2048, 14, 0, 0, 0.64 15, 2062, 14, 23, 0, 0.579 15, 2062, 14, 0, 0, 0.582 13, 2048, 14, 23, 0, 0.75 13, 2048, 14, 0, 0, 0.75 13, 2062, 14, 23, 0, 0.668 13, 2062, 14, 0, 0, 0.681 13, 4081, 14, 23, 0, 0.694 13, 4081, 14, 0, 0, 0.699 15, 4081, 14, 23, 0, 0.565 15, 4081, 14, 0, 0, 0.565 15, 0, 1, 23, 0, 0.875 15, 0, 2, 0, 0, 0.875 16, 0, 15, 23, 0, 0.875 16, 0, 15, 0, 0, 0.875 16, 15, 15, 23, 0, 0.793 16, 15, 15, 0, 0, 0.79 14, 0, 15, 23, 0, 0.875 14, 0, 15, 0, 0, 0.875 14, 15, 15, 23, 0, 0.758 14, 15, 15, 0, 0, 0.776 16, 2048, 15, 23, 0, 0.64 16, 2048, 15, 0, 0, 0.64 16, 2063, 15, 23, 0, 0.578 16, 2063, 15, 0, 0, 0.579 14, 2048, 15, 23, 0, 0.75 14, 2048, 15, 0, 0, 0.75 14, 2063, 15, 23, 0, 0.661 14, 2063, 15, 0, 0, 0.678 14, 4081, 15, 23, 0, 0.688 14, 4081, 15, 0, 0, 0.706 16, 4081, 15, 23, 0, 0.887 16, 4081, 15, 0, 0, 0.888 16, 0, 1, 23, 0, 0.874 16, 0, 2, 0, 0, 0.874 17, 0, 16, 23, 0, 0.875 17, 0, 16, 0, 0, 0.874 17, 16, 16, 23, 0, 0.556 17, 16, 16, 0, 0, 0.556 15, 0, 16, 23, 0, 0.875 15, 0, 16, 0, 0, 0.875 15, 16, 16, 23, 0, 0.811 15, 16, 16, 0, 0, 0.813 17, 2048, 16, 23, 0, 0.64 17, 2048, 16, 0, 0, 0.64 17, 2064, 16, 23, 0, 0.556 17, 2064, 16, 0, 0, 0.556 15, 2048, 16, 23, 0, 0.75 15, 2048, 16, 0, 0, 0.75 15, 2064, 16, 23, 0, 0.693 15, 2064, 16, 0, 0, 0.694 15, 4081, 16, 23, 0, 0.709 15, 4081, 16, 0, 0, 0.709 17, 4081, 16, 23, 0, 0.889 17, 4081, 16, 0, 0, 0.889 17, 0, 1, 23, 0, 0.875 17, 0, 2, 0, 0, 0.875 18, 0, 17, 23, 0, 0.875 18, 0, 17, 0, 0, 0.875 18, 17, 17, 23, 0, 0.556 18, 17, 17, 0, 0, 0.556 16, 0, 17, 23, 0, 0.875 16, 0, 17, 0, 0, 0.875 16, 17, 17, 23, 0, 0.666 16, 17, 17, 0, 0, 0.666 18, 2048, 17, 23, 0, 0.64 18, 2048, 17, 0, 0, 0.64 18, 2065, 17, 23, 0, 0.556 18, 2065, 17, 0, 0, 0.556 16, 2048, 17, 23, 0, 0.75 16, 2048, 17, 0, 0, 0.75 16, 2065, 17, 23, 0, 0.666 16, 2065, 17, 0, 0, 0.659 16, 4081, 17, 23, 0, 0.999 16, 4081, 17, 0, 0, 0.999 18, 4081, 17, 23, 0, 0.889 18, 4081, 17, 0, 0, 0.889 18, 0, 1, 23, 0, 0.875 18, 0, 2, 0, 0, 0.875 19, 0, 18, 23, 0, 0.875 19, 0, 18, 0, 0, 0.875 19, 18, 18, 23, 0, 0.556 19, 18, 18, 0, 0, 0.556 17, 0, 18, 23, 0, 0.875 17, 0, 18, 0, 0, 0.875 17, 18, 18, 23, 0, 0.662 17, 18, 18, 0, 0, 0.666 19, 2048, 18, 23, 0, 0.64 19, 2048, 18, 0, 0, 0.64 19, 2066, 18, 23, 0, 0.556 19, 2066, 18, 0, 0, 0.556 17, 2048, 18, 23, 0, 0.75 17, 2048, 18, 0, 0, 0.75 17, 2066, 18, 23, 0, 0.656 17, 2066, 18, 0, 0, 0.666 17, 4081, 18, 23, 0, 0.973 17, 4081, 18, 0, 0, 0.999 19, 4081, 18, 23, 0, 0.889 19, 4081, 18, 0, 0, 0.889 19, 0, 1, 23, 0, 0.875 19, 0, 2, 0, 0, 0.875 20, 0, 19, 23, 0, 0.875 20, 0, 19, 0, 0, 0.875 20, 19, 19, 23, 0, 0.556 20, 19, 19, 0, 0, 0.556 18, 0, 19, 23, 0, 0.875 18, 0, 19, 0, 0, 0.875 18, 19, 19, 23, 0, 0.666 18, 19, 19, 0, 0, 0.666 20, 2048, 19, 23, 0, 0.64 20, 2048, 19, 0, 0, 0.64 20, 2067, 19, 23, 0, 0.556 20, 2067, 19, 0, 0, 0.556 18, 2048, 19, 23, 0, 0.75 18, 2048, 19, 0, 0, 0.75 18, 2067, 19, 23, 0, 0.666 18, 2067, 19, 0, 0, 0.666 18, 4081, 19, 23, 0, 0.999 18, 4081, 19, 0, 0, 0.999 20, 4081, 19, 23, 0, 0.889 20, 4081, 19, 0, 0, 0.889 20, 0, 1, 23, 0, 0.875 20, 0, 2, 0, 0, 0.875 21, 0, 20, 23, 0, 0.875 21, 0, 20, 0, 0, 0.875 21, 20, 20, 23, 0, 0.556 21, 
20, 20, 0, 0, 0.556 19, 0, 20, 23, 0, 0.875 19, 0, 20, 0, 0, 0.875 19, 20, 20, 23, 0, 0.657 19, 20, 20, 0, 0, 0.666 21, 2048, 20, 23, 0, 0.64 21, 2048, 20, 0, 0, 0.64 21, 2068, 20, 23, 0, 0.556 21, 2068, 20, 0, 0, 0.556 19, 2048, 20, 23, 0, 0.75 19, 2048, 20, 0, 0, 0.75 19, 2068, 20, 23, 0, 0.666 19, 2068, 20, 0, 0, 0.659 19, 4081, 20, 23, 0, 0.999 19, 4081, 20, 0, 0, 0.989 21, 4081, 20, 23, 0, 0.889 21, 4081, 20, 0, 0, 0.889 21, 0, 1, 23, 0, 0.875 21, 0, 2, 0, 0, 0.875 22, 0, 21, 23, 0, 0.875 22, 0, 21, 0, 0, 0.875 22, 21, 21, 23, 0, 0.556 22, 21, 21, 0, 0, 0.556 20, 0, 21, 23, 0, 0.914 20, 0, 21, 0, 0, 0.903 20, 21, 21, 23, 0, 0.666 20, 21, 21, 0, 0, 0.666 22, 2048, 21, 23, 0, 0.64 22, 2048, 21, 0, 0, 0.64 22, 2069, 21, 23, 0, 0.556 22, 2069, 21, 0, 0, 0.556 20, 2048, 21, 23, 0, 0.75 20, 2048, 21, 0, 0, 0.75 20, 2069, 21, 23, 0, 0.666 20, 2069, 21, 0, 0, 0.666 20, 4081, 21, 23, 0, 0.974 20, 4081, 21, 0, 0, 0.983 22, 4081, 21, 23, 0, 0.889 22, 4081, 21, 0, 0, 0.889 22, 0, 1, 23, 0, 0.875 22, 0, 2, 0, 0, 0.875 23, 0, 22, 23, 0, 0.875 23, 0, 22, 0, 0, 0.875 23, 22, 22, 23, 0, 0.556 23, 22, 22, 0, 0, 0.556 21, 0, 22, 23, 0, 0.932 21, 0, 22, 0, 0, 0.93 21, 22, 22, 23, 0, 0.666 21, 22, 22, 0, 0, 0.666 23, 2048, 22, 23, 0, 0.64 23, 2048, 22, 0, 0, 0.64 23, 2070, 22, 23, 0, 0.556 23, 2070, 22, 0, 0, 0.556 21, 2048, 22, 23, 0, 0.75 21, 2048, 22, 0, 0, 0.75 21, 2070, 22, 23, 0, 0.666 21, 2070, 22, 0, 0, 0.652 21, 4081, 22, 23, 0, 0.999 21, 4081, 22, 0, 0, 0.999 23, 4081, 22, 23, 0, 0.889 23, 4081, 22, 0, 0, 0.889 23, 0, 1, 23, 0, 0.875 23, 0, 2, 0, 0, 0.875 24, 0, 23, 23, 0, 0.875 24, 0, 23, 0, 0, 0.875 24, 23, 23, 23, 0, 0.556 24, 23, 23, 0, 0, 0.556 22, 0, 23, 23, 0, 0.92 22, 0, 23, 0, 0, 0.92 22, 23, 23, 23, 0, 0.66 22, 23, 23, 0, 0, 0.662 24, 2048, 23, 23, 0, 0.64 24, 2048, 23, 0, 0, 0.64 24, 2071, 23, 23, 0, 0.556 24, 2071, 23, 0, 0, 0.556 22, 2048, 23, 23, 0, 0.75 22, 2048, 23, 0, 0, 0.75 22, 2071, 23, 23, 0, 0.654 22, 2071, 23, 0, 0, 0.666 22, 4081, 23, 23, 0, 0.979 22, 4081, 23, 0, 0, 0.994 24, 4081, 23, 23, 0, 0.889 24, 4081, 23, 0, 0, 0.889 24, 0, 1, 23, 0, 0.875 24, 0, 2, 0, 0, 0.875 25, 0, 24, 23, 0, 0.875 25, 0, 24, 0, 0, 0.875 25, 24, 24, 23, 0, 0.556 25, 24, 24, 0, 0, 0.556 23, 0, 24, 23, 0, 0.921 23, 0, 24, 0, 0, 0.909 23, 24, 24, 23, 0, 0.663 23, 24, 24, 0, 0, 0.666 25, 2048, 24, 23, 0, 0.64 25, 2048, 24, 0, 0, 0.64 25, 2072, 24, 23, 0, 0.556 25, 2072, 24, 0, 0, 0.556 23, 2048, 24, 23, 0, 0.75 23, 2048, 24, 0, 0, 0.75 23, 2072, 24, 23, 0, 0.658 23, 2072, 24, 0, 0, 0.666 23, 4081, 24, 23, 0, 0.999 23, 4081, 24, 0, 0, 0.999 25, 4081, 24, 23, 0, 0.889 25, 4081, 24, 0, 0, 0.889 25, 0, 1, 23, 0, 0.875 25, 0, 2, 0, 0, 0.875 26, 0, 25, 23, 0, 0.875 26, 0, 25, 0, 0, 0.875 26, 25, 25, 23, 0, 0.556 26, 25, 25, 0, 0, 0.556 24, 0, 25, 23, 0, 0.92 24, 0, 25, 0, 0, 0.92 24, 25, 25, 23, 0, 0.666 24, 25, 25, 0, 0, 0.666 26, 2048, 25, 23, 0, 0.64 26, 2048, 25, 0, 0, 0.64 26, 2073, 25, 23, 0, 0.556 26, 2073, 25, 0, 0, 0.556 24, 2048, 25, 23, 0, 0.75 24, 2048, 25, 0, 0, 0.75 24, 2073, 25, 23, 0, 0.666 24, 2073, 25, 0, 0, 0.666 24, 4081, 25, 23, 0, 0.999 24, 4081, 25, 0, 0, 0.999 26, 4081, 25, 23, 0, 0.889 26, 4081, 25, 0, 0, 0.889 26, 0, 1, 23, 0, 0.875 26, 0, 2, 0, 0, 0.875 27, 0, 26, 23, 0, 0.875 27, 0, 26, 0, 0, 0.875 27, 26, 26, 23, 0, 0.556 27, 26, 26, 0, 0, 0.556 25, 0, 26, 23, 0, 0.992 25, 0, 26, 0, 0, 0.992 25, 26, 26, 23, 0, 0.664 25, 26, 26, 0, 0, 0.663 27, 2048, 26, 23, 0, 0.64 27, 2048, 26, 0, 0, 0.64 27, 2074, 26, 23, 0, 0.556 27, 2074, 26, 0, 0, 0.556 25, 2048, 26, 23, 0, 0.75 25, 
2048, 26, 0, 0, 0.75 25, 2074, 26, 23, 0, 0.651 25, 2074, 26, 0, 0, 0.666 25, 4081, 26, 23, 0, 0.994 25, 4081, 26, 0, 0, 0.999 27, 4081, 26, 23, 0, 0.889 27, 4081, 26, 0, 0, 0.889 27, 0, 1, 23, 0, 0.875 27, 0, 2, 0, 0, 0.875 28, 0, 27, 23, 0, 0.875 28, 0, 27, 0, 0, 0.875 28, 27, 27, 23, 0, 0.556 28, 27, 27, 0, 0, 0.556 26, 0, 27, 23, 0, 0.98 26, 0, 27, 0, 0, 0.98 26, 27, 27, 23, 0, 0.645 26, 27, 27, 0, 0, 0.656 28, 2048, 27, 23, 0, 0.64 28, 2048, 27, 0, 0, 0.64 28, 2075, 27, 23, 0, 0.556 28, 2075, 27, 0, 0, 0.556 26, 2048, 27, 23, 0, 0.75 26, 2048, 27, 0, 0, 0.75 26, 2075, 27, 23, 0, 0.665 26, 2075, 27, 0, 0, 0.666 26, 4081, 27, 23, 0, 0.996 26, 4081, 27, 0, 0, 0.977 28, 4081, 27, 23, 0, 0.889 28, 4081, 27, 0, 0, 0.889 28, 0, 1, 23, 0, 0.875 28, 0, 2, 0, 0, 0.875 29, 0, 28, 23, 0, 0.875 29, 0, 28, 0, 0, 0.875 29, 28, 28, 23, 0, 0.556 29, 28, 28, 0, 0, 0.556 27, 0, 28, 23, 0, 0.99 27, 0, 28, 0, 0, 0.975 27, 28, 28, 23, 0, 0.657 27, 28, 28, 0, 0, 0.663 29, 2048, 28, 23, 0, 0.64 29, 2048, 28, 0, 0, 0.64 29, 2076, 28, 23, 0, 0.556 29, 2076, 28, 0, 0, 0.556 27, 2048, 28, 23, 0, 0.75 27, 2048, 28, 0, 0, 0.75 27, 2076, 28, 23, 0, 0.653 27, 2076, 28, 0, 0, 0.656 27, 4081, 28, 23, 0, 0.992 27, 4081, 28, 0, 0, 0.984 29, 4081, 28, 23, 0, 0.889 29, 4081, 28, 0, 0, 0.889 29, 0, 1, 23, 0, 0.875 29, 0, 2, 0, 0, 0.875 30, 0, 29, 23, 0, 0.875 30, 0, 29, 0, 0, 0.875 30, 29, 29, 23, 0, 0.556 30, 29, 29, 0, 0, 0.556 28, 0, 29, 23, 0, 0.98 28, 0, 29, 0, 0, 0.951 28, 29, 29, 23, 0, 0.656 28, 29, 29, 0, 0, 0.65 30, 2048, 29, 23, 0, 0.64 30, 2048, 29, 0, 0, 0.64 30, 2077, 29, 23, 0, 0.556 30, 2077, 29, 0, 0, 0.555 28, 2048, 29, 23, 0, 0.749 28, 2048, 29, 0, 0, 0.749 28, 2077, 29, 23, 0, 0.656 28, 2077, 29, 0, 0, 0.657 28, 4081, 29, 23, 0, 0.986 28, 4081, 29, 0, 0, 0.978 30, 4081, 29, 23, 0, 0.886 30, 4081, 29, 0, 0, 0.887 30, 0, 1, 23, 0, 0.873 30, 0, 2, 0, 0, 0.873 31, 0, 30, 23, 0, 0.873 31, 0, 30, 0, 0, 0.871 31, 30, 30, 23, 0, 0.554 31, 30, 30, 0, 0, 0.554 29, 0, 30, 23, 0, 0.932 29, 0, 30, 0, 0, 0.927 29, 30, 30, 23, 0, 0.655 29, 30, 30, 0, 0, 0.659 31, 2048, 30, 23, 0, 0.637 31, 2048, 30, 0, 0, 0.638 31, 2078, 30, 23, 0, 0.554 31, 2078, 30, 0, 0, 0.553 29, 2048, 30, 23, 0, 0.746 29, 2048, 30, 0, 0, 0.746 29, 2078, 30, 23, 0, 0.649 29, 2078, 30, 0, 0, 0.658 29, 4081, 30, 23, 0, 0.98 29, 4081, 30, 0, 0, 0.984 31, 4081, 30, 23, 0, 0.883 31, 4081, 30, 0, 0, 0.884 31, 0, 1, 23, 0, 0.87 31, 0, 2, 0, 0, 0.87 32, 0, 31, 23, 0, 0.87 32, 0, 31, 0, 0, 0.869 32, 31, 31, 23, 0, 0.553 32, 31, 31, 0, 0, 0.553 30, 0, 31, 23, 0, 0.977 30, 0, 31, 0, 0, 0.975 30, 31, 31, 23, 0, 0.66 30, 31, 31, 0, 0, 0.658 32, 2048, 31, 23, 0, 0.622 32, 2048, 31, 0, 0, 0.622 32, 2079, 31, 23, 0, 0.553 32, 2079, 31, 0, 0, 0.552 30, 2048, 31, 23, 0, 0.745 30, 2048, 31, 0, 0, 0.744 30, 2079, 31, 23, 0, 0.659 30, 2079, 31, 0, 0, 0.66 30, 4081, 31, 23, 0, 0.972 30, 4081, 31, 0, 0, 0.972 32, 4081, 31, 23, 0, 0.881 32, 4081, 31, 0, 0, 0.881 32, 0, 1, 23, 0, 0.868 32, 0, 2, 0, 0, 0.868 2048, 0, 32, 23, 1, 1.158 256, 1, 64, 23, 1, 0.83 2048, 0, 32, 0, 1, 1.158 256, 1, 64, 0, 1, 0.83 256, 4081, 64, 0, 1, 0.873 256, 0, 1, 23, 1, 1.158 256, 0, 1, 0, 1, 1.157 256, 1, 1, 23, 1, 1.158 256, 1, 1, 0, 1, 1.158 2048, 0, 64, 23, 1, 1.133 256, 2, 64, 23, 1, 0.835 2048, 0, 64, 0, 1, 1.132 256, 2, 64, 0, 1, 0.835 256, 0, 2, 23, 1, 1.16 256, 0, 2, 0, 1, 1.161 256, 2, 2, 23, 1, 1.161 256, 2, 2, 0, 1, 1.161 2048, 0, 128, 23, 1, 1.023 256, 3, 64, 23, 1, 0.833 2048, 0, 128, 0, 1, 1.025 256, 3, 64, 0, 1, 0.835 256, 0, 3, 23, 1, 1.167 256, 0, 3, 0, 1, 1.167 256, 3, 3, 23, 
1, 1.167 256, 3, 3, 0, 1, 1.167 2048, 0, 256, 23, 1, 0.993 256, 4, 64, 23, 1, 0.836 2048, 0, 256, 0, 1, 0.994 256, 4, 64, 0, 1, 0.836 256, 0, 4, 23, 1, 1.167 256, 0, 4, 0, 1, 1.167 256, 4, 4, 23, 1, 1.167 256, 4, 4, 0, 1, 1.167 2048, 0, 512, 23, 1, 1.065 256, 5, 64, 23, 1, 0.836 2048, 0, 512, 0, 1, 1.057 256, 5, 64, 0, 1, 0.836 256, 0, 5, 23, 1, 1.167 256, 0, 5, 0, 1, 1.167 256, 5, 5, 23, 1, 1.167 256, 5, 5, 0, 1, 1.167 2048, 0, 1024, 23, 1, 1.034 256, 6, 64, 23, 1, 0.836 2048, 0, 1024, 0, 1, 1.032 256, 6, 64, 0, 1, 0.836 256, 0, 6, 23, 1, 1.167 256, 0, 6, 0, 1, 1.167 256, 6, 6, 23, 1, 1.167 256, 6, 6, 0, 1, 1.167 2048, 0, 2048, 23, 1, 0.901 256, 7, 64, 23, 1, 0.836 2048, 0, 2048, 0, 1, 0.901 256, 7, 64, 0, 1, 0.835 256, 0, 7, 23, 1, 1.165 256, 0, 7, 0, 1, 1.165 256, 7, 7, 23, 1, 1.165 256, 7, 7, 0, 1, 1.165 192, 1, 32, 23, 1, 1.165 192, 1, 32, 0, 1, 1.165 256, 1, 32, 23, 1, 1.165 256, 1, 32, 0, 1, 1.165 512, 1, 32, 23, 1, 1.165 512, 1, 32, 0, 1, 1.165 256, 4081, 32, 23, 1, 1.165 192, 2, 64, 23, 1, 0.835 192, 2, 64, 0, 1, 0.835 512, 2, 64, 23, 1, 0.836 512, 2, 64, 0, 1, 0.836 256, 4081, 64, 23, 1, 0.874 192, 3, 96, 23, 1, 0.847 192, 3, 96, 0, 1, 0.847 256, 3, 96, 23, 1, 0.847 256, 3, 96, 0, 1, 0.847 512, 3, 96, 23, 1, 0.847 512, 3, 96, 0, 1, 0.847 256, 4081, 96, 23, 1, 0.879 192, 4, 128, 23, 1, 0.851 192, 4, 128, 0, 1, 0.851 256, 4, 128, 23, 1, 0.852 256, 4, 128, 0, 1, 0.852 512, 4, 128, 23, 1, 0.851 512, 4, 128, 0, 1, 0.851 256, 4081, 128, 23, 1, 0.862 192, 5, 160, 23, 1, 0.619 192, 5, 160, 0, 1, 0.618 256, 5, 160, 23, 1, 0.781 256, 5, 160, 0, 1, 0.779 512, 5, 160, 23, 1, 0.936 512, 5, 160, 0, 1, 0.937 256, 4081, 160, 23, 1, 0.616 192, 6, 192, 23, 1, 0.695 192, 6, 192, 0, 1, 0.695 256, 6, 192, 23, 1, 0.77 256, 6, 192, 0, 1, 0.771 512, 6, 192, 23, 1, 0.94 512, 6, 192, 0, 1, 0.942 256, 4081, 192, 23, 1, 0.643 192, 7, 224, 23, 1, 0.693 192, 7, 224, 0, 1, 0.694 256, 7, 224, 23, 1, 0.783 256, 7, 224, 0, 1, 0.782 512, 7, 224, 23, 1, 0.945 512, 7, 224, 0, 1, 0.946 256, 4081, 224, 23, 1, 0.728 2, 0, 1, 23, 1, 0.87 2, 0, 1, 0, 1, 0.872 2, 1, 1, 23, 1, 0.793 2, 1, 1, 0, 1, 0.792 0, 0, 1, 23, 1, 0.854 0, 0, 1, 0, 1, 0.854 0, 1, 1, 23, 1, 0.855 0, 1, 1, 0, 1, 0.854 2, 2048, 1, 23, 1, 0.639 2, 2048, 1, 0, 1, 0.638 2, 2049, 1, 23, 1, 0.581 2, 2049, 1, 0, 1, 0.58 0, 2048, 1, 23, 1, 0.854 0, 2048, 1, 0, 1, 0.854 0, 2049, 1, 23, 1, 0.854 0, 2049, 1, 0, 1, 0.854 0, 4081, 1, 23, 1, 0.854 0, 4081, 1, 0, 1, 0.854 2, 4081, 1, 23, 1, 0.567 2, 4081, 1, 0, 1, 0.567 2, 0, 2, 0, 1, 0.922 3, 0, 2, 23, 1, 0.872 3, 0, 2, 0, 1, 0.87 3, 2, 2, 23, 1, 0.793 3, 2, 2, 0, 1, 0.794 1, 0, 2, 23, 1, 0.874 1, 0, 2, 0, 1, 0.873 1, 2, 2, 23, 1, 0.829 1, 2, 2, 0, 1, 0.848 3, 2048, 2, 23, 1, 0.638 3, 2048, 2, 0, 1, 0.638 3, 2050, 2, 23, 1, 0.58 3, 2050, 2, 0, 1, 0.58 1, 2048, 2, 23, 1, 0.747 1, 2048, 2, 0, 1, 0.747 1, 2050, 2, 23, 1, 0.687 1, 2050, 2, 0, 1, 0.691 1, 4081, 2, 23, 1, 0.707 1, 4081, 2, 0, 1, 0.723 3, 4081, 2, 23, 1, 0.565 3, 4081, 2, 0, 1, 0.566 3, 0, 1, 23, 1, 0.873 4, 0, 3, 23, 1, 0.873 4, 0, 3, 0, 1, 0.873 4, 3, 3, 23, 1, 0.794 4, 3, 3, 0, 1, 0.793 2, 0, 3, 23, 1, 0.874 2, 0, 3, 0, 1, 0.874 2, 3, 3, 23, 1, 0.821 2, 3, 3, 0, 1, 0.828 4, 2048, 3, 23, 1, 0.638 4, 2048, 3, 0, 1, 0.638 4, 2051, 3, 23, 1, 0.581 4, 2051, 3, 0, 1, 0.581 2, 2048, 3, 23, 1, 0.747 2, 2048, 3, 0, 1, 0.747 2, 2051, 3, 23, 1, 0.686 2, 2051, 3, 0, 1, 0.689 2, 4081, 3, 23, 1, 0.702 2, 4081, 3, 0, 1, 0.702 4, 4081, 3, 23, 1, 0.567 4, 4081, 3, 0, 1, 0.568 4, 0, 1, 23, 1, 0.874 4, 0, 2, 0, 1, 0.875 5, 0, 4, 23, 1, 0.875 5, 0, 4, 0, 1, 0.875 5, 4, 4, 
23, 1, 0.796 5, 4, 4, 0, 1, 0.794 3, 0, 4, 23, 1, 0.874 3, 0, 4, 0, 1, 0.875 3, 4, 4, 23, 1, 0.783 3, 4, 4, 0, 1, 0.795 5, 2048, 4, 23, 1, 0.639 5, 2048, 4, 0, 1, 0.64 5, 2052, 4, 23, 1, 0.581 5, 2052, 4, 0, 1, 0.581 3, 2048, 4, 23, 1, 0.749 3, 2048, 4, 0, 1, 0.748 3, 2052, 4, 23, 1, 0.694 3, 2052, 4, 0, 1, 0.702 3, 4081, 4, 23, 1, 0.701 3, 4081, 4, 0, 1, 0.701 5, 4081, 4, 23, 1, 0.566 5, 4081, 4, 0, 1, 0.567 5, 0, 1, 23, 1, 0.873 5, 0, 2, 0, 1, 0.874 6, 0, 5, 23, 1, 0.874 6, 0, 5, 0, 1, 0.875 6, 5, 5, 23, 1, 0.792 6, 5, 5, 0, 1, 0.795 4, 0, 5, 23, 1, 0.875 4, 0, 5, 0, 1, 0.875 4, 5, 5, 23, 1, 0.804 4, 5, 5, 0, 1, 0.804 6, 2048, 5, 23, 1, 0.64 6, 2048, 5, 0, 1, 0.64 6, 2053, 5, 23, 1, 0.581 6, 2053, 5, 0, 1, 0.581 4, 2048, 5, 23, 1, 0.75 4, 2048, 5, 0, 1, 0.75 4, 2053, 5, 23, 1, 0.709 4, 2053, 5, 0, 1, 0.701 4, 4081, 5, 23, 1, 0.693 4, 4081, 5, 0, 1, 0.7 6, 4081, 5, 23, 1, 0.566 6, 4081, 5, 0, 1, 0.566 6, 0, 1, 23, 1, 0.874 6, 0, 2, 0, 1, 0.874 7, 0, 6, 23, 1, 0.874 7, 0, 6, 0, 1, 0.874 7, 6, 6, 23, 1, 0.793 7, 6, 6, 0, 1, 0.793 5, 0, 6, 23, 1, 0.874 5, 0, 6, 0, 1, 0.874 5, 6, 6, 23, 1, 0.803 5, 6, 6, 0, 1, 0.821 7, 2048, 6, 23, 1, 0.639 7, 2048, 6, 0, 1, 0.639 7, 2054, 6, 23, 1, 0.579 7, 2054, 6, 0, 1, 0.581 5, 2048, 6, 23, 1, 0.749 5, 2048, 6, 0, 1, 0.749 5, 2054, 6, 23, 1, 0.685 5, 2054, 6, 0, 1, 0.693 5, 4081, 6, 23, 1, 0.701 5, 4081, 6, 0, 1, 0.708 7, 4081, 6, 23, 1, 0.566 7, 4081, 6, 0, 1, 0.565 7, 0, 1, 23, 1, 0.874 7, 0, 2, 0, 1, 0.874 8, 0, 7, 23, 1, 0.874 8, 0, 7, 0, 1, 0.874 8, 7, 7, 23, 1, 0.796 8, 7, 7, 0, 1, 0.793 6, 0, 7, 23, 1, 0.875 6, 0, 7, 0, 1, 0.875 6, 7, 7, 23, 1, 0.795 6, 7, 7, 0, 1, 0.786 8, 2048, 7, 23, 1, 0.64 8, 2048, 7, 0, 1, 0.64 8, 2055, 7, 23, 1, 0.579 8, 2055, 7, 0, 1, 0.581 6, 2048, 7, 23, 1, 0.75 6, 2048, 7, 0, 1, 0.75 6, 2055, 7, 23, 1, 0.701 6, 2055, 7, 0, 1, 0.699 6, 4081, 7, 23, 1, 0.701 6, 4081, 7, 0, 1, 0.695 8, 4081, 7, 23, 1, 0.566 8, 4081, 7, 0, 1, 0.567 8, 0, 1, 23, 1, 0.875 8, 0, 2, 0, 1, 0.875 9, 0, 8, 23, 1, 0.875 9, 0, 8, 0, 1, 0.875 9, 8, 8, 23, 1, 0.795 9, 8, 8, 0, 1, 0.794 7, 0, 8, 23, 1, 0.875 7, 0, 8, 0, 1, 0.875 7, 8, 8, 23, 1, 0.773 7, 8, 8, 0, 1, 0.79 9, 2048, 8, 23, 1, 0.64 9, 2048, 8, 0, 1, 0.64 9, 2056, 8, 23, 1, 0.581 9, 2056, 8, 0, 1, 0.581 7, 2048, 8, 23, 1, 0.75 7, 2048, 8, 0, 1, 0.75 7, 2056, 8, 23, 1, 0.701 7, 2056, 8, 0, 1, 0.701 7, 4081, 8, 23, 1, 0.691 7, 4081, 8, 0, 1, 0.701 9, 4081, 8, 23, 1, 0.567 9, 4081, 8, 0, 1, 0.567 9, 0, 1, 23, 1, 0.875 9, 0, 2, 0, 1, 0.875 10, 0, 9, 23, 1, 0.875 10, 0, 9, 0, 1, 0.875 10, 9, 9, 23, 1, 0.794 10, 9, 9, 0, 1, 0.796 8, 0, 9, 23, 1, 0.875 8, 0, 9, 0, 1, 0.875 8, 9, 9, 23, 1, 0.804 8, 9, 9, 0, 1, 0.804 10, 2048, 9, 23, 1, 0.64 10, 2048, 9, 0, 1, 0.64 10, 2057, 9, 23, 1, 0.58 10, 2057, 9, 0, 1, 0.581 8, 2048, 9, 23, 1, 0.75 8, 2048, 9, 0, 1, 0.75 8, 2057, 9, 23, 1, 0.694 8, 2057, 9, 0, 1, 0.709 8, 4081, 9, 23, 1, 0.693 8, 4081, 9, 0, 1, 0.712 10, 4081, 9, 23, 1, 0.565 10, 4081, 9, 0, 1, 0.567 10, 0, 1, 23, 1, 0.875 10, 0, 2, 0, 1, 0.875 11, 0, 10, 23, 1, 0.875 11, 0, 10, 0, 1, 0.875 11, 10, 10, 23, 1, 0.795 11, 10, 10, 0, 1, 0.794 9, 0, 10, 23, 1, 0.875 9, 0, 10, 0, 1, 0.875 9, 10, 10, 23, 1, 0.804 9, 10, 10, 0, 1, 0.804 11, 2048, 10, 23, 1, 0.64 11, 2048, 10, 0, 1, 0.64 11, 2058, 10, 23, 1, 0.58 11, 2058, 10, 0, 1, 0.581 9, 2048, 10, 23, 1, 0.75 9, 2048, 10, 0, 1, 0.75 9, 2058, 10, 23, 1, 0.671 9, 2058, 10, 0, 1, 0.671 9, 4081, 10, 23, 1, 0.678 9, 4081, 10, 0, 1, 0.693 11, 4081, 10, 23, 1, 0.567 11, 4081, 10, 0, 1, 0.565 11, 0, 1, 23, 1, 0.875 11, 0, 2, 0, 1, 0.875 12, 0, 11, 23, 1, 
0.875 12, 0, 11, 0, 1, 0.875 12, 11, 11, 23, 1, 0.795 12, 11, 11, 0, 1, 0.794 10, 0, 11, 23, 1, 0.875 10, 0, 11, 0, 1, 0.875 10, 11, 11, 23, 1, 0.791 10, 11, 11, 0, 1, 0.791 12, 2048, 11, 23, 1, 0.64 12, 2048, 11, 0, 1, 0.64 12, 2059, 11, 23, 1, 0.58 12, 2059, 11, 0, 1, 0.581 10, 2048, 11, 23, 1, 0.75 10, 2048, 11, 0, 1, 0.75 10, 2059, 11, 23, 1, 0.689 10, 2059, 11, 0, 1, 0.686 10, 4081, 11, 23, 1, 0.686 10, 4081, 11, 0, 1, 0.709 12, 4081, 11, 23, 1, 0.566 12, 4081, 11, 0, 1, 0.567 12, 0, 1, 23, 1, 0.875 12, 0, 2, 0, 1, 0.875 13, 0, 12, 23, 1, 0.875 13, 0, 12, 0, 1, 0.875 13, 12, 12, 23, 1, 0.794 13, 12, 12, 0, 1, 0.794 11, 0, 12, 23, 1, 0.875 11, 0, 12, 0, 1, 0.875 11, 12, 12, 23, 1, 0.804 11, 12, 12, 0, 1, 0.795 13, 2048, 12, 23, 1, 0.64 13, 2048, 12, 0, 1, 0.64 13, 2060, 12, 23, 1, 0.58 13, 2060, 12, 0, 1, 0.58 11, 2048, 12, 23, 1, 0.75 11, 2048, 12, 0, 1, 0.75 11, 2060, 12, 23, 1, 0.693 11, 2060, 12, 0, 1, 0.701 11, 4081, 12, 23, 1, 0.717 11, 4081, 12, 0, 1, 0.725 13, 4081, 12, 23, 1, 0.566 13, 4081, 12, 0, 1, 0.567 13, 0, 1, 23, 1, 0.875 13, 0, 2, 0, 1, 0.875 14, 0, 13, 23, 1, 0.875 14, 0, 13, 0, 1, 0.875 14, 13, 13, 23, 1, 0.792 14, 13, 13, 0, 1, 0.792 12, 0, 13, 23, 1, 0.875 12, 0, 13, 0, 1, 0.875 12, 13, 13, 23, 1, 0.782 12, 13, 13, 0, 1, 0.804 14, 2048, 13, 23, 1, 0.64 14, 2048, 13, 0, 1, 0.64 14, 2061, 13, 23, 1, 0.579 14, 2061, 13, 0, 1, 0.579 12, 2048, 13, 23, 1, 0.75 12, 2048, 13, 0, 1, 0.75 12, 2061, 13, 23, 1, 0.701 12, 2061, 13, 0, 1, 0.701 12, 4081, 13, 23, 1, 0.705 12, 4081, 13, 0, 1, 0.733 14, 4081, 13, 23, 1, 0.565 14, 4081, 13, 0, 1, 0.565 14, 0, 1, 23, 1, 0.875 14, 0, 2, 0, 1, 0.875 15, 0, 14, 23, 1, 0.875 15, 0, 14, 0, 1, 0.875 15, 14, 14, 23, 1, 0.795 15, 14, 14, 0, 1, 0.794 13, 0, 14, 23, 1, 0.875 13, 0, 14, 0, 1, 0.875 13, 14, 14, 23, 1, 0.804 13, 14, 14, 0, 1, 0.813 15, 2048, 14, 23, 1, 0.64 15, 2048, 14, 0, 1, 0.64 15, 2062, 14, 23, 1, 0.579 15, 2062, 14, 0, 1, 0.58 13, 2048, 14, 23, 1, 0.75 13, 2048, 14, 0, 1, 0.75 13, 2062, 14, 23, 1, 0.705 13, 2062, 14, 0, 1, 0.701 13, 4081, 14, 23, 1, 0.705 13, 4081, 14, 0, 1, 0.733 15, 4081, 14, 23, 1, 0.565 15, 4081, 14, 0, 1, 0.568 15, 0, 1, 23, 1, 0.875 15, 0, 2, 0, 1, 0.875 16, 0, 15, 23, 1, 0.875 16, 0, 15, 0, 1, 0.875 16, 15, 15, 23, 1, 0.795 16, 15, 15, 0, 1, 0.796 14, 0, 15, 23, 1, 0.875 14, 0, 15, 0, 1, 0.875 14, 15, 15, 23, 1, 0.807 14, 15, 15, 0, 1, 0.821 16, 2048, 15, 23, 1, 0.64 16, 2048, 15, 0, 1, 0.64 16, 2063, 15, 23, 1, 0.581 16, 2063, 15, 0, 1, 0.581 14, 2048, 15, 23, 1, 0.75 14, 2048, 15, 0, 1, 0.75 14, 2063, 15, 23, 1, 0.693 14, 2063, 15, 0, 1, 0.685 14, 4081, 15, 23, 1, 0.693 14, 4081, 15, 0, 1, 0.716 16, 4081, 15, 23, 1, 0.862 16, 4081, 15, 0, 1, 0.855 16, 0, 1, 23, 1, 0.875 16, 0, 2, 0, 1, 0.875 17, 0, 16, 23, 1, 0.875 17, 0, 16, 0, 1, 0.875 17, 16, 16, 23, 1, 0.492 17, 16, 16, 0, 1, 0.492 15, 0, 16, 23, 1, 0.875 15, 0, 16, 0, 1, 0.876 15, 16, 16, 23, 1, 0.83 15, 16, 16, 0, 1, 0.841 17, 2048, 16, 23, 1, 0.64 17, 2048, 16, 0, 1, 0.64 17, 2064, 16, 23, 1, 0.492 17, 2064, 16, 0, 1, 0.492 15, 2048, 16, 23, 1, 0.75 15, 2048, 16, 0, 1, 0.75 15, 2064, 16, 23, 1, 0.716 15, 2064, 16, 0, 1, 0.715 15, 4081, 16, 23, 1, 0.716 15, 4081, 16, 0, 1, 0.723 17, 4081, 16, 23, 1, 0.857 17, 4081, 16, 0, 1, 0.856 17, 0, 1, 23, 1, 0.875 17, 0, 2, 0, 1, 0.875 18, 0, 17, 23, 1, 0.875 18, 0, 17, 0, 1, 0.875 18, 17, 17, 23, 1, 0.492 18, 17, 17, 0, 1, 0.492 16, 0, 17, 23, 1, 0.881 16, 0, 17, 0, 1, 0.88 16, 17, 17, 23, 1, 0.661 16, 17, 17, 0, 1, 0.666 18, 2048, 17, 23, 1, 0.64 18, 2048, 17, 0, 1, 0.64 18, 2065, 17, 23, 1, 0.492 
18, 2065, 17, 0, 1, 0.492 16, 2048, 17, 23, 1, 0.75 16, 2048, 17, 0, 1, 0.75 16, 2065, 17, 23, 1, 0.655 16, 2065, 17, 0, 1, 0.664 16, 4081, 17, 23, 1, 0.99 16, 4081, 17, 0, 1, 1.0 18, 4081, 17, 23, 1, 0.863 18, 4081, 17, 0, 1, 0.857 18, 0, 1, 23, 1, 0.879 18, 0, 2, 0, 1, 0.879 19, 0, 18, 23, 1, 0.882 19, 0, 18, 0, 1, 0.881 19, 18, 18, 23, 1, 0.495 19, 18, 18, 0, 1, 0.495 17, 0, 18, 23, 1, 0.885 17, 0, 18, 0, 1, 0.886 17, 18, 18, 23, 1, 0.668 17, 18, 18, 0, 1, 0.66 19, 2048, 18, 23, 1, 0.645 19, 2048, 18, 0, 1, 0.644 19, 2066, 18, 23, 1, 0.496 19, 2066, 18, 0, 1, 0.496 17, 2048, 18, 23, 1, 0.755 17, 2048, 18, 0, 1, 0.756 17, 2066, 18, 23, 1, 0.663 17, 2066, 18, 0, 1, 0.667 17, 4081, 18, 23, 1, 0.997 17, 4081, 18, 0, 1, 0.998 19, 4081, 18, 23, 1, 0.858 19, 4081, 18, 0, 1, 0.858 19, 0, 1, 23, 1, 0.877 19, 0, 2, 0, 1, 0.877 20, 0, 19, 23, 1, 0.876 20, 0, 19, 0, 1, 0.876 20, 19, 19, 23, 1, 0.493 20, 19, 19, 0, 1, 0.493 18, 0, 19, 23, 1, 0.882 18, 0, 19, 0, 1, 0.881 18, 19, 19, 23, 1, 0.659 18, 19, 19, 0, 1, 0.665 20, 2048, 19, 23, 1, 0.64 20, 2048, 19, 0, 1, 0.641 20, 2067, 19, 23, 1, 0.492 20, 2067, 19, 0, 1, 0.492 18, 2048, 19, 23, 1, 0.75 18, 2048, 19, 0, 1, 0.751 18, 2067, 19, 23, 1, 0.648 18, 2067, 19, 0, 1, 0.651 18, 4081, 19, 23, 1, 0.979 18, 4081, 19, 0, 1, 0.993 20, 4081, 19, 23, 1, 0.857 20, 4081, 19, 0, 1, 0.857 20, 0, 1, 23, 1, 0.876 20, 0, 2, 0, 1, 0.876 21, 0, 20, 23, 1, 0.876 21, 0, 20, 0, 1, 0.875 21, 20, 20, 23, 1, 0.492 21, 20, 20, 0, 1, 0.492 19, 0, 20, 23, 1, 0.889 19, 0, 20, 0, 1, 0.89 19, 20, 20, 23, 1, 0.66 19, 20, 20, 0, 1, 0.665 21, 2048, 20, 23, 1, 0.64 21, 2048, 20, 0, 1, 0.64 21, 2068, 20, 23, 1, 0.492 21, 2068, 20, 0, 1, 0.492 19, 2048, 20, 23, 1, 0.75 19, 2048, 20, 0, 1, 0.75 19, 2068, 20, 23, 1, 0.655 19, 2068, 20, 0, 1, 0.649 19, 4081, 20, 23, 1, 0.981 19, 4081, 20, 0, 1, 1.0 21, 4081, 20, 23, 1, 0.858 21, 4081, 20, 0, 1, 0.856 21, 0, 1, 23, 1, 0.877 21, 0, 2, 0, 1, 0.877 22, 0, 21, 23, 1, 0.877 22, 0, 21, 0, 1, 0.876 22, 21, 21, 23, 1, 0.493 22, 21, 21, 0, 1, 0.492 20, 0, 21, 23, 1, 0.878 20, 0, 21, 0, 1, 0.879 20, 21, 21, 23, 1, 0.66 20, 21, 21, 0, 1, 0.66 22, 2048, 21, 23, 1, 0.64 22, 2048, 21, 0, 1, 0.64 22, 2069, 21, 23, 1, 0.493 22, 2069, 21, 0, 1, 0.492 20, 2048, 21, 23, 1, 0.75 20, 2048, 21, 0, 1, 0.75 20, 2069, 21, 23, 1, 0.665 20, 2069, 21, 0, 1, 0.666 20, 4081, 21, 23, 1, 0.985 20, 4081, 21, 0, 1, 0.985 22, 4081, 21, 23, 1, 0.858 22, 4081, 21, 0, 1, 0.856 22, 0, 1, 23, 1, 0.876 22, 0, 2, 0, 1, 0.875 23, 0, 22, 23, 1, 0.875 23, 0, 22, 0, 1, 0.876 23, 22, 22, 23, 1, 0.492 23, 22, 22, 0, 1, 0.492 21, 0, 22, 23, 1, 0.897 21, 0, 22, 0, 1, 0.892 21, 22, 22, 23, 1, 0.659 21, 22, 22, 0, 1, 0.66 23, 2048, 22, 23, 1, 0.639 23, 2048, 22, 0, 1, 0.639 23, 2070, 22, 23, 1, 0.492 23, 2070, 22, 0, 1, 0.492 21, 2048, 22, 23, 1, 0.748 21, 2048, 22, 0, 1, 0.748 21, 2070, 22, 23, 1, 0.65 21, 2070, 22, 0, 1, 0.664 21, 4081, 22, 23, 1, 0.996 21, 4081, 22, 0, 1, 0.995 23, 4081, 22, 23, 1, 0.854 23, 4081, 22, 0, 1, 0.855 23, 0, 1, 23, 1, 0.873 23, 0, 2, 0, 1, 0.873 24, 0, 23, 23, 1, 0.873 24, 0, 23, 0, 1, 0.873 24, 23, 23, 23, 1, 0.491 24, 23, 23, 0, 1, 0.491 22, 0, 23, 23, 1, 0.884 22, 0, 23, 0, 1, 0.884 22, 23, 23, 23, 1, 0.664 22, 23, 23, 0, 1, 0.665 24, 2048, 23, 23, 1, 0.638 24, 2048, 23, 0, 1, 0.639 24, 2071, 23, 23, 1, 0.491 24, 2071, 23, 0, 1, 0.491 22, 2048, 23, 23, 1, 0.748 22, 2048, 23, 0, 1, 0.748 22, 2071, 23, 23, 1, 0.66 22, 2071, 23, 0, 1, 0.66 22, 4081, 23, 23, 1, 0.991 22, 4081, 23, 0, 1, 0.99 24, 4081, 23, 23, 1, 0.855 24, 4081, 23, 0, 1, 0.853 24, 0, 1, 23, 
1, 0.873 24, 0, 2, 0, 1, 0.873 25, 0, 24, 23, 1, 0.873 25, 0, 24, 0, 1, 0.872 25, 24, 24, 23, 1, 0.491 25, 24, 24, 0, 1, 0.491 23, 0, 24, 23, 1, 0.917 23, 0, 24, 0, 1, 0.917 23, 24, 24, 23, 1, 0.66 23, 24, 24, 0, 1, 0.659 25, 2048, 24, 23, 1, 0.638 25, 2048, 24, 0, 1, 0.638 25, 2072, 24, 23, 1, 0.491 25, 2072, 24, 0, 1, 0.491 23, 2048, 24, 23, 1, 0.747 23, 2048, 24, 0, 1, 0.747 23, 2072, 24, 23, 1, 0.648 23, 2072, 24, 0, 1, 0.663 23, 4081, 24, 23, 1, 0.99 23, 4081, 24, 0, 1, 0.996 25, 4081, 24, 23, 1, 0.858 25, 4081, 24, 0, 1, 0.852 25, 0, 1, 23, 1, 0.872 25, 0, 2, 0, 1, 0.872 26, 0, 25, 23, 1, 0.872 26, 0, 25, 0, 1, 0.872 26, 25, 25, 23, 1, 0.491 26, 25, 25, 0, 1, 0.491 24, 0, 25, 23, 1, 0.906 24, 0, 25, 0, 1, 0.897 24, 25, 25, 23, 1, 0.653 24, 25, 25, 0, 1, 0.664 26, 2048, 25, 23, 1, 0.638 26, 2048, 25, 0, 1, 0.638 26, 2073, 25, 23, 1, 0.491 26, 2073, 25, 0, 1, 0.491 24, 2048, 25, 23, 1, 0.747 24, 2048, 25, 0, 1, 0.748 24, 2073, 25, 23, 1, 0.663 24, 2073, 25, 0, 1, 0.657 24, 4081, 25, 23, 1, 0.991 24, 4081, 25, 0, 1, 0.995 26, 4081, 25, 23, 1, 0.853 26, 4081, 25, 0, 1, 0.852 26, 0, 1, 23, 1, 0.872 26, 0, 2, 0, 1, 0.872 27, 0, 26, 23, 1, 0.873 27, 0, 26, 0, 1, 0.873 27, 26, 26, 23, 1, 0.492 27, 26, 26, 0, 1, 0.492 25, 0, 26, 23, 1, 0.919 25, 0, 26, 0, 1, 0.92 25, 26, 26, 23, 1, 0.661 25, 26, 26, 0, 1, 0.656 27, 2048, 26, 23, 1, 0.639 27, 2048, 26, 0, 1, 0.639 27, 2074, 26, 23, 1, 0.492 27, 2074, 26, 0, 1, 0.492 25, 2048, 26, 23, 1, 0.749 25, 2048, 26, 0, 1, 0.749 25, 2074, 26, 23, 1, 0.665 25, 2074, 26, 0, 1, 0.662 25, 4081, 26, 23, 1, 0.988 25, 4081, 26, 0, 1, 0.998 27, 4081, 26, 23, 1, 0.855 27, 4081, 26, 0, 1, 0.854 27, 0, 1, 23, 1, 0.874 27, 0, 2, 0, 1, 0.874 28, 0, 27, 23, 1, 0.874 28, 0, 27, 0, 1, 0.874 28, 27, 27, 23, 1, 0.492 28, 27, 27, 0, 1, 0.492 26, 0, 27, 23, 1, 0.908 26, 0, 27, 0, 1, 0.908 26, 27, 27, 23, 1, 0.658 26, 27, 27, 0, 1, 0.665 28, 2048, 27, 23, 1, 0.639 28, 2048, 27, 0, 1, 0.639 28, 2075, 27, 23, 1, 0.492 28, 2075, 27, 0, 1, 0.492 26, 2048, 27, 23, 1, 0.749 26, 2048, 27, 0, 1, 0.749 26, 2075, 27, 23, 1, 0.664 26, 2075, 27, 0, 1, 0.665 26, 4081, 27, 23, 1, 0.999 26, 4081, 27, 0, 1, 0.998 28, 4081, 27, 23, 1, 0.855 28, 4081, 27, 0, 1, 0.855 28, 0, 1, 23, 1, 0.874 28, 0, 2, 0, 1, 0.874 29, 0, 28, 23, 1, 0.874 29, 0, 28, 0, 1, 0.874 29, 28, 28, 23, 1, 0.492 29, 28, 28, 0, 1, 0.492 27, 0, 28, 23, 1, 0.919 27, 0, 28, 0, 1, 0.919 27, 28, 28, 23, 1, 0.665 27, 28, 28, 0, 1, 0.655 29, 2048, 28, 23, 1, 0.64 29, 2048, 28, 0, 1, 0.64 29, 2076, 28, 23, 1, 0.492 29, 2076, 28, 0, 1, 0.492 27, 2048, 28, 23, 1, 0.75 27, 2048, 28, 0, 1, 0.752 27, 2076, 28, 23, 1, 0.657 27, 2076, 28, 0, 1, 0.667 27, 4081, 28, 23, 1, 0.981 27, 4081, 28, 0, 1, 0.998 29, 4081, 28, 23, 1, 0.859 29, 4081, 28, 0, 1, 0.858 29, 0, 1, 23, 1, 0.876 29, 0, 2, 0, 1, 0.876 30, 0, 29, 23, 1, 0.876 30, 0, 29, 0, 1, 0.875 30, 29, 29, 23, 1, 0.493 30, 29, 29, 0, 1, 0.494 28, 0, 29, 23, 1, 0.919 28, 0, 29, 0, 1, 0.913 28, 29, 29, 23, 1, 0.668 28, 29, 29, 0, 1, 0.669 30, 2048, 29, 23, 1, 0.642 30, 2048, 29, 0, 1, 0.643 30, 2077, 29, 23, 1, 0.495 30, 2077, 29, 0, 1, 0.495 28, 2048, 29, 23, 1, 0.754 28, 2048, 29, 0, 1, 0.753 28, 2077, 29, 23, 1, 0.663 28, 2077, 29, 0, 1, 0.664 28, 4081, 29, 23, 1, 0.998 28, 4081, 29, 0, 1, 0.98 30, 4081, 29, 23, 1, 0.854 30, 4081, 29, 0, 1, 0.852 30, 0, 1, 23, 1, 0.872 30, 0, 2, 0, 1, 0.871 31, 0, 30, 23, 1, 0.872 31, 0, 30, 0, 1, 0.87 31, 30, 30, 23, 1, 0.49 31, 30, 30, 0, 1, 0.49 29, 0, 30, 23, 1, 0.921 29, 0, 30, 0, 1, 0.924 29, 30, 30, 23, 1, 0.659 29, 30, 30, 0, 1, 0.664 31, 2048, 
30, 23, 1, 0.639 31, 2048, 30, 0, 1, 0.64 31, 2078, 30, 23, 1, 0.492 31, 2078, 30, 0, 1, 0.492 29, 2048, 30, 23, 1, 0.75 29, 2048, 30, 0, 1, 0.75 29, 2078, 30, 23, 1, 0.661 29, 2078, 30, 0, 1, 0.665 29, 4081, 30, 23, 1, 0.98 29, 4081, 30, 0, 1, 0.987 31, 4081, 30, 23, 1, 0.861 31, 4081, 30, 0, 1, 0.855 31, 0, 1, 23, 1, 0.875 31, 0, 2, 0, 1, 0.875 32, 0, 31, 23, 1, 0.875 32, 0, 31, 0, 1, 0.875 32, 31, 31, 23, 1, 0.556 32, 31, 31, 0, 1, 0.556 30, 0, 31, 23, 1, 0.93 30, 0, 31, 0, 1, 0.92 30, 31, 31, 23, 1, 0.666 30, 31, 31, 0, 1, 0.666 32, 2048, 31, 23, 1, 0.625 32, 2048, 31, 0, 1, 0.625 32, 2079, 31, 23, 1, 0.556 32, 2079, 31, 0, 1, 0.556 30, 2048, 31, 23, 1, 0.75 30, 2048, 31, 0, 1, 0.75 30, 2079, 31, 23, 1, 0.666 30, 2079, 31, 0, 1, 0.655 30, 4081, 31, 23, 1, 0.993 30, 4081, 31, 0, 1, 0.999 32, 4081, 31, 23, 1, 0.857 32, 4081, 31, 0, 1, 0.855 32, 0, 1, 23, 1, 0.875 32, 0, 2, 0, 1, 0.875

> ---
>  sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++------------
>  1 file changed, 268 insertions(+), 271 deletions(-)
*/ > - vpcmpb $0, (%rdi), %YMMMATCH, %k1 > + testl %ecx, %ecx > + jnz L(ret_vec_x1) > > - movl %r8d, %ecx > + /* Used no matter what. */ > + vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 > + kmovd %k0, %ecx > > - kmovd %k1, %eax > + cmpl $(VEC_SIZE * 3), %edx > + ja L(last_vec) > > - /* Remove the leading bytes. Must use unsigned right shift for > - bsrl below. */ > - shrl %cl, %eax > - testl %eax, %eax > - jz L(zero) > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 2 + 1), %rax > + subq %rcx, %rax > + cmpq %rax, %rdi > + jbe L(ret_1) > + xorl %eax, %eax > +L(ret_1): > + ret > > - bsrl %eax, %eax > - addq %rdi, %rax > - addq %r8, %rax > + .p2align 4,, 6 > +L(loop_end): > + kmovd %k1, %ecx > + notl %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x0_end) > + > + vptestnmb %VEC(2), %VEC(2), %k0 > + kmovd %k0, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1_end) > + > + kmovd %k2, %ecx > + kmovd %k4, %esi > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > + then it won't affect the result in esi (VEC4). If ecx is non-zero > + then CHAR in VEC3 and bsrq will use that position. */ > + salq $32, %rcx > + orq %rsi, %rcx > + bsrq %rcx, %rcx > + addq %rcx, %rax > + ret > + .p2align 4,, 4 > +L(ret_vec_x0_end): > + addq $(VEC_SIZE), %rax > +L(ret_vec_x1_end): > + bsrl %ecx, %ecx > + leaq (VEC_SIZE * 2)(%rax, %rcx), %rax > ret > -END (__memrchr_evex) > + > +END(MEMRCHR) > #endif > -- > 2.34.1 > ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH v1 6/8] x86: Optimize memrchr-avx2.S
  2022-06-03  4:42 [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein
                   ` (3 preceding siblings ...)
  2022-06-03  4:42 ` [PATCH v1 5/8] x86: Optimize memrchr-evex.S Noah Goldstein
@ 2022-06-03  4:42 ` Noah Goldstein
  2022-06-03  4:50   ` Noah Goldstein
  2022-06-03  4:42 ` [PATCH v1 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 82+ messages in thread
From: Noah Goldstein @ 2022-06-03  4:42 UTC (permalink / raw)
  To: libc-alpha

The new code:
    1. prioritizes smaller user-arg lengths more.
    2. optimizes target placement more carefully.
    3. reuses logic more.
    4. fixes up various inefficiencies in the logic. The biggest
       case here is the `lzcnt` logic for checking returns, which
       saves either a branch or multiple instructions.

The total code size saving is: 306 bytes
Geometric Mean of all benchmarks New / Old: 0.760

Regressions:
There are some regressions. Particularly where the length (user arg
length) is large but the position of the match char is near the
beginning of the string (in the first VEC). This case has roughly a
10-20% regression.

This is because the new logic gives the hot path for immediate matches
to shorter lengths (the more common input). That case has roughly
a 15-45% speedup.

Full xcheck passes on x86_64.
---
 sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S |   1 +
 sysdeps/x86_64/multiarch/memrchr-avx2.S     | 538 ++++++++++----------
 2 files changed, 260 insertions(+), 279 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
index cea2d2a72d..5e9beeeef2 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -2,6 +2,7 @@
 # define MEMRCHR __memrchr_avx2_rtm
 #endif
 
+#define COND_VZEROUPPER COND_VZEROUPPER_XTEST
 #define ZERO_UPPER_VEC_REGISTERS_RETURN \
 	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index ba2ce7cb03..6915e1c373 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -21,340 +21,320 @@
 # include <sysdep.h>
 
 # ifndef MEMRCHR
-# define MEMRCHR __memrchr_avx2
+# define MEMRCHR	__memrchr_avx2
 # endif
 
 # ifndef VZEROUPPER
-# define VZEROUPPER vzeroupper
+# define VZEROUPPER	vzeroupper
 # endif
 
+// abf-off
 # ifndef SECTION
 #  define SECTION(p) p##.avx
 # endif
+// abf-on
+
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+	.section SECTION(.text), "ax", @progbits
+ENTRY(MEMRCHR)
+# ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+# else
+	test	%RDX_LP, %RDX_LP
+# endif
+	jz	L(zero_0)
 
-# define VEC_SIZE 32
-
-	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMRCHR)
-	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
-	vpbroadcastb %xmm0, %ymm0
-
-	sub	$VEC_SIZE, %RDX_LP
-	jbe	L(last_vec_or_less)
-
-	add	%RDX_LP, %RDI_LP
+	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+	   correct page cross check and 2) it correctly sets up the end pointer
+	   so that lzcnt can be subtracted from it directly.  */
+	leaq	-1(%rdx, %rdi), %rax
 
-	/* Check the last VEC_SIZE bytes.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
+	vpbroadcastb %xmm0, %ymm0
 
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
+	/* Check if we can load 1x VEC without crossing a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jz	L(page_cross)
+
+	vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	cmpq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+
+L(ret_vec_x0_test):
+	/* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE),
+	   which guarantees edx (len) is less than it.  */
+	lzcntl	%ecx, %ecx
+
+	/* Hoist vzeroupper (not great for RTM) to save code size. This allows
+	   all logic for edx (len) <= VEC_SIZE to fit in first cache line.  */
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vmovdqa	(%rdi), %ymm1
-	vmovdqa	VEC_SIZE(%rdi), %ymm2
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
-
-	vpcmpeqb %ymm1, %ymm0, %ymm1
-	vpcmpeqb %ymm2, %ymm0, %ymm2
-	vpcmpeqb %ymm3, %ymm0, %ymm3
-	vpcmpeqb %ymm4, %ymm0, %ymm4
-
-	vpor	%ymm1, %ymm2, %ymm5
-	vpor	%ymm3, %ymm4, %ymm6
-	vpor	%ymm5, %ymm6, %ymm5
-
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpmovmskb %ymm1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Align rax (string pointer).  */
+	andq	$-VEC_SIZE, %rax
+
+	/* Recompute remaining length after aligning.  */
+	movq	%rax, %rdx
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1
+	subq	%rdi, %rdx
+	decq	%rax
+	vpmovmskb %ymm1, %ecx
+	/* Fall through for short (hotter than longer lengths).  */
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* 64-bit lzcnt. This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target alignments
+	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+	   case, which in turn is necessary for the hot path (len <= VEC_SIZE)
+	   to fit in the first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpeqb (%rsi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %r8d
+	/* notl because eax already has endptr - 1. (-x = ~(x - 1)).  */
+	notl	%r8d
+	shlxl	%r8d, %ecx, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
+	.p2align 4,, 11
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
+	.p2align 4,, 10
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	.p2align 4
-L(null):
+	/* First in aligning bytes.  */
+L(zero_2):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
+	.p2align 4,, 4
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	vpcmpeqb (%rdi), %ymm0, %ymm1
 
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
+	.p2align 4,, 11
+L(ret_vec_x2):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -3 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	vpmovmskb %ymm1, %eax
+	.p2align 4,, 14
+L(ret_vec_x3):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
 	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(null)
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecessary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	/* Align rax to (VEC_SIZE - 1).  */
+	orq	$(VEC_SIZE * 4 - 1), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	orq	$(VEC_SIZE * 4 - 1), %rdx
 
-	/* Check the last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
 	.p2align 4
+L(loop_4x_vec):
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	vpor	%ymm1, %ymm2, %ymm2
+	vpor	%ymm3, %ymm4, %ymm4
+	vpor	%ymm2, %ymm4, %ymm4
+	vpmovmskb %ymm4, %esi
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	testl	%esi, %esi
+	jnz	L(loop_end)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	VZEROUPPER_RETURN
+	addq	$(VEC_SIZE * -4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
 
-	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
+	subl	%edi, %edx
+	incl	%edx
 
-	/* Check the last VEC.  */
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+L(last_4x_vec):
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	vpmovmskb %ymm1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret0)
+	xorl	%eax, %eax
+L(ret0):
+	ret
 
-	movl	%r8d, %ecx
 
-	vpmovmskb %ymm1, %eax
 	.p2align 4
+L(loop_end):
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vpmovmskb %ymm2, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	vpmovmskb %ymm3, %ecx
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR is in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the leading bytes. Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	.p2align 4,, 4
+L(ret_vec_x1_end):
+	/* 64-bit version will automatically add 32 (VEC_SIZE).  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
+	VZEROUPPER_RETURN
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
-END (MEMRCHR)
+
+	/* 2 bytes until next cache line.  */
+END(MEMRCHR)
 #endif
-- 
2.34.1

^ permalink raw reply	[flat|nested] 82+ messages in thread
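The lzcnt return logic the commit message credits with the savings relies on the end pointer being set up as s + len - 1: subtracting the leading-zero count of an end-anchored match mask gives the address of the last match directly, and lzcnt of zero is defined as the operand width, so one length comparison rejects the no-match case and out-of-range matches together. A minimal C sketch of the two return computations (same scalar mask convention as the sketch above; hypothetical helper names, not the patch's code):

#include <stddef.h>
#include <stdint.h>

/* L(ret_vec_x0_test) shape: end1 = s + len - 1, len <= 32, and bit 31
   of mask corresponds to the byte at end1.  Returns NULL if there is
   no match inside the buffer.  */
static const unsigned char *
last_match_in_vec (const unsigned char *end1, uint32_t mask, size_t len)
{
  /* x86 lzcnt of 0 is 32 for a 32-bit operand; __builtin_clz (0) is
     undefined, hence the explicit check.  Both "no match" and "match
     before the start" fail the single comparison below -- the saved
     branch.  */
  unsigned int lz = mask == 0 ? 32 : (unsigned int) __builtin_clz (mask);
  if (lz >= len)
    return NULL;
  return end1 - lz;
}

/* L(loop_end) shape: merge two adjacent 32-byte masks (hi covers the
   higher-addressed bytes) so one bsr finds the last match across 64
   bytes.  Returns the bit index relative to the lower block's base,
   or -1 if neither block matched.  */
static int
last_match_bit64 (uint32_t lo, uint32_t hi)
{
  uint64_t m = ((uint64_t) hi << 32) | lo;
  return m == 0 ? -1 : 63 - __builtin_clzll (m);
}

The same widening explains the "naturally add 32" comments in both diffs: a 32-bit mask left in a 64-bit register has 32 extra leading zeros, so a 64-bit lzcnt automatically accounts for the vector sitting one VEC_SIZE further from the end.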
* Re: [PATCH v1 6/8] x86: Optimize memrchr-avx2.S
  2022-06-03  4:42 ` [PATCH v1 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein
@ 2022-06-03  4:50   ` Noah Goldstein
  0 siblings, 0 replies; 82+ messages in thread
From: Noah Goldstein @ 2022-06-03  4:50 UTC (permalink / raw)
  To: GNU C Library

[-- Attachment #1: Type: text/plain, Size: 19793 bytes --]

On Thu, Jun 2, 2022 at 11:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code:
>    1. prioritizes smaller user-arg lengths more.
>    2. optimizes target placement more carefully.
>    3. reuses logic more.
>    4. fixes up various inefficiencies in the logic. The biggest
>       case here is the `lzcnt` logic for checking returns, which
>       saves either a branch or multiple instructions.
>
> The total code size saving is: 306 bytes
> Geometric Mean of all benchmarks New / Old: 0.760
>
> Regressions:
> There are some regressions. Particularly where the length (user arg
> length) is large but the position of the match char is near the
> beginning of the string (in the first VEC). This case has roughly a
> 10-20% regression.
>
> This is because the new logic gives the hot path for immediate matches
> to shorter lengths (the more common input). That case has roughly
> a 15-45% speedup.
>
> Full xcheck passes on x86_64.
> ---
>  sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S |   1 +
>  sysdeps/x86_64/multiarch/memrchr-avx2.S     | 538 ++++++++++----------
>  2 files changed, 260 insertions(+), 279 deletions(-)

[-- Attachment #2: tgl-memrchr-avx2.txt --]
[-- Type: text/plain, Size: 83424 bytes --]

Geometric mean of N = 30 runs.
Benchmarked on Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html Aggregate Geometric Mean of New / Old: 0.7606704539854201 Results For: memrchr len, align, pos, seek_char, invert_pos, New / Old 2048, 0, 32, 23, 0, 0.938 256, 1, 64, 23, 0, 0.813 2048, 0, 32, 0, 0, 0.948 256, 1, 64, 0, 0, 0.811 256, 4081, 64, 0, 0, 0.642 256, 0, 1, 23, 0, 0.77 256, 0, 1, 0, 0, 0.771 256, 1, 1, 23, 0, 0.788 256, 1, 1, 0, 0, 0.788 2048, 0, 64, 23, 0, 0.95 256, 2, 64, 23, 0, 0.817 2048, 0, 64, 0, 0, 0.96 256, 2, 64, 0, 0, 0.811 256, 0, 2, 23, 0, 0.772 256, 0, 2, 0, 0, 0.772 256, 2, 2, 23, 0, 0.789 256, 2, 2, 0, 0, 0.789 2048, 0, 128, 23, 0, 0.979 256, 3, 64, 23, 0, 0.811 2048, 0, 128, 0, 0, 0.971 256, 3, 64, 0, 0, 0.816 256, 0, 3, 23, 0, 0.772 256, 0, 3, 0, 0, 0.772 256, 3, 3, 23, 0, 0.788 256, 3, 3, 0, 0, 0.787 2048, 0, 256, 23, 0, 0.949 256, 4, 64, 23, 0, 0.808 2048, 0, 256, 0, 0, 0.952 256, 4, 64, 0, 0, 0.811 256, 0, 4, 23, 0, 0.778 256, 0, 4, 0, 0, 0.78 256, 4, 4, 23, 0, 0.79 256, 4, 4, 0, 0, 0.79 2048, 0, 512, 23, 0, 0.971 256, 5, 64, 23, 0, 0.817 2048, 0, 512, 0, 0, 0.969 256, 5, 64, 0, 0, 0.822 256, 0, 5, 23, 0, 0.78 256, 0, 5, 0, 0, 0.77 256, 5, 5, 23, 0, 0.791 256, 5, 5, 0, 0, 0.791 2048, 0, 1024, 23, 0, 1.007 256, 6, 64, 23, 0, 0.812 2048, 0, 1024, 0, 0, 1.002 256, 6, 64, 0, 0, 0.811 256, 0, 6, 23, 0, 0.773 256, 0, 6, 0, 0, 0.774 256, 6, 6, 23, 0, 0.791 256, 6, 6, 0, 0, 0.791 2048, 0, 2048, 23, 0, 0.969 256, 7, 64, 23, 0, 0.818 2048, 0, 2048, 0, 0, 0.959 256, 7, 64, 0, 0, 0.816 256, 0, 7, 23, 0, 0.773 256, 0, 7, 0, 0, 0.776 256, 7, 7, 23, 0, 0.791 256, 7, 7, 0, 0, 0.798 192, 1, 32, 23, 0, 0.647 192, 1, 32, 0, 0, 0.643 256, 1, 32, 23, 0, 0.803 256, 1, 32, 0, 0, 0.808 512, 1, 32, 23, 0, 0.838 512, 1, 32, 0, 0, 0.834 256, 4081, 32, 23, 0, 0.725 192, 2, 64, 23, 0, 0.919 192, 2, 64, 0, 0, 0.919 512, 2, 64, 23, 0, 0.878 512, 2, 64, 0, 0, 0.875 256, 4081, 64, 23, 0, 0.645 192, 3, 96, 23, 0, 0.919 192, 3, 96, 0, 0, 0.934 256, 3, 96, 23, 0, 0.829 256, 3, 96, 0, 0, 0.83 512, 3, 96, 23, 0, 0.883 512, 3, 96, 0, 0, 0.886 256, 4081, 96, 23, 0, 0.649 192, 4, 128, 23, 0, 0.877 192, 4, 128, 0, 0, 0.862 256, 4, 128, 23, 0, 0.915 256, 4, 128, 0, 0, 0.918 512, 4, 128, 23, 0, 0.892 512, 4, 128, 0, 0, 0.89 256, 4081, 128, 23, 0, 0.927 192, 5, 160, 23, 0, 1.164 192, 5, 160, 0, 0, 1.157 256, 5, 160, 23, 0, 0.928 256, 5, 160, 0, 0, 0.93 512, 5, 160, 23, 0, 0.857 512, 5, 160, 0, 0, 0.864 256, 4081, 160, 23, 0, 0.94 192, 6, 192, 23, 0, 0.699 192, 6, 192, 0, 0, 0.701 256, 6, 192, 23, 0, 0.865 256, 6, 192, 0, 0, 0.873 512, 6, 192, 23, 0, 0.836 512, 6, 192, 0, 0, 0.835 256, 4081, 192, 23, 0, 0.896 192, 7, 224, 23, 0, 0.702 192, 7, 224, 0, 0, 0.701 256, 7, 224, 23, 0, 1.16 256, 7, 224, 0, 0, 1.161 512, 7, 224, 23, 0, 0.854 512, 7, 224, 0, 0, 0.862 256, 4081, 224, 23, 0, 1.155 2, 0, 1, 23, 0, 0.812 2, 0, 1, 0, 0, 0.836 2, 1, 1, 23, 0, 0.72 2, 1, 1, 0, 0, 0.726 0, 0, 1, 23, 0, 0.857 0, 0, 1, 0, 0, 0.857 0, 1, 1, 23, 0, 0.857 0, 1, 1, 0, 0, 0.857 2, 2048, 1, 23, 0, 0.694 2, 2048, 1, 0, 0, 0.71 2, 2049, 1, 23, 0, 0.621 2, 2049, 1, 0, 0, 0.629 0, 2048, 1, 23, 0, 0.857 0, 2048, 1, 0, 0, 0.856 0, 2049, 1, 23, 0, 0.857 0, 2049, 1, 0, 0, 0.857 0, 4081, 1, 23, 0, 0.857 0, 4081, 1, 0, 0, 0.857 2, 4081, 1, 23, 0, 0.602 2, 4081, 1, 0, 0, 0.621 2, 0, 2, 0, 0, 0.777 3, 0, 2, 23, 0, 0.827 3, 0, 2, 0, 0, 0.819 3, 2, 2, 23, 0, 0.737 3, 2, 2, 0, 0, 0.731 1, 0, 2, 23, 0, 0.778 1, 0, 2, 0, 0, 0.778 1, 2, 2, 23, 0, 0.868 1, 2, 2, 0, 0, 0.868 3, 2048, 2, 23, 0, 
0.716 3, 2048, 2, 0, 0, 0.719 3, 2050, 2, 23, 0, 0.625 3, 2050, 2, 0, 0, 0.632 1, 2048, 2, 23, 0, 0.667 1, 2048, 2, 0, 0, 0.667 1, 2050, 2, 23, 0, 0.749 1, 2050, 2, 0, 0, 0.752 1, 4081, 2, 23, 0, 0.743 1, 4081, 2, 0, 0, 0.745 3, 4081, 2, 23, 0, 0.601 3, 4081, 2, 0, 0, 0.613 3, 0, 1, 23, 0, 0.832 4, 0, 3, 23, 0, 0.834 4, 0, 3, 0, 0, 0.838 4, 3, 3, 23, 0, 0.728 4, 3, 3, 0, 0, 0.718 2, 0, 3, 23, 0, 0.778 2, 0, 3, 0, 0, 0.777 2, 3, 3, 23, 0, 0.868 2, 3, 3, 0, 0, 0.87 4, 2048, 3, 23, 0, 0.711 4, 2048, 3, 0, 0, 0.711 4, 2051, 3, 23, 0, 0.621 4, 2051, 3, 0, 0, 0.622 2, 2048, 3, 23, 0, 0.669 2, 2048, 3, 0, 0, 0.669 2, 2051, 3, 23, 0, 0.746 2, 2051, 3, 0, 0, 0.746 2, 4081, 3, 23, 0, 0.745 2, 4081, 3, 0, 0, 0.744 4, 4081, 3, 23, 0, 0.587 4, 4081, 3, 0, 0, 0.585 4, 0, 1, 23, 0, 0.846 4, 0, 2, 0, 0, 0.834 5, 0, 4, 23, 0, 0.84 5, 0, 4, 0, 0, 0.838 5, 4, 4, 23, 0, 0.736 5, 4, 4, 0, 0, 0.733 3, 0, 4, 23, 0, 0.779 3, 0, 4, 0, 0, 0.779 3, 4, 4, 23, 0, 0.869 3, 4, 4, 0, 0, 0.869 5, 2048, 4, 23, 0, 0.693 5, 2048, 4, 0, 0, 0.713 5, 2052, 4, 23, 0, 0.615 5, 2052, 4, 0, 0, 0.609 3, 2048, 4, 23, 0, 0.666 3, 2048, 4, 0, 0, 0.668 3, 2052, 4, 23, 0, 0.748 3, 2052, 4, 0, 0, 0.745 3, 4081, 4, 23, 0, 0.745 3, 4081, 4, 0, 0, 0.745 5, 4081, 4, 23, 0, 0.566 5, 4081, 4, 0, 0, 0.589 5, 0, 1, 23, 0, 0.841 5, 0, 2, 0, 0, 0.832 6, 0, 5, 23, 0, 0.832 6, 0, 5, 0, 0, 0.836 6, 5, 5, 23, 0, 0.736 6, 5, 5, 0, 0, 0.732 4, 0, 5, 23, 0, 0.778 4, 0, 5, 0, 0, 0.779 4, 5, 5, 23, 0, 0.871 4, 5, 5, 0, 0, 0.87 6, 2048, 5, 23, 0, 0.713 6, 2048, 5, 0, 0, 0.702 6, 2053, 5, 23, 0, 0.611 6, 2053, 5, 0, 0, 0.632 4, 2048, 5, 23, 0, 0.667 4, 2048, 5, 0, 0, 0.667 4, 2053, 5, 23, 0, 0.746 4, 2053, 5, 0, 0, 0.745 4, 4081, 5, 23, 0, 0.746 4, 4081, 5, 0, 0, 0.744 6, 4081, 5, 23, 0, 0.584 6, 4081, 5, 0, 0, 0.602 6, 0, 1, 23, 0, 0.842 6, 0, 2, 0, 0, 0.848 7, 0, 6, 23, 0, 0.837 7, 0, 6, 0, 0, 0.838 7, 6, 6, 23, 0, 0.739 7, 6, 6, 0, 0, 0.721 5, 0, 6, 23, 0, 0.778 5, 0, 6, 0, 0, 0.777 5, 6, 6, 23, 0, 0.869 5, 6, 6, 0, 0, 0.869 7, 2048, 6, 23, 0, 0.719 7, 2048, 6, 0, 0, 0.699 7, 2054, 6, 23, 0, 0.618 7, 2054, 6, 0, 0, 0.621 5, 2048, 6, 23, 0, 0.667 5, 2048, 6, 0, 0, 0.667 5, 2054, 6, 23, 0, 0.747 5, 2054, 6, 0, 0, 0.75 5, 4081, 6, 23, 0, 0.745 5, 4081, 6, 0, 0, 0.743 7, 4081, 6, 23, 0, 0.586 7, 4081, 6, 0, 0, 0.572 7, 0, 1, 23, 0, 0.836 7, 0, 2, 0, 0, 0.848 8, 0, 7, 23, 0, 0.843 8, 0, 7, 0, 0, 0.827 8, 7, 7, 23, 0, 0.733 8, 7, 7, 0, 0, 0.744 6, 0, 7, 23, 0, 0.778 6, 0, 7, 0, 0, 0.781 6, 7, 7, 23, 0, 0.869 6, 7, 7, 0, 0, 0.872 8, 2048, 7, 23, 0, 0.717 8, 2048, 7, 0, 0, 0.703 8, 2055, 7, 23, 0, 0.608 8, 2055, 7, 0, 0, 0.608 6, 2048, 7, 23, 0, 0.667 6, 2048, 7, 0, 0, 0.669 6, 2055, 7, 23, 0, 0.749 6, 2055, 7, 0, 0, 0.748 6, 4081, 7, 23, 0, 0.742 6, 4081, 7, 0, 0, 0.745 8, 4081, 7, 23, 0, 0.602 8, 4081, 7, 0, 0, 0.59 8, 0, 1, 23, 0, 0.833 8, 0, 2, 0, 0, 0.837 9, 0, 8, 23, 0, 0.822 9, 0, 8, 0, 0, 0.824 9, 8, 8, 23, 0, 0.725 9, 8, 8, 0, 0, 0.734 7, 0, 8, 23, 0, 0.778 7, 0, 8, 0, 0, 0.778 7, 8, 8, 23, 0, 0.871 7, 8, 8, 0, 0, 0.873 9, 2048, 8, 23, 0, 0.709 9, 2048, 8, 0, 0, 0.713 9, 2056, 8, 23, 0, 0.616 9, 2056, 8, 0, 0, 0.616 7, 2048, 8, 23, 0, 0.667 7, 2048, 8, 0, 0, 0.668 7, 2056, 8, 23, 0, 0.749 7, 2056, 8, 0, 0, 0.747 7, 4081, 8, 23, 0, 0.744 7, 4081, 8, 0, 0, 0.742 9, 4081, 8, 23, 0, 0.581 9, 4081, 8, 0, 0, 0.591 9, 0, 1, 23, 0, 0.842 9, 0, 2, 0, 0, 0.83 10, 0, 9, 23, 0, 0.834 10, 0, 9, 0, 0, 0.851 10, 9, 9, 23, 0, 0.728 10, 9, 9, 0, 0, 0.726 8, 0, 9, 23, 0, 0.778 8, 0, 9, 0, 0, 0.776 8, 9, 9, 23, 0, 0.872 8, 9, 9, 0, 0, 0.868 10, 2048, 9, 23, 0, 0.697 10, 
2048, 9, 0, 0, 0.712 10, 2057, 9, 23, 0, 0.605 10, 2057, 9, 0, 0, 0.61 8, 2048, 9, 23, 0, 0.667 8, 2048, 9, 0, 0, 0.667 8, 2057, 9, 23, 0, 0.744 8, 2057, 9, 0, 0, 0.745 8, 4081, 9, 23, 0, 0.745 8, 4081, 9, 0, 0, 0.743 10, 4081, 9, 23, 0, 0.57 10, 4081, 9, 0, 0, 0.593 10, 0, 1, 23, 0, 0.841 10, 0, 2, 0, 0, 0.841 11, 0, 10, 23, 0, 0.815 11, 0, 10, 0, 0, 0.834 11, 10, 10, 23, 0, 0.735 11, 10, 10, 0, 0, 0.73 9, 0, 10, 23, 0, 0.778 9, 0, 10, 0, 0, 0.779 9, 10, 10, 23, 0, 0.867 9, 10, 10, 0, 0, 0.87 11, 2048, 10, 23, 0, 0.707 11, 2048, 10, 0, 0, 0.697 11, 2058, 10, 23, 0, 0.614 11, 2058, 10, 0, 0, 0.616 9, 2048, 10, 23, 0, 0.667 9, 2048, 10, 0, 0, 0.666 9, 2058, 10, 23, 0, 0.744 9, 2058, 10, 0, 0, 0.744 9, 4081, 10, 23, 0, 0.745 9, 4081, 10, 0, 0, 0.744 11, 4081, 10, 23, 0, 0.59 11, 4081, 10, 0, 0, 0.588 11, 0, 1, 23, 0, 0.84 11, 0, 2, 0, 0, 0.837 12, 0, 11, 23, 0, 0.831 12, 0, 11, 0, 0, 0.831 12, 11, 11, 23, 0, 0.725 12, 11, 11, 0, 0, 0.737 10, 0, 11, 23, 0, 0.78 10, 0, 11, 0, 0, 0.778 10, 11, 11, 23, 0, 0.867 10, 11, 11, 0, 0, 0.87 12, 2048, 11, 23, 0, 0.715 12, 2048, 11, 0, 0, 0.723 12, 2059, 11, 23, 0, 0.623 12, 2059, 11, 0, 0, 0.613 10, 2048, 11, 23, 0, 0.668 10, 2048, 11, 0, 0, 0.667 10, 2059, 11, 23, 0, 0.743 10, 2059, 11, 0, 0, 0.743 10, 4081, 11, 23, 0, 0.744 10, 4081, 11, 0, 0, 0.743 12, 4081, 11, 23, 0, 0.605 12, 4081, 11, 0, 0, 0.586 12, 0, 1, 23, 0, 0.843 12, 0, 2, 0, 0, 0.846 13, 0, 12, 23, 0, 0.829 13, 0, 12, 0, 0, 0.832 13, 12, 12, 23, 0, 0.731 13, 12, 12, 0, 0, 0.727 11, 0, 12, 23, 0, 0.778 11, 0, 12, 0, 0, 0.777 11, 12, 12, 23, 0, 0.87 11, 12, 12, 0, 0, 0.87 13, 2048, 12, 23, 0, 0.714 13, 2048, 12, 0, 0, 0.713 13, 2060, 12, 23, 0, 0.618 13, 2060, 12, 0, 0, 0.614 11, 2048, 12, 23, 0, 0.667 11, 2048, 12, 0, 0, 0.667 11, 2060, 12, 23, 0, 0.744 11, 2060, 12, 0, 0, 0.744 11, 4081, 12, 23, 0, 0.744 11, 4081, 12, 0, 0, 0.743 13, 4081, 12, 23, 0, 0.586 13, 4081, 12, 0, 0, 0.589 13, 0, 1, 23, 0, 0.838 13, 0, 2, 0, 0, 0.83 14, 0, 13, 23, 0, 0.838 14, 0, 13, 0, 0, 0.843 14, 13, 13, 23, 0, 0.739 14, 13, 13, 0, 0, 0.728 12, 0, 13, 23, 0, 0.778 12, 0, 13, 0, 0, 0.778 12, 13, 13, 23, 0, 0.868 12, 13, 13, 0, 0, 0.866 14, 2048, 13, 23, 0, 0.706 14, 2048, 13, 0, 0, 0.719 14, 2061, 13, 23, 0, 0.626 14, 2061, 13, 0, 0, 0.626 12, 2048, 13, 23, 0, 0.667 12, 2048, 13, 0, 0, 0.667 12, 2061, 13, 23, 0, 0.744 12, 2061, 13, 0, 0, 0.742 12, 4081, 13, 23, 0, 0.745 12, 4081, 13, 0, 0, 0.743 14, 4081, 13, 23, 0, 0.601 14, 4081, 13, 0, 0, 0.582 14, 0, 1, 23, 0, 0.851 14, 0, 2, 0, 0, 0.839 15, 0, 14, 23, 0, 0.833 15, 0, 14, 0, 0, 0.815 15, 14, 14, 23, 0, 0.723 15, 14, 14, 0, 0, 0.719 13, 0, 14, 23, 0, 0.777 13, 0, 14, 0, 0, 0.779 13, 14, 14, 23, 0, 0.867 13, 14, 14, 0, 0, 0.867 15, 2048, 14, 23, 0, 0.701 15, 2048, 14, 0, 0, 0.718 15, 2062, 14, 23, 0, 0.628 15, 2062, 14, 0, 0, 0.622 13, 2048, 14, 23, 0, 0.667 13, 2048, 14, 0, 0, 0.667 13, 2062, 14, 23, 0, 0.743 13, 2062, 14, 0, 0, 0.743 13, 4081, 14, 23, 0, 0.744 13, 4081, 14, 0, 0, 0.741 15, 4081, 14, 23, 0, 0.568 15, 4081, 14, 0, 0, 0.562 15, 0, 1, 23, 0, 0.842 15, 0, 2, 0, 0, 0.841 16, 0, 15, 23, 0, 0.834 16, 0, 15, 0, 0, 0.831 16, 15, 15, 23, 0, 0.737 16, 15, 15, 0, 0, 0.715 14, 0, 15, 23, 0, 0.793 14, 0, 15, 0, 0, 0.792 14, 15, 15, 23, 0, 0.878 14, 15, 15, 0, 0, 0.876 16, 2048, 15, 23, 0, 0.702 16, 2048, 15, 0, 0, 0.697 16, 2063, 15, 23, 0, 0.615 16, 2063, 15, 0, 0, 0.622 14, 2048, 15, 23, 0, 0.689 14, 2048, 15, 0, 0, 0.688 14, 2063, 15, 23, 0, 0.76 14, 2063, 15, 0, 0, 0.759 14, 4081, 15, 23, 0, 0.756 14, 4081, 15, 0, 0, 0.763 16, 4081, 15, 23, 0, 0.887 
16, 4081, 15, 0, 0, 0.888 16, 0, 1, 23, 0, 0.84 16, 0, 2, 0, 0, 0.848 17, 0, 16, 23, 0, 0.833 17, 0, 16, 0, 0, 0.845 17, 16, 16, 23, 0, 0.616 17, 16, 16, 0, 0, 0.603 15, 0, 16, 23, 0, 0.829 15, 0, 16, 0, 0, 0.829 15, 16, 16, 23, 0, 0.907 15, 16, 16, 0, 0, 0.909 17, 2048, 16, 23, 0, 0.71 17, 2048, 16, 0, 0, 0.69 17, 2064, 16, 23, 0, 0.615 17, 2064, 16, 0, 0, 0.588 15, 2048, 16, 23, 0, 0.686 15, 2048, 16, 0, 0, 0.687 15, 2064, 16, 23, 0, 0.755 15, 2064, 16, 0, 0, 0.756 15, 4081, 16, 23, 0, 0.76 15, 4081, 16, 0, 0, 0.755 17, 4081, 16, 23, 0, 0.889 17, 4081, 16, 0, 0, 0.889 17, 0, 1, 23, 0, 0.849 17, 0, 2, 0, 0, 0.855 18, 0, 17, 23, 0, 0.83 18, 0, 17, 0, 0, 0.826 18, 17, 17, 23, 0, 0.612 18, 17, 17, 0, 0, 0.597 16, 0, 17, 23, 0, 0.8 16, 0, 17, 0, 0, 0.805 16, 17, 17, 23, 0, 0.669 16, 17, 17, 0, 0, 0.669 18, 2048, 17, 23, 0, 0.707 18, 2048, 17, 0, 0, 0.71 18, 2065, 17, 23, 0, 0.607 18, 2065, 17, 0, 0, 0.588 16, 2048, 17, 23, 0, 0.687 16, 2048, 17, 0, 0, 0.686 16, 2065, 17, 23, 0, 0.669 16, 2065, 17, 0, 0, 0.67 16, 4081, 17, 23, 0, 0.986 16, 4081, 17, 0, 0, 0.982 18, 4081, 17, 23, 0, 0.889 18, 4081, 17, 0, 0, 0.889 18, 0, 1, 23, 0, 0.857 18, 0, 2, 0, 0, 0.853 19, 0, 18, 23, 0, 0.842 19, 0, 18, 0, 0, 0.817 19, 18, 18, 23, 0, 0.599 19, 18, 18, 0, 0, 0.593 17, 0, 18, 23, 0, 0.795 17, 0, 18, 0, 0, 0.8 17, 18, 18, 23, 0, 0.67 17, 18, 18, 0, 0, 0.669 19, 2048, 18, 23, 0, 0.707 19, 2048, 18, 0, 0, 0.704 19, 2066, 18, 23, 0, 0.588 19, 2066, 18, 0, 0, 0.611 17, 2048, 18, 23, 0, 0.687 17, 2048, 18, 0, 0, 0.686 17, 2066, 18, 23, 0, 0.67 17, 2066, 18, 0, 0, 0.671 17, 4081, 18, 23, 0, 0.982 17, 4081, 18, 0, 0, 0.98 19, 4081, 18, 23, 0, 0.889 19, 4081, 18, 0, 0, 0.889 19, 0, 1, 23, 0, 0.844 19, 0, 2, 0, 0, 0.847 20, 0, 19, 23, 0, 0.83 20, 0, 19, 0, 0, 0.836 20, 19, 19, 23, 0, 0.588 20, 19, 19, 0, 0, 0.61 18, 0, 19, 23, 0, 0.829 18, 0, 19, 0, 0, 0.835 18, 19, 19, 23, 0, 0.669 18, 19, 19, 0, 0, 0.67 20, 2048, 19, 23, 0, 0.691 20, 2048, 19, 0, 0, 0.707 20, 2067, 19, 23, 0, 0.626 20, 2067, 19, 0, 0, 0.611 18, 2048, 19, 23, 0, 0.686 18, 2048, 19, 0, 0, 0.687 18, 2067, 19, 23, 0, 0.669 18, 2067, 19, 0, 0, 0.669 18, 4081, 19, 23, 0, 0.982 18, 4081, 19, 0, 0, 0.98 20, 4081, 19, 23, 0, 0.889 20, 4081, 19, 0, 0, 0.889 20, 0, 1, 23, 0, 0.85 20, 0, 2, 0, 0, 0.838 21, 0, 20, 23, 0, 0.839 21, 0, 20, 0, 0, 0.824 21, 20, 20, 23, 0, 0.593 21, 20, 20, 0, 0, 0.612 19, 0, 20, 23, 0, 0.833 19, 0, 20, 0, 0, 0.83 19, 20, 20, 23, 0, 0.669 19, 20, 20, 0, 0, 0.669 21, 2048, 20, 23, 0, 0.7 21, 2048, 20, 0, 0, 0.72 21, 2068, 20, 23, 0, 0.611 21, 2068, 20, 0, 0, 0.597 19, 2048, 20, 23, 0, 0.687 19, 2048, 20, 0, 0, 0.687 19, 2068, 20, 23, 0, 0.669 19, 2068, 20, 0, 0, 0.668 19, 4081, 20, 23, 0, 0.98 19, 4081, 20, 0, 0, 0.98 21, 4081, 20, 23, 0, 0.889 21, 4081, 20, 0, 0, 0.889 21, 0, 1, 23, 0, 0.856 21, 0, 2, 0, 0, 0.845 22, 0, 21, 23, 0, 0.833 22, 0, 21, 0, 0, 0.83 22, 21, 21, 23, 0, 0.607 22, 21, 21, 0, 0, 0.602 20, 0, 21, 23, 0, 0.807 20, 0, 21, 0, 0, 0.807 20, 21, 21, 23, 0, 0.666 20, 21, 21, 0, 0, 0.669 22, 2048, 21, 23, 0, 0.71 22, 2048, 21, 0, 0, 0.723 22, 2069, 21, 23, 0, 0.602 22, 2069, 21, 0, 0, 0.597 20, 2048, 21, 23, 0, 0.688 20, 2048, 21, 0, 0, 0.689 20, 2069, 21, 23, 0, 0.67 20, 2069, 21, 0, 0, 0.668 20, 4081, 21, 23, 0, 0.982 20, 4081, 21, 0, 0, 0.983 22, 4081, 21, 23, 0, 0.889 22, 4081, 21, 0, 0, 0.889 22, 0, 1, 23, 0, 0.851 22, 0, 2, 0, 0, 0.837 23, 0, 22, 23, 0, 0.833 23, 0, 22, 0, 0, 0.834 23, 22, 22, 23, 0, 0.626 23, 22, 22, 0, 0, 0.603 21, 0, 22, 23, 0, 0.828 21, 0, 22, 0, 0, 0.823 21, 22, 22, 23, 0, 0.67 21, 22, 22, 0, 
0, 0.669 23, 2048, 22, 23, 0, 0.71 23, 2048, 22, 0, 0, 0.713 23, 2070, 22, 23, 0, 0.611 23, 2070, 22, 0, 0, 0.607 21, 2048, 22, 23, 0, 0.687 21, 2048, 22, 0, 0, 0.687 21, 2070, 22, 23, 0, 0.67 21, 2070, 22, 0, 0, 0.67 21, 4081, 22, 23, 0, 0.981 21, 4081, 22, 0, 0, 0.981 23, 4081, 22, 23, 0, 0.889 23, 4081, 22, 0, 0, 0.889 23, 0, 1, 23, 0, 0.852 23, 0, 2, 0, 0, 0.856 24, 0, 23, 23, 0, 0.83 24, 0, 23, 0, 0, 0.852 24, 23, 23, 23, 0, 0.595 24, 23, 23, 0, 0, 0.597 22, 0, 23, 23, 0, 0.846 22, 0, 23, 0, 0, 0.847 22, 23, 23, 23, 0, 0.673 22, 23, 23, 0, 0, 0.673 24, 2048, 23, 23, 0, 0.691 24, 2048, 23, 0, 0, 0.694 24, 2071, 23, 23, 0, 0.611 24, 2071, 23, 0, 0, 0.593 22, 2048, 23, 23, 0, 0.688 22, 2048, 23, 0, 0, 0.692 22, 2071, 23, 23, 0, 0.675 22, 2071, 23, 0, 0, 0.673 22, 4081, 23, 23, 0, 0.982 22, 4081, 23, 0, 0, 0.981 24, 4081, 23, 23, 0, 0.889 24, 4081, 23, 0, 0, 0.889 24, 0, 1, 23, 0, 0.84 24, 0, 2, 0, 0, 0.853 25, 0, 24, 23, 0, 0.823 25, 0, 24, 0, 0, 0.83 25, 24, 24, 23, 0, 0.593 25, 24, 24, 0, 0, 0.597 23, 0, 24, 23, 0, 0.815 23, 0, 24, 0, 0, 0.815 23, 24, 24, 23, 0, 0.669 23, 24, 24, 0, 0, 0.672 25, 2048, 24, 23, 0, 0.694 25, 2048, 24, 0, 0, 0.716 25, 2072, 24, 23, 0, 0.621 25, 2072, 24, 0, 0, 0.597 23, 2048, 24, 23, 0, 0.689 23, 2048, 24, 0, 0, 0.689 23, 2072, 24, 23, 0, 0.67 23, 2072, 24, 0, 0, 0.675 23, 4081, 24, 23, 0, 0.98 23, 4081, 24, 0, 0, 0.983 25, 4081, 24, 23, 0, 0.889 25, 4081, 24, 0, 0, 0.889 25, 0, 1, 23, 0, 0.847 25, 0, 2, 0, 0, 0.851 26, 0, 25, 23, 0, 0.825 26, 0, 25, 0, 0, 0.842 26, 25, 25, 23, 0, 0.616 26, 25, 25, 0, 0, 0.626 24, 0, 25, 23, 0, 0.817 24, 0, 25, 0, 0, 0.814 24, 25, 25, 23, 0, 0.676 24, 25, 25, 0, 0, 0.673 26, 2048, 25, 23, 0, 0.707 26, 2048, 25, 0, 0, 0.707 26, 2073, 25, 23, 0, 0.607 26, 2073, 25, 0, 0, 0.593 24, 2048, 25, 23, 0, 0.686 24, 2048, 25, 0, 0, 0.691 24, 2073, 25, 23, 0, 0.672 24, 2073, 25, 0, 0, 0.673 24, 4081, 25, 23, 0, 0.981 24, 4081, 25, 0, 0, 0.977 26, 4081, 25, 23, 0, 0.889 26, 4081, 25, 0, 0, 0.889 26, 0, 1, 23, 0, 0.842 26, 0, 2, 0, 0, 0.85 27, 0, 26, 23, 0, 0.83 27, 0, 26, 0, 0, 0.848 27, 26, 26, 23, 0, 0.607 27, 26, 26, 0, 0, 0.612 25, 0, 26, 23, 0, 0.828 25, 0, 26, 0, 0, 0.826 25, 26, 26, 23, 0, 0.675 25, 26, 26, 0, 0, 0.672 27, 2048, 26, 23, 0, 0.7 27, 2048, 26, 0, 0, 0.7 27, 2074, 26, 23, 0, 0.616 27, 2074, 26, 0, 0, 0.599 25, 2048, 26, 23, 0, 0.691 25, 2048, 26, 0, 0, 0.694 25, 2074, 26, 23, 0, 0.67 25, 2074, 26, 0, 0, 0.672 25, 4081, 26, 23, 0, 0.979 25, 4081, 26, 0, 0, 0.985 27, 4081, 26, 23, 0, 0.889 27, 4081, 26, 0, 0, 0.889 27, 0, 1, 23, 0, 0.854 27, 0, 2, 0, 0, 0.853 28, 0, 27, 23, 0, 0.827 28, 0, 27, 0, 0, 0.845 28, 27, 27, 23, 0, 0.583 28, 27, 27, 0, 0, 0.585 26, 0, 27, 23, 0, 0.844 26, 0, 27, 0, 0, 0.829 26, 27, 27, 23, 0, 0.673 26, 27, 27, 0, 0, 0.671 28, 2048, 27, 23, 0, 0.697 28, 2048, 27, 0, 0, 0.713 28, 2075, 27, 23, 0, 0.602 28, 2075, 27, 0, 0, 0.602 26, 2048, 27, 23, 0, 0.688 26, 2048, 27, 0, 0, 0.692 26, 2075, 27, 23, 0, 0.673 26, 2075, 27, 0, 0, 0.67 26, 4081, 27, 23, 0, 0.98 26, 4081, 27, 0, 0, 0.977 28, 4081, 27, 23, 0, 0.889 28, 4081, 27, 0, 0, 0.889 28, 0, 1, 23, 0, 0.837 28, 0, 2, 0, 0, 0.839 29, 0, 28, 23, 0, 0.811 29, 0, 28, 0, 0, 0.843 29, 28, 28, 23, 0, 0.618 29, 28, 28, 0, 0, 0.626 27, 0, 28, 23, 0, 0.839 27, 0, 28, 0, 0, 0.832 27, 28, 28, 23, 0, 0.674 27, 28, 28, 0, 0, 0.671 29, 2048, 28, 23, 0, 0.694 29, 2048, 28, 0, 0, 0.7 29, 2076, 28, 23, 0, 0.583 29, 2076, 28, 0, 0, 0.618 27, 2048, 28, 23, 0, 0.689 27, 2048, 28, 0, 0, 0.692 27, 2076, 28, 23, 0, 0.67 27, 2076, 28, 0, 0, 0.678 27, 4081, 28, 23, 0, 
[The remaining several hundred rows of raw memrchr benchmark output are elided for readability. Each row appears to give the benchmark parameters (length, alignment, position, seek character, invert_pos) followed by the New/Old timing ratio; the ratios run roughly from 0.53 to 1.17 and are summarized by the geometric means quoted in the commit messages.] ^ permalink raw reply [flat|nested] 82+ messages in thread
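The summary statistic quoted in these commit messages ("Geometric Mean of all benchmarks New / Old") is the geometric mean of per-configuration timing ratios like the ones tabulated above. A minimal C sketch of that computation follows; the sample ratios are copied from the table, but the program itself is illustrative and is not glibc's benchtest harness.

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* A handful of New/Old timing ratios copied from the table above.  */
      double ratios[] = { 0.978, 0.889, 0.849, 1.142, 0.857 };
      size_t n = sizeof ratios / sizeof ratios[0];
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (ratios[i]);
      /* Geometric mean = exp of the arithmetic mean of the logs.  */
      printf ("geomean New/Old: %.3f\n", exp (log_sum / n));
      return 0;
    }

Ratios below 1.0 mean the new implementation is faster for that configuration; the geometric mean weights each configuration equally regardless of its absolute runtime.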
* [PATCH v1 7/8] x86: Shrink code size of memchr-avx2.S 2022-06-03 4:42 [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (4 preceding siblings ...) 2022-06-03 4:42 ` [PATCH v1 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein @ 2022-06-03 4:42 ` Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 2022-06-03 4:51 ` [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 7 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 4:42 UTC (permalink / raw) To: libc-alpha This is not meant as a performance optimization. The previous code was far too liberal in aligning targets and wasted code size unnecessarily. The total code size saving is: 59 bytes There are no major changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 0.967 Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memchr-avx2.S | 109 +++++++++++---------- 2 files changed, 60 insertions(+), 50 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S index 87b076c7c4..c4d71938c5 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMCHR __memchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index 75bd7262e0..28a01280ec 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -57,7 +57,7 @@ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 5) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ # ifdef __ILP32__ @@ -87,12 +87,14 @@ ENTRY (MEMCHR) # endif testl %eax, %eax jz L(aligned_more) - tzcntl %eax, %eax + bsfl %eax, %eax addq %rdi, %rax - VZEROUPPER_RETURN +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + # ifndef USE_AS_RAWMEMCHR - .p2align 5 + .p2align 4 L(first_vec_x0): /* Check if first match was before length. */ tzcntl %eax, %eax @@ -100,58 +102,31 @@ L(first_vec_x0): /* NB: Multiply length by 4 to get byte count. */ sall $2, %edx # endif - xorl %ecx, %ecx + COND_VZEROUPPER + /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch + block. branch here as opposed to cmovcc is not that costly. Common + usage of memchr is to check if the return was NULL (if string was + known to contain CHAR user would use rawmemchr). This branch will be + highly correlated with the user branch and can be used by most + modern branch predictors to predict the user branch. */ cmpl %eax, %edx - leaq (%rdi, %rax), %rax - cmovle %rcx, %rax - VZEROUPPER_RETURN - -L(null): - xorl %eax, %eax - ret -# endif - .p2align 4 -L(cross_page_boundary): - /* Save pointer before aligning as its original value is - necessary for computer return address if byte is found or - adjusting length if it is not and this is memchr. */ - movq %rdi, %rcx - /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr - and rdi for rawmemchr. */ - orq $(VEC_SIZE - 1), %ALGN_PTR_REG - VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax # ifndef USE_AS_RAWMEMCHR - /* Calculate length until end of page (length checked for a - match).
*/ - leaq 1(%ALGN_PTR_REG), %rsi - subq %RRAW_PTR_REG, %rsi -# ifdef USE_AS_WMEMCHR - /* NB: Divide bytes by 4 to get wchar_t count. */ - shrl $2, %esi -# endif -# endif - /* Remove the leading bytes. */ - sarxl %ERAW_PTR_REG, %eax, %eax -# ifndef USE_AS_RAWMEMCHR - /* Check the end of data. */ - cmpq %rsi, %rdx - jbe L(first_vec_x0) + jle L(null) + addq %rdi, %rax + ret # endif - testl %eax, %eax - jz L(cross_page_continue) - tzcntl %eax, %eax - addq %RRAW_PTR_REG, %rax -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 + .p2align 4,, 10 L(first_vec_x1): - tzcntl %eax, %eax + bsfl %eax, %eax incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - +# ifndef USE_AS_RAWMEMCHR + /* First in aligning bytes here. */ +L(null): + xorl %eax, %eax + ret +# endif .p2align 4 L(first_vec_x2): tzcntl %eax, %eax @@ -340,7 +315,7 @@ L(first_vec_x1_check): incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - .p2align 4 + .p2align 4,, 6 L(set_zero_end): xorl %eax, %eax VZEROUPPER_RETURN @@ -428,5 +403,39 @@ L(last_vec_x3): VZEROUPPER_RETURN # endif + .p2align 4 +L(cross_page_boundary): + /* Save pointer before aligning as its original value is necessary for + computer return address if byte is found or adjusting length if it + is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for + rawmemchr. */ + andq $-VEC_SIZE, %ALGN_PTR_REG + VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Calculate length until end of page (length checked for a match). */ + leal VEC_SIZE(%ALGN_PTR_REG), %esi + subl %ERAW_PTR_REG, %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %esi +# endif +# endif + /* Remove the leading bytes. */ + sarxl %ERAW_PTR_REG, %eax, %eax +# ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ + cmpq %rsi, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(cross_page_continue) + bsfl %eax, %eax + addq %RRAW_PTR_REG, %rax + VZEROUPPER_RETURN + + END (MEMCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
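The comment in the patch above about using a branch instead of cmovcc in L(first_vec_x0) is easiest to see from the caller's side. Below is a minimal C sketch of the usage pattern the comment describes; handle_found and handle_missing are hypothetical functions, and the ':' seek byte is arbitrary — nothing here is glibc code.

    #include <stddef.h>
    #include <string.h>

    extern void handle_found (const char *p);
    extern void handle_missing (void);

    void
    scan (const char *buf, size_t len)
    {
      const char *p = memchr (buf, ':', len);
      /* Nearly every memchr caller branches on a NULL result like this;
         the library's internal "match past length" branch resolves the
         same way, so modern branch predictors learn the pair together.  */
      if (p == NULL)
        handle_missing ();
      else
        handle_found (p);
    }

A caller that knew the byte was present would use rawmemchr instead, which is why the NULL-check branch is assumed to dominate.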
* [PATCH v1 8/8] x86: Shrink code size of memchr-evex.S 2022-06-03 4:42 [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (5 preceding siblings ...) 2022-06-03 4:42 ` [PATCH v1 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein @ 2022-06-03 4:42 ` Noah Goldstein 2022-06-03 4:51 ` [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 7 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 4:42 UTC (permalink / raw) To: libc-alpha This is not meant as a performance optimization. The previous code was far too liberal in aligning targets and wasted code size unnecessarily. The total code size saving is: 32 bytes There are no significant changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 1.000 Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memchr-evex.S | 32 ++++++++++++++------------ 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S index cfaf02907d..ac705d66cb 100644 --- a/sysdeps/x86_64/multiarch/memchr-evex.S +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -88,7 +88,7 @@ # define PAGE_SIZE 4096 .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 6) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ test %RDX_LP, %RDX_LP @@ -131,22 +131,24 @@ L(zero): xorl %eax, %eax ret - .p2align 5 + .p2align 4 L(first_vec_x0): - /* Check if first match was before length. */ - tzcntl %eax, %eax - xorl %ecx, %ecx - cmpl %eax, %edx - leaq (%rdi, %rax, CHAR_SIZE), %rax - cmovle %rcx, %rax + /* Check if first match was before length. NB: tzcnt has false data- + dependency on destination. eax already had a data-dependency on esi + so this should have no affect here. */ + tzcntl %eax, %esi +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rsi, CHAR_SIZE), %rdi +# else + addq %rsi, %rdi +# endif + xorl %eax, %eax + cmpl %esi, %edx + cmovg %rdi, %rax ret -# else - /* NB: first_vec_x0 is 17 bytes which will leave - cross_page_boundary (which is relatively cold) close enough - to ideal alignment. So only realign L(cross_page_boundary) if - rawmemchr. */ - .p2align 4 # endif + + .p2align 4 L(cross_page_boundary): /* Save pointer before aligning as its original value is necessary for computer return address if byte is found or @@ -562,6 +564,6 @@ L(last_vec_x3): leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ret # endif - + /* 7 bytes from next cache line. */ END (MEMCHR) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 82+ messages in thread
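The rewritten L(first_vec_x0) tail in the patch above is dense; the following C rendering of the non-wmemchr path may help. This is only a sketch: __builtin_ctz stands in for tzcnt (and assumes a nonzero mask, unlike the hardware instruction, which yields the operand width for zero), and the names base, mask, and len_left are illustrative, not glibc's.

    #include <stddef.h>

    /* C sketch of the new L(first_vec_x0) return path: write the bit-scan
       result into a register already in the dependency chain, then select
       between the match pointer and NULL with a single cmovg.  */
    const char *
    first_vec_x0 (const char *base, unsigned int mask, size_t len_left)
    {
      unsigned int i = __builtin_ctz (mask);   /* tzcntl %eax, %esi */
      const char *hit = base + i;              /* addq   %rsi, %rdi */
      /* xorl %eax, %eax; cmpl %esi, %edx; cmovg %rdi, %rax */
      return len_left > i ? hit : NULL;
    }

Writing tzcnt's result to %esi rather than back to %eax sidesteps the instruction's false output dependency, since %esi is already consumed by the compare on the critical path.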
* Re: [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library 2022-06-03 4:42 [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein ` (6 preceding siblings ...) 2022-06-03 4:42 ` [PATCH v1 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein @ 2022-06-03 4:51 ` Noah Goldstein 7 siblings, 0 replies; 82+ messages in thread From: Noah Goldstein @ 2022-06-03 4:51 UTC (permalink / raw) To: GNU C Library Ignore this patchset. There is an issue with it. On Thu, Jun 2, 2022 at 11:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > This patch does not touch any existing code and is only meant to be a > tool for future patches so that simple source files can more easily be > maintained to target multiple VEC classes. > > There is no difference in the objdump of libc.so before and after this > patch. > --- > sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 33 +++++++++ > sysdeps/x86_64/multiarch/avx-vecs.h | 53 ++++++++++++++ > sysdeps/x86_64/multiarch/avx2-rtm-vecs.h | 33 +++++++++ > sysdeps/x86_64/multiarch/avx2-vecs.h | 30 ++++++++ > sysdeps/x86_64/multiarch/evex256-vecs.h | 50 +++++++++++++ > sysdeps/x86_64/multiarch/evex512-vecs.h | 49 +++++++++++++ > sysdeps/x86_64/multiarch/sse2-vecs.h | 48 +++++++++++++ > sysdeps/x86_64/multiarch/vec-macros.h | 90 ++++++++++++++++++++++++ > 8 files changed, 386 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/avx2-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h > create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h > > diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > new file mode 100644 > index 0000000000..c00b83ea0e > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h > @@ -0,0 +1,33 @@ > +/* Common config for AVX-RTM VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#ifndef _AVX_RTM_VECS_H > +#define _AVX_RTM_VECS_H 1 > + > +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ > + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > + > +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) > + > +#define SECTION(p) p##.avx.rtm > + > +#define USE_WITH_RTM 1 > +#include "avx-vecs.h" > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h > new file mode 100644 > index 0000000000..3b84d7e8b2 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/avx-vecs.h > @@ -0,0 +1,53 @@ > +/* Common config for AVX VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _AVX_VECS_H > +#define _AVX_VECS_H 1 > + > +#ifdef HAS_VEC > +# error "Multiple VEC configs included!" > +#endif > + > +#define HAS_VEC 1 > +#include "vec-macros.h" > + > +#ifndef USE_WITH_AVX2 > +# define USE_WITH_AVX 1 > +#endif > +/* Included by RTM version. */ > +#ifndef SECTION > +# define SECTION(p) p##.avx > +#endif > + > +#define VEC_SIZE 32 > +/* 4-byte mov instructions with AVX2. */ > +#define MOV_SIZE 4 > +/* 1 (ret) + 3 (vzeroupper). */ > +#define RET_SIZE 4 > +#define VZEROUPPER vzeroupper > + > +#define VMOVU vmovdqu > +#define VMOVA vmovdqa > +#define VMOVNT vmovntdq > + > +/* Often need to access xmm portion. */ > +#define VEC_xmm VEC_any_xmm > +#define VEC VEC_any_ymm > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > new file mode 100644 > index 0000000000..a5d46e8c66 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h > @@ -0,0 +1,33 @@ > +/* Common config for AVX2-RTM VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#ifndef _AVX2_RTM_VECS_H > +#define _AVX2_RTM_VECS_H 1 > + > +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ > + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > + > +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) > + > +#define SECTION(p) p##.avx.rtm > + > +#define USE_WITH_RTM 1 > +#include "avx2-vecs.h" > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/avx2-vecs.h b/sysdeps/x86_64/multiarch/avx2-vecs.h > new file mode 100644 > index 0000000000..4c029b4621 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/avx2-vecs.h > @@ -0,0 +1,30 @@ > +/* Common config for AVX2 VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _AVX2_VECS_H > +#define _AVX2_VECS_H 1 > + > +#define USE_WITH_AVX2 1 > +/* Included by RTM version. */ > +#ifndef SECTION > +# define SECTION(p) p##.avx > +#endif > +#include "avx-vecs.h" > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h > new file mode 100644 > index 0000000000..ed7a32b0ec > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h > @@ -0,0 +1,50 @@ > +/* Common config for EVEX256 VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _EVEX256_VECS_H > +#define _EVEX256_VECS_H 1 > + > +#ifdef HAS_VEC > +# error "Multiple VEC configs included!" > +#endif > + > +#define HAS_VEC 1 > +#include "vec-macros.h" > + > +#define USE_WITH_EVEX256 1 > +#ifndef SECTION > +# define SECTION(p) p##.evex > +#endif > + > +#define VEC_SIZE 32 > +/* 6-byte mov instructions with EVEX. */ > +#define MOV_SIZE 6 > +/* No vzeroupper needed. */ > +#define RET_SIZE 1 > +#define VZEROUPPER > + > +#define VMOVU vmovdqu64 > +#define VMOVA vmovdqa64 > +#define VMOVNT vmovntdq > + > +/* Often need to access xmm portion. 
*/ > +#define VEC_xmm VEC_hi_xmm > +#define VEC VEC_hi_ymm > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h > new file mode 100644 > index 0000000000..53597734fc > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h > @@ -0,0 +1,49 @@ > +/* Common config for EVEX512 VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _EVEX512_VECS_H > +#define _EVEX512_VECS_H 1 > + > +#ifdef HAS_VEC > +# error "Multiple VEC configs included!" > +#endif > + > +#define HAS_VEC 1 > +#include "vec-macros.h" > + > +#define USE_WITH_EVEX512 1 > +#define SECTION(p) p##.evex512 > + > +#define VEC_SIZE 64 > +/* 6-byte mov instructions with EVEX. */ > +#define MOV_SIZE 6 > +/* No vzeroupper needed. */ > +#define RET_SIZE 1 > +#define VZEROUPPER > + > +#define VMOVU vmovdqu64 > +#define VMOVA vmovdqa64 > +#define VMOVNT vmovntdq > + > +/* Often need to access xmm/ymm portion. */ > +#define VEC_xmm VEC_hi_xmm > +#define VEC_ymm VEC_hi_ymm > +#define VEC VEC_hi_zmm > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h > new file mode 100644 > index 0000000000..b645b93e3d > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/sse2-vecs.h > @@ -0,0 +1,48 @@ > +/* Common config for SSE2 VECs > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _SSE2_VECS_H > +#define _SSE2_VECS_H 1 > + > +#ifdef HAS_VEC > +# error "Multiple VEC configs included!" > +#endif > + > +#define HAS_VEC 1 > +#include "vec-macros.h" > + > +#define USE_WITH_SSE2 1 > +#define SECTION(p) p > + > +#define VEC_SIZE 16 > +/* 3-byte mov instructions with SSE2. */ > +#define MOV_SIZE 3 > +/* No vzeroupper needed. 
*/ > +#define RET_SIZE 1 > + > +#define VMOVU movups > +#define VMOVA movaps > +#define VMOVNT movntdq > +#define VZEROUPPER > + > +#define VEC_xmm VEC_any_xmm > +#define VEC VEC_any_xmm > + > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h > new file mode 100644 > index 0000000000..4dae4503c8 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/vec-macros.h > @@ -0,0 +1,90 @@ > +/* Macro helpers for VEC_{type}({vec_num}) > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _VEC_MACROS_H > +# define _VEC_MACROS_H 1 > + > +# ifndef HAS_VEC > +# error "Never include this file directly. Always include a vector config." > +# endif > + > +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same > + VEC(N) values. */ > +#define VEC_hi_xmm0 xmm16 > +#define VEC_hi_xmm1 xmm17 > +#define VEC_hi_xmm2 xmm18 > +#define VEC_hi_xmm3 xmm19 > +#define VEC_hi_xmm4 xmm20 > +#define VEC_hi_xmm5 xmm21 > +#define VEC_hi_xmm6 xmm22 > +#define VEC_hi_xmm7 xmm23 > +#define VEC_hi_xmm8 xmm24 > +#define VEC_hi_xmm9 xmm25 > +#define VEC_hi_xmm10 xmm26 > +#define VEC_hi_xmm11 xmm27 > +#define VEC_hi_xmm12 xmm28 > +#define VEC_hi_xmm13 xmm29 > +#define VEC_hi_xmm14 xmm30 > +#define VEC_hi_xmm15 xmm31 > + > +#define VEC_hi_ymm0 ymm16 > +#define VEC_hi_ymm1 ymm17 > +#define VEC_hi_ymm2 ymm18 > +#define VEC_hi_ymm3 ymm19 > +#define VEC_hi_ymm4 ymm20 > +#define VEC_hi_ymm5 ymm21 > +#define VEC_hi_ymm6 ymm22 > +#define VEC_hi_ymm7 ymm23 > +#define VEC_hi_ymm8 ymm24 > +#define VEC_hi_ymm9 ymm25 > +#define VEC_hi_ymm10 ymm26 > +#define VEC_hi_ymm11 ymm27 > +#define VEC_hi_ymm12 ymm28 > +#define VEC_hi_ymm13 ymm29 > +#define VEC_hi_ymm14 ymm30 > +#define VEC_hi_ymm15 ymm31 > + > +#define VEC_hi_zmm0 zmm16 > +#define VEC_hi_zmm1 zmm17 > +#define VEC_hi_zmm2 zmm18 > +#define VEC_hi_zmm3 zmm19 > +#define VEC_hi_zmm4 zmm20 > +#define VEC_hi_zmm5 zmm21 > +#define VEC_hi_zmm6 zmm22 > +#define VEC_hi_zmm7 zmm23 > +#define VEC_hi_zmm8 zmm24 > +#define VEC_hi_zmm9 zmm25 > +#define VEC_hi_zmm10 zmm26 > +#define VEC_hi_zmm11 zmm27 > +#define VEC_hi_zmm12 zmm28 > +#define VEC_hi_zmm13 zmm29 > +#define VEC_hi_zmm14 zmm30 > +#define VEC_hi_zmm15 zmm31 > + > +# define PRIMITIVE_VEC(vec, num) vec##num > + > +# define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) > +# define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) > +# define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) > + > +# define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) > +# define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) > +# define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) > + > +#endif > -- > 2.34.1 > ^ permalink raw reply [flat|nested] 82+ messages in thread
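To see how the vec-macros.h plumbing quoted above resolves, here is a minimal compilable sketch. The first four macro definitions are copied from the quoted patch (vec-macros.h and evex256-vecs.h); the STR stringization helpers and main are added only for the demo.

    #include <stdio.h>

    /* From vec-macros.h / evex256-vecs.h in the quoted patch.  */
    #define VEC_hi_ymm1 ymm17
    #define PRIMITIVE_VEC(vec, num) vec##num
    #define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i)
    #define VEC VEC_hi_ymm

    /* Demo-only stringization helpers.  */
    #define STR_(x) #x
    #define STR(x) STR_(x)

    int
    main (void)
    {
      /* VEC(1) -> VEC_hi_ymm(1) -> VEC_hi_ymm1 -> ymm17.  */
      puts (STR (VEC (1)));   /* prints "ymm17" */
      return 0;
    }

The token-pasting indirection lets the same assembly source name VEC(1) and have it land on xmm1/ymm1 for SSE2/AVX2 builds but on the EVEX-only upper bank (ymm17 here) for EVEX builds, which avoids the need for vzeroupper on return.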
end of thread, other threads:[~2022-07-14 2:43 UTC | newest] Thread overview: 82+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2022-06-03 4:42 [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein 2022-06-03 23:12 ` H.J. Lu 2022-06-03 23:33 ` Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 5/8] x86: Optimize memrchr-evex.S Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein 2022-06-03 20:04 ` [PATCH v2 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 2022-06-03 23:09 ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library H.J. Lu 2022-06-03 23:49 ` Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 " Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein 2022-06-06 21:30 ` H.J. Lu 2022-06-06 22:38 ` Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 5/8] x86: Optimize memrchr-evex.S Noah Goldstein 2022-06-03 23:49 ` [PATCH v3 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein 2022-06-03 23:50 ` [PATCH v3 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein 2022-06-03 23:50 ` [PATCH v3 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein 2022-06-07 2:45 ` H.J. Lu 2022-07-14 2:12 ` Sunil Pandey 2022-06-06 22:37 ` [PATCH v4 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein 2022-06-07 2:44 ` H.J. Lu 2022-06-07 4:10 ` Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 5/8] x86: Optimize memrchr-evex.S Noah Goldstein 2022-06-07 2:41 ` H.J. Lu 2022-06-07 4:09 ` Noah Goldstein 2022-06-07 4:12 ` Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein 2022-06-07 2:35 ` H.J. 
Lu 2022-06-07 4:06 ` Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein 2022-06-06 22:37 ` [PATCH v4 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 5/8] x86: Optimize memrchr-evex.S Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein 2022-06-07 4:05 ` [PATCH v5 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 2022-06-07 4:11 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein 2022-06-07 4:11 ` [PATCH v6 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein 2022-06-07 4:11 ` [PATCH v6 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein 2022-06-07 18:03 ` H.J. Lu 2022-06-07 4:11 ` [PATCH v6 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein 2022-06-07 18:04 ` H.J. Lu 2022-07-14 2:19 ` Sunil Pandey 2022-06-07 4:11 ` [PATCH v6 5/8] x86: Optimize memrchr-evex.S Noah Goldstein 2022-06-07 18:21 ` H.J. Lu 2022-07-14 2:21 ` Sunil Pandey 2022-06-07 4:11 ` [PATCH v6 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein 2022-06-07 18:17 ` H.J. Lu 2022-07-14 2:26 ` Sunil Pandey 2022-07-14 2:43 ` Noah Goldstein 2022-06-07 4:11 ` [PATCH v6 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein 2022-06-07 18:18 ` H.J. Lu 2022-07-14 2:31 ` Sunil Pandey 2022-07-14 2:41 ` Noah Goldstein 2022-06-07 4:11 ` [PATCH v6 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 2022-06-07 18:19 ` H.J. Lu 2022-07-14 2:32 ` Sunil Pandey 2022-06-07 18:04 ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library H.J. Lu 2022-07-14 2:07 ` Sunil Pandey 2022-06-03 4:42 ` [PATCH v1 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein 2022-06-03 4:47 ` Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 5/8] x86: Optimize memrchr-evex.S Noah Goldstein 2022-06-03 4:49 ` Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein 2022-06-03 4:50 ` Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein 2022-06-03 4:42 ` [PATCH v1 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein 2022-06-03 4:51 ` [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein