public inbox for libc-alpha@sourceware.org
* [PATCH v1 1/3] x86: Update evex256/512 vec macros
@ 2022-10-14 16:40 Noah Goldstein
  2022-10-14 16:40 ` [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
                   ` (10 more replies)
  0 siblings, 11 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 16:40 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

1) Only define SECTION(p) if there is not a previous definition.
2) Add a `VEC_lo` definition for the proper register width but in the
   ymm/zmm0-15 range.
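
For illustration (a sketch assuming the register mapping in
evex-vecs-common.h), with evex512-vecs.h included the macros are
expected to expand along the lines of:
```
VEC(1)    -> VEC_zmm(1)     -> %zmm17   (EVEX-only zmm16-31 range)
VEC_lo(1) -> VEC_any_zmm(1) -> %zmm1    (zmm0-15 range)
```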

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/evex256-vecs.h | 7 +++++--
 sysdeps/x86_64/multiarch/evex512-vecs.h | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
index 222ba46dc7..4fccabd4b8 100644
--- a/sysdeps/x86_64/multiarch/evex256-vecs.h
+++ b/sysdeps/x86_64/multiarch/evex256-vecs.h
@@ -28,8 +28,11 @@
 #include "evex-vecs-common.h"
 
 #define USE_WITH_EVEX256	1
-#define SECTION(p)			p##.evex
 
-#define VEC					VEC_ymm
+#ifndef SECTION
+# define SECTION(p)			p##.evex
+#endif
 
+#define VEC					VEC_ymm
+#define VEC_lo				VEC_any_ymm
 #endif
diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
index d1784d5368..fecc2d3925 100644
--- a/sysdeps/x86_64/multiarch/evex512-vecs.h
+++ b/sysdeps/x86_64/multiarch/evex512-vecs.h
@@ -28,8 +28,11 @@
 #include "evex-vecs-common.h"
 
 #define USE_WITH_EVEX512	1
-#define SECTION(p)			p##.evex512
 
-#define VEC					VEC_zmm
+#ifndef SECTION
+# define SECTION(p)			p##.evex512
+#endif
 
+#define VEC					VEC_zmm
+#define VEC_lo				VEC_any_zmm
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 16:40 [PATCH v1 1/3] x86: Update evex256/512 vec macros Noah Goldstein
@ 2022-10-14 16:40 ` Noah Goldstein
  2022-10-14 18:02   ` H.J. Lu
  2022-10-14 16:40 ` [PATCH v1 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 16:40 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

This is to make it easier to do things like:
```
vpcmpb %VEC(0), %VEC(1), %k0
kmov{d|q} %k0, %{eax|rax}
test %{eax|rax}
```

It adds macros s.t. any GPR can get the proper width with:
    `V{upper_case_GPR_name}`

and any mask insn can get the proper width with:
    `{mask_insn_without_postfix}V`
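
For example (a sketch of the intended expansion; the concrete names
come from the generated reg-macros.h below):
```
/* With VEC_SIZE == 64 (REG_WIDTH == 64):  */
	kmovV	%k0, %VRAX	->	kmovq	%k0, %rax
/* With VEC_SIZE == 32 (REG_WIDTH == 32):  */
	kmovV	%k0, %VRAX	->	kmovd	%k0, %eax
```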

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/reg-macros.h         | 337 ++++++++++++++++++
 .../multiarch/scripts/gen-reg-map-macros.py   |  97 +++++
 2 files changed, 434 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
 create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py

diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
new file mode 100644
index 0000000000..c4d7f57b66
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/reg-macros.h
@@ -0,0 +1,337 @@
+/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py.
+
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _REG_MAP_MACROS_H
+#define _REG_MAP_MACROS_H	1
+
+#define rax_8	al
+#define eax_8	al
+#define ax_8	al
+#define al_8	al
+#define rax_16	ax
+#define eax_16	ax
+#define ax_16	ax
+#define al_16	ax
+#define rax_32	eax
+#define eax_32	eax
+#define ax_32	eax
+#define al_32	eax
+#define rax_64	rax
+#define eax_64	rax
+#define ax_64	rax
+#define al_64	rax
+#define rbx_8	bl
+#define ebx_8	bl
+#define bx_8	bl
+#define bl_8	bl
+#define rbx_16	bx
+#define ebx_16	bx
+#define bx_16	bx
+#define bl_16	bx
+#define rbx_32	ebx
+#define ebx_32	ebx
+#define bx_32	ebx
+#define bl_32	ebx
+#define rbx_64	rbx
+#define ebx_64	rbx
+#define bx_64	rbx
+#define bl_64	rbx
+#define rcx_8	cl
+#define ecx_8	cl
+#define cx_8	cl
+#define cl_8	cl
+#define rcx_16	cx
+#define ecx_16	cx
+#define cx_16	cx
+#define cl_16	cx
+#define rcx_32	ecx
+#define ecx_32	ecx
+#define cx_32	ecx
+#define cl_32	ecx
+#define rcx_64	rcx
+#define ecx_64	rcx
+#define cx_64	rcx
+#define cl_64	rcx
+#define rdx_8	dl
+#define edx_8	dl
+#define dx_8	dl
+#define dl_8	dl
+#define rdx_16	dx
+#define edx_16	dx
+#define dx_16	dx
+#define dl_16	dx
+#define rdx_32	edx
+#define edx_32	edx
+#define dx_32	edx
+#define dl_32	edx
+#define rdx_64	rdx
+#define edx_64	rdx
+#define dx_64	rdx
+#define dl_64	rdx
+#define rbp_8	bpl
+#define ebp_8	bpl
+#define bp_8	bpl
+#define bpl_8	bpl
+#define rbp_16	bp
+#define ebp_16	bp
+#define bp_16	bp
+#define bpl_16	bp
+#define rbp_32	ebp
+#define ebp_32	ebp
+#define bp_32	ebp
+#define bpl_32	ebp
+#define rbp_64	rbp
+#define ebp_64	rbp
+#define bp_64	rbp
+#define bpl_64	rbp
+#define rsp_8	spl
+#define esp_8	spl
+#define sp_8	spl
+#define spl_8	spl
+#define rsp_16	sp
+#define esp_16	sp
+#define sp_16	sp
+#define spl_16	sp
+#define rsp_32	esp
+#define esp_32	esp
+#define sp_32	esp
+#define spl_32	esp
+#define rsp_64	rsp
+#define esp_64	rsp
+#define sp_64	rsp
+#define spl_64	rsp
+#define rsi_8	sil
+#define esi_8	sil
+#define si_8	sil
+#define sil_8	sil
+#define rsi_16	si
+#define esi_16	si
+#define si_16	si
+#define sil_16	si
+#define rsi_32	esi
+#define esi_32	esi
+#define si_32	esi
+#define sil_32	esi
+#define rsi_64	rsi
+#define esi_64	rsi
+#define si_64	rsi
+#define sil_64	rsi
+#define rdi_8	dil
+#define edi_8	dil
+#define di_8	dil
+#define dil_8	dil
+#define rdi_16	di
+#define edi_16	di
+#define di_16	di
+#define dil_16	di
+#define rdi_32	edi
+#define edi_32	edi
+#define di_32	edi
+#define dil_32	edi
+#define rdi_64	rdi
+#define edi_64	rdi
+#define di_64	rdi
+#define dil_64	rdi
+#define r8_8	r8b
+#define r8d_8	r8b
+#define r8w_8	r8b
+#define r8b_8	r8b
+#define r8_16	r8w
+#define r8d_16	r8w
+#define r8w_16	r8w
+#define r8b_16	r8w
+#define r8_32	r8d
+#define r8d_32	r8d
+#define r8w_32	r8d
+#define r8b_32	r8d
+#define r8_64	r8
+#define r8d_64	r8
+#define r8w_64	r8
+#define r8b_64	r8
+#define r9_8	r9b
+#define r9d_8	r9b
+#define r9w_8	r9b
+#define r9b_8	r9b
+#define r9_16	r9w
+#define r9d_16	r9w
+#define r9w_16	r9w
+#define r9b_16	r9w
+#define r9_32	r9d
+#define r9d_32	r9d
+#define r9w_32	r9d
+#define r9b_32	r9d
+#define r9_64	r9
+#define r9d_64	r9
+#define r9w_64	r9
+#define r9b_64	r9
+#define r10_8	r10b
+#define r10d_8	r10b
+#define r10w_8	r10b
+#define r10b_8	r10b
+#define r10_16	r10w
+#define r10d_16	r10w
+#define r10w_16	r10w
+#define r10b_16	r10w
+#define r10_32	r10d
+#define r10d_32	r10d
+#define r10w_32	r10d
+#define r10b_32	r10d
+#define r10_64	r10
+#define r10d_64	r10
+#define r10w_64	r10
+#define r10b_64	r10
+#define r11_8	r11b
+#define r11d_8	r11b
+#define r11w_8	r11b
+#define r11b_8	r11b
+#define r11_16	r11w
+#define r11d_16	r11w
+#define r11w_16	r11w
+#define r11b_16	r11w
+#define r11_32	r11d
+#define r11d_32	r11d
+#define r11w_32	r11d
+#define r11b_32	r11d
+#define r11_64	r11
+#define r11d_64	r11
+#define r11w_64	r11
+#define r11b_64	r11
+#define r12_8	r12b
+#define r12d_8	r12b
+#define r12w_8	r12b
+#define r12b_8	r12b
+#define r12_16	r12w
+#define r12d_16	r12w
+#define r12w_16	r12w
+#define r12b_16	r12w
+#define r12_32	r12d
+#define r12d_32	r12d
+#define r12w_32	r12d
+#define r12b_32	r12d
+#define r12_64	r12
+#define r12d_64	r12
+#define r12w_64	r12
+#define r12b_64	r12
+#define r13_8	r13b
+#define r13d_8	r13b
+#define r13w_8	r13b
+#define r13b_8	r13b
+#define r13_16	r13w
+#define r13d_16	r13w
+#define r13w_16	r13w
+#define r13b_16	r13w
+#define r13_32	r13d
+#define r13d_32	r13d
+#define r13w_32	r13d
+#define r13b_32	r13d
+#define r13_64	r13
+#define r13d_64	r13
+#define r13w_64	r13
+#define r13b_64	r13
+#define r14_8	r14b
+#define r14d_8	r14b
+#define r14w_8	r14b
+#define r14b_8	r14b
+#define r14_16	r14w
+#define r14d_16	r14w
+#define r14w_16	r14w
+#define r14b_16	r14w
+#define r14_32	r14d
+#define r14d_32	r14d
+#define r14w_32	r14d
+#define r14b_32	r14d
+#define r14_64	r14
+#define r14d_64	r14
+#define r14w_64	r14
+#define r14b_64	r14
+#define r15_8	r15b
+#define r15d_8	r15b
+#define r15w_8	r15b
+#define r15b_8	r15b
+#define r15_16	r15w
+#define r15d_16	r15w
+#define r15w_16	r15w
+#define r15b_16	r15w
+#define r15_32	r15d
+#define r15d_32	r15d
+#define r15w_32	r15d
+#define r15b_32	r15d
+#define r15_64	r15
+#define r15d_64	r15
+#define r15w_64	r15
+#define r15b_64	r15
+
+#define VRAX	VGPR(rax)
+#define VRBX	VGPR(rbx)
+#define VRCX	VGPR(rcx)
+#define VRDX	VGPR(rdx)
+#define VRBP	VGPR(rbp)
+#define VRSP	VGPR(rsp)
+#define VRSI	VGPR(rsi)
+#define VRDI	VGPR(rdi)
+#define VR8	VGPR(r8)
+#define VR9	VGPR(r9)
+#define VR10	VGPR(r10)
+#define VR11	VGPR(r11)
+#define VR12	VGPR(r12)
+#define VR13	VGPR(r13)
+#define VR14	VGPR(r14)
+#define VR15	VGPR(r15)
+
+#define kmov_8	kmovb
+#define kmov_16	kmovw
+#define kmov_32	kmovd
+#define kmov_64	kmovq
+#define kortest_8	kortestb
+#define kortest_16	kortestw
+#define kortest_32	kortestd
+#define kortest_64	kortestq
+#define kor_8	korb
+#define kor_16	korw
+#define kor_32	kord
+#define kor_64	korq
+#define ktest_8	ktestb
+#define ktest_16	ktestw
+#define ktest_32	ktestd
+#define ktest_64	ktestq
+#define kand_8	kandb
+#define kand_16	kandw
+#define kand_32	kandd
+#define kand_64	kandq
+#define kxor_8	kxorb
+#define kxor_16	kxorw
+#define kxor_32	kxord
+#define kxor_64	kxorq
+
+#define kmovV 	VKINSN_SZ(kmov, REG_WIDTH)
+#define kortestV 	VKINSN_SZ(kortest, REG_WIDTH)
+#define korV 	VKINSN_SZ(kor, REG_WIDTH)
+#define ktestV 	VKINSN_SZ(ktest, REG_WIDTH)
+#define kandV 	VKINSN_SZ(kand, REG_WIDTH)
+#define kxorV 	VKINSN_SZ(kxor, REG_WIDTH)
+
+#ifndef REG_WIDTH
+#define REG_WIDTH VEC_SIZE
+#endif
+#define PRIM_VGPR_SZ(reg_name, reg_size)	reg_name##_##reg_size
+#define VGPR_SZ(reg_name, reg_size)	PRIM_VGPR_SZ(reg_name, reg_size)
+#define VGPR(reg_name)	VGPR_SZ(reg_name, REG_WIDTH)
+#define VKINSN_SZ(insn, reg_size)	PRIM_VGPR_SZ(insn, reg_size)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
new file mode 100644
index 0000000000..5b04e89ecb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
@@ -0,0 +1,97 @@
+#!/usr/bin/python3
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+"""Generate macros for getting GPR name of a certain size
+
+Inputs: None
+Output: Prints header file to stdout
+
+API:
+    VGPR(reg_name)
+        - Get register name VEC_SIZE component of `reg_name`
+    VGPR_SZ(reg_name, reg_size)
+        - Get register name `reg_size` component of `reg_name`
+"""
+
+import sys
+from datetime import datetime
+
+registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
+             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
+             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
+             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
+             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
+             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
+             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
+             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
+
+mask_insns = ["kmov", "kortest", "kor", "ktest", "kand", "kxor"]
+mask_insns_ext = ["b", "w", "d", "q"]
+
+cr = """
+   Copyright (C) {} Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+"""
+
+print("/* This file was generated by: {}.".format(sys.argv[0]))
+print(cr.format(datetime.today().year))
+
+print("#ifndef _REG_MAP_MACROS_H")
+print("#define _REG_MAP_MACROS_H\t1\n")
+for reg in registers:
+    for i in range(0, 4):
+        for j in range(0, 4):
+            print("#define {}_{}\t{}".format(reg[j], 8 << i, reg[3 - i]))
+
+print("")
+for reg in registers:
+    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
+
+print("")
+for mask_insn in mask_insns:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
+                                           mask_insns_ext[i]))
+
+print("")
+for mask_insn in mask_insns:
+    print("#define {}V \tVKINSN_SZ({}, REG_WIDTH)".format(mask_insn, mask_insn))
+print("")
+
+print("#ifndef REG_WIDTH")
+print("#define REG_WIDTH VEC_SIZE")
+print("#endif")
+print("#define PRIM_VGPR_SZ(reg_name, reg_size)\treg_name##_##reg_size")
+print("#define VGPR_SZ(reg_name, reg_size)\tPRIM_VGPR_SZ(reg_name, reg_size)")
+print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
+print("#define VKINSN_SZ(insn, reg_size)\tPRIM_VGPR_SZ(insn, reg_size)")
+
+print("\n#endif")
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v1 3/3] x86: Update strlen-evex-base to use new reg/vec macros.
  2022-10-14 16:40 [PATCH v1 1/3] x86: Update evex256/512 vec macros Noah Goldstein
  2022-10-14 16:40 ` [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
@ 2022-10-14 16:40 ` Noah Goldstein
  2022-10-14 17:31 ` [PATCH v1 1/3] x86: Update evex256/512 vec macros H.J. Lu
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 16:40 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

To avoid duplicating the VMM / GPR / mask insn macros in all incoming
evex512 files, use the macros defined in 'reg-macros.h' and
'{vec}-macros.h'.

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 118 +++++++-------------
 sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
 2 files changed, 45 insertions(+), 77 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 418e9f8411..1be36ac48e 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -36,42 +36,10 @@
 #  define CHAR_SIZE	1
 # endif
 
-# define XMM0		xmm16
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# if VEC_SIZE == 64
-#  define KMOV		kmovq
-#  define KORTEST	kortestq
-#  define RAX		rax
-#  define RCX		rcx
-#  define RDX		rdx
-#  define SHR		shrq
-#  define TEXTSUFFIX	evex512
-#  define VMM0		zmm16
-#  define VMM1		zmm17
-#  define VMM2		zmm18
-#  define VMM3		zmm19
-#  define VMM4		zmm20
-#  define VMOVA		vmovdqa64
-# elif VEC_SIZE == 32
-/* Currently Unused.  */
-#  define KMOV		kmovd
-#  define KORTEST	kortestd
-#  define RAX		eax
-#  define RCX		ecx
-#  define RDX		edx
-#  define SHR		shrl
-#  define TEXTSUFFIX	evex256
-#  define VMM0		ymm16
-#  define VMM1		ymm17
-#  define VMM2		ymm18
-#  define VMM3		ymm19
-#  define VMM4		ymm20
-#  define VMOVA		vmovdqa32
-# endif
-
-	.section .text.TEXTSUFFIX, "ax", @progbits
+	.section SECTION(.text),"ax",@progbits
 /* Aligning entry point to 64 byte, provides better performance for
    one vector length string.  */
 ENTRY_P2ALIGN (STRLEN, 6)
@@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
 # endif
 
 	movl	%edi, %eax
-	vpxorq	%XMM0, %XMM0, %XMM0
+	vpxorq	%VEC_xmm(0), %VEC_xmm(0), %VEC_xmm(0)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(page_cross)
 
 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM0, %k0
-	KMOV	%k0, %RAX
-	test	%RAX, %RAX
+	VPCMP	$0, (%rdi), %VEC(0), %k0
+	kmovV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
@@ -120,7 +88,7 @@ L(align_more):
 	movq	%rax, %rdx
 	subq	%rdi, %rdx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RDX
+	shr	$2, %VRDX
 #  endif
 	/* At this point rdx contains [w]chars already compared.  */
 	subq	%rsi, %rdx
@@ -131,9 +99,9 @@ L(align_more):
 # endif
 
 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (%rax), %VEC(0), %k0
+	kmovV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
 # ifdef USE_AS_STRNLEN
@@ -141,9 +109,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, VEC_SIZE(%rax), %VEC(0), %k0
+	kmovV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
 # ifdef USE_AS_STRNLEN
@@ -151,9 +119,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VEC(0), %k0
+	kmovV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 
 # ifdef USE_AS_STRNLEN
@@ -161,9 +129,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VEC(0), %k0
+	kmovV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x4)
 
 # ifdef USE_AS_STRNLEN
@@ -179,7 +147,7 @@ L(align_more):
 # ifdef USE_AS_STRNLEN
 	subq	%rax, %rcx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RCX
+	shr	$2, %VRCX
 #  endif
 	/* rcx contains number of [w]char will be recompared due to
 	   alignment fixes.  rdx must be incremented by rcx to offset
@@ -199,42 +167,42 @@ L(loop_entry):
 # endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+	VMOVA	(VEC_SIZE * 4)(%rax), %VEC(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VEC(1), %VEC(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VEC(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VEC(3), %VEC(4)
 
-	VPTESTN	%VMM2, %VMM2, %k0
-	VPTESTN	%VMM4, %VMM4, %k1
+	VPTESTN	%VEC(2), %VEC(2), %k0
+	VPTESTN	%VEC(4), %VEC(4), %k1
 
 	subq	$-(VEC_SIZE * 4), %rax
-	KORTEST	%k0, %k1
+	kortestV	%k0, %k1
 	jz	L(loop)
 
-	VPTESTN	%VMM1, %VMM1, %k2
-	KMOV	%k2, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VEC(1), %VEC(1), %k2
+	kmovV	%k2, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
-	KMOV	%k0, %RCX
+	kmovV	%k0, %VRCX
 	/* At this point, if k0 is non zero, null char must be in the
 	   second vector.  */
-	test	%RCX, %RCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
-	VPTESTN	%VMM3, %VMM3, %k3
-	KMOV	%k3, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VEC(3), %VEC(3), %k3
+	kmovV	%k3, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 	/* At this point null [w]char must be in the fourth vector so no
 	   need to check.  */
-	KMOV	%k1, %RCX
+	kmovV	%k1, %VRCX
 
 	/* Fourth, third, second vector terminating are pretty much
 	   same, implemented this way to avoid branching and reuse code
 	   from pre loop exit condition.  */
 L(ret_vec_x4):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 3), %rax
@@ -250,7 +218,7 @@ L(ret_vec_x4):
 	ret
 
 L(ret_vec_x3):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 2), %rax
@@ -268,7 +236,7 @@ L(ret_vec_x3):
 L(ret_vec_x2):
 	subq	$-VEC_SIZE, %rax
 L(ret_vec_x1):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
@@ -289,13 +257,13 @@ L(page_cross):
 	/* ecx contains number of w[char] to be skipped as a result
 	   of address alignment.  */
 	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
-	KMOV	%k0, %RAX
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VEC(0), %k0
+	kmovV	%k0, %VRAX
 	/* Ignore number of character for alignment adjustment.  */
-	SHR	%cl, %RAX
+	shr	%cl, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
index 116f8981c8..dfd0a7821b 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -2,6 +2,6 @@
 # define STRLEN		__strlen_evex512
 #endif
 
-#define VEC_SIZE	64
-
+#include "evex512-vecs.h"
+#include "reg-macros.h"
 #include "strlen-evex-base.S"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v1 1/3] x86: Update evex256/512 vec macros
  2022-10-14 16:40 [PATCH v1 1/3] x86: Update evex256/512 vec macros Noah Goldstein
  2022-10-14 16:40 ` [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
  2022-10-14 16:40 ` [PATCH v1 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
@ 2022-10-14 17:31 ` H.J. Lu
  2022-10-14 18:01 ` [PATCH v2 " Noah Goldstein
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 72+ messages in thread
From: H.J. Lu @ 2022-10-14 17:31 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1) Only define SECTION(p) if there is not a previous definition.
> 2) Add a `VEC_lo` definition for the proper register width but in the
>    ymm/zmm0-15 range.
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/evex256-vecs.h | 7 +++++--
>  sysdeps/x86_64/multiarch/evex512-vecs.h | 7 +++++--
>  2 files changed, 10 insertions(+), 4 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
> index 222ba46dc7..4fccabd4b8 100644
> --- a/sysdeps/x86_64/multiarch/evex256-vecs.h
> +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h
> @@ -28,8 +28,11 @@
>  #include "evex-vecs-common.h"
>
>  #define USE_WITH_EVEX256       1
> -#define SECTION(p)                     p##.evex
>
> -#define VEC                                    VEC_ymm
> +#ifndef SECTION
> +# define SECTION(p)                    p##.evex
> +#endif
>
> +#define VEC                                    VEC_ymm
> +#define VEC_lo                         VEC_any_ymm
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
> index d1784d5368..fecc2d3925 100644
> --- a/sysdeps/x86_64/multiarch/evex512-vecs.h
> +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h
> @@ -28,8 +28,11 @@
>  #include "evex-vecs-common.h"
>
>  #define USE_WITH_EVEX512       1
> -#define SECTION(p)                     p##.evex512
>
> -#define VEC                                    VEC_zmm
> +#ifndef SECTION
> +# define SECTION(p)                    p##.evex512
> +#endif
>
> +#define VEC                                    VEC_zmm
> +#define VEC_lo                         VEC_any_zmm
>  #endif
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v2 1/3] x86: Update evex256/512 vec macros
  2022-10-14 16:40 [PATCH v1 1/3] x86: Update evex256/512 vec macros Noah Goldstein
                   ` (2 preceding siblings ...)
  2022-10-14 17:31 ` [PATCH v1 1/3] x86: Update evex256/512 vec macros H.J. Lu
@ 2022-10-14 18:01 ` Noah Goldstein
  2022-10-14 18:01   ` [PATCH v2 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
  2022-10-14 18:01   ` [PATCH v2 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
  2022-10-14 18:22 ` [PATCH v3 1/3] x86: Update evex256/512 vec macros Noah Goldstein
                   ` (6 subsequent siblings)
  10 siblings, 2 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 18:01 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

1) Only define SECTION(p) if there is not a previous definition.
2) Add a `VEC_lo` definition for the proper register width but in the
   ymm/zmm0-15 range.

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/evex256-vecs.h | 7 +++++--
 sysdeps/x86_64/multiarch/evex512-vecs.h | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
index 222ba46dc7..4fccabd4b8 100644
--- a/sysdeps/x86_64/multiarch/evex256-vecs.h
+++ b/sysdeps/x86_64/multiarch/evex256-vecs.h
@@ -28,8 +28,11 @@
 #include "evex-vecs-common.h"
 
 #define USE_WITH_EVEX256	1
-#define SECTION(p)			p##.evex
 
-#define VEC					VEC_ymm
+#ifndef SECTION
+# define SECTION(p)			p##.evex
+#endif
 
+#define VEC					VEC_ymm
+#define VEC_lo				VEC_any_ymm
 #endif
diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
index d1784d5368..fecc2d3925 100644
--- a/sysdeps/x86_64/multiarch/evex512-vecs.h
+++ b/sysdeps/x86_64/multiarch/evex512-vecs.h
@@ -28,8 +28,11 @@
 #include "evex-vecs-common.h"
 
 #define USE_WITH_EVEX512	1
-#define SECTION(p)			p##.evex512
 
-#define VEC					VEC_zmm
+#ifndef SECTION
+# define SECTION(p)			p##.evex512
+#endif
 
+#define VEC					VEC_zmm
+#define VEC_lo				VEC_any_zmm
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v2 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 18:01 ` [PATCH v2 " Noah Goldstein
@ 2022-10-14 18:01   ` Noah Goldstein
  2022-10-14 18:01   ` [PATCH v2 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
  1 sibling, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 18:01 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

This is to make it easier to do things like:
```
vpcmpb %VEC(0), %VEC(1), %k0
kmov{d|q} %k0, %{eax|rax}
test %{eax|rax}
```

It adds macros s.t. any GPR can get the proper width with:
    `V{upper_case_GPR_name}`

and any mask insn can get the proper width with:
    `{mask_insn_without_postfix}V`

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/reg-macros.h         | 351 ++++++++++++++++++
 .../multiarch/scripts/gen-reg-macros.py       | 112 ++++++
 2 files changed, 463 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
 create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py

diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
new file mode 100644
index 0000000000..14f0425245
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/reg-macros.h
@@ -0,0 +1,351 @@
+/* This file was generated by: gen-reg-macros.py.
+
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _REG_MACROS_H
+#define _REG_MACROS_H	1
+
+#define rax_8	al
+#define eax_8	al
+#define ax_8	al
+#define al_8	al
+#define rax_16	ax
+#define eax_16	ax
+#define ax_16	ax
+#define al_16	ax
+#define rax_32	eax
+#define eax_32	eax
+#define ax_32	eax
+#define al_32	eax
+#define rax_64	rax
+#define eax_64	rax
+#define ax_64	rax
+#define al_64	rax
+#define rbx_8	bl
+#define ebx_8	bl
+#define bx_8	bl
+#define bl_8	bl
+#define rbx_16	bx
+#define ebx_16	bx
+#define bx_16	bx
+#define bl_16	bx
+#define rbx_32	ebx
+#define ebx_32	ebx
+#define bx_32	ebx
+#define bl_32	ebx
+#define rbx_64	rbx
+#define ebx_64	rbx
+#define bx_64	rbx
+#define bl_64	rbx
+#define rcx_8	cl
+#define ecx_8	cl
+#define cx_8	cl
+#define cl_8	cl
+#define rcx_16	cx
+#define ecx_16	cx
+#define cx_16	cx
+#define cl_16	cx
+#define rcx_32	ecx
+#define ecx_32	ecx
+#define cx_32	ecx
+#define cl_32	ecx
+#define rcx_64	rcx
+#define ecx_64	rcx
+#define cx_64	rcx
+#define cl_64	rcx
+#define rdx_8	dl
+#define edx_8	dl
+#define dx_8	dl
+#define dl_8	dl
+#define rdx_16	dx
+#define edx_16	dx
+#define dx_16	dx
+#define dl_16	dx
+#define rdx_32	edx
+#define edx_32	edx
+#define dx_32	edx
+#define dl_32	edx
+#define rdx_64	rdx
+#define edx_64	rdx
+#define dx_64	rdx
+#define dl_64	rdx
+#define rbp_8	bpl
+#define ebp_8	bpl
+#define bp_8	bpl
+#define bpl_8	bpl
+#define rbp_16	bp
+#define ebp_16	bp
+#define bp_16	bp
+#define bpl_16	bp
+#define rbp_32	ebp
+#define ebp_32	ebp
+#define bp_32	ebp
+#define bpl_32	ebp
+#define rbp_64	rbp
+#define ebp_64	rbp
+#define bp_64	rbp
+#define bpl_64	rbp
+#define rsp_8	spl
+#define esp_8	spl
+#define sp_8	spl
+#define spl_8	spl
+#define rsp_16	sp
+#define esp_16	sp
+#define sp_16	sp
+#define spl_16	sp
+#define rsp_32	esp
+#define esp_32	esp
+#define sp_32	esp
+#define spl_32	esp
+#define rsp_64	rsp
+#define esp_64	rsp
+#define sp_64	rsp
+#define spl_64	rsp
+#define rsi_8	sil
+#define esi_8	sil
+#define si_8	sil
+#define sil_8	sil
+#define rsi_16	si
+#define esi_16	si
+#define si_16	si
+#define sil_16	si
+#define rsi_32	esi
+#define esi_32	esi
+#define si_32	esi
+#define sil_32	esi
+#define rsi_64	rsi
+#define esi_64	rsi
+#define si_64	rsi
+#define sil_64	rsi
+#define rdi_8	dil
+#define edi_8	dil
+#define di_8	dil
+#define dil_8	dil
+#define rdi_16	di
+#define edi_16	di
+#define di_16	di
+#define dil_16	di
+#define rdi_32	edi
+#define edi_32	edi
+#define di_32	edi
+#define dil_32	edi
+#define rdi_64	rdi
+#define edi_64	rdi
+#define di_64	rdi
+#define dil_64	rdi
+#define r8_8	r8b
+#define r8d_8	r8b
+#define r8w_8	r8b
+#define r8b_8	r8b
+#define r8_16	r8w
+#define r8d_16	r8w
+#define r8w_16	r8w
+#define r8b_16	r8w
+#define r8_32	r8d
+#define r8d_32	r8d
+#define r8w_32	r8d
+#define r8b_32	r8d
+#define r8_64	r8
+#define r8d_64	r8
+#define r8w_64	r8
+#define r8b_64	r8
+#define r9_8	r9b
+#define r9d_8	r9b
+#define r9w_8	r9b
+#define r9b_8	r9b
+#define r9_16	r9w
+#define r9d_16	r9w
+#define r9w_16	r9w
+#define r9b_16	r9w
+#define r9_32	r9d
+#define r9d_32	r9d
+#define r9w_32	r9d
+#define r9b_32	r9d
+#define r9_64	r9
+#define r9d_64	r9
+#define r9w_64	r9
+#define r9b_64	r9
+#define r10_8	r10b
+#define r10d_8	r10b
+#define r10w_8	r10b
+#define r10b_8	r10b
+#define r10_16	r10w
+#define r10d_16	r10w
+#define r10w_16	r10w
+#define r10b_16	r10w
+#define r10_32	r10d
+#define r10d_32	r10d
+#define r10w_32	r10d
+#define r10b_32	r10d
+#define r10_64	r10
+#define r10d_64	r10
+#define r10w_64	r10
+#define r10b_64	r10
+#define r11_8	r11b
+#define r11d_8	r11b
+#define r11w_8	r11b
+#define r11b_8	r11b
+#define r11_16	r11w
+#define r11d_16	r11w
+#define r11w_16	r11w
+#define r11b_16	r11w
+#define r11_32	r11d
+#define r11d_32	r11d
+#define r11w_32	r11d
+#define r11b_32	r11d
+#define r11_64	r11
+#define r11d_64	r11
+#define r11w_64	r11
+#define r11b_64	r11
+#define r12_8	r12b
+#define r12d_8	r12b
+#define r12w_8	r12b
+#define r12b_8	r12b
+#define r12_16	r12w
+#define r12d_16	r12w
+#define r12w_16	r12w
+#define r12b_16	r12w
+#define r12_32	r12d
+#define r12d_32	r12d
+#define r12w_32	r12d
+#define r12b_32	r12d
+#define r12_64	r12
+#define r12d_64	r12
+#define r12w_64	r12
+#define r12b_64	r12
+#define r13_8	r13b
+#define r13d_8	r13b
+#define r13w_8	r13b
+#define r13b_8	r13b
+#define r13_16	r13w
+#define r13d_16	r13w
+#define r13w_16	r13w
+#define r13b_16	r13w
+#define r13_32	r13d
+#define r13d_32	r13d
+#define r13w_32	r13d
+#define r13b_32	r13d
+#define r13_64	r13
+#define r13d_64	r13
+#define r13w_64	r13
+#define r13b_64	r13
+#define r14_8	r14b
+#define r14d_8	r14b
+#define r14w_8	r14b
+#define r14b_8	r14b
+#define r14_16	r14w
+#define r14d_16	r14w
+#define r14w_16	r14w
+#define r14b_16	r14w
+#define r14_32	r14d
+#define r14d_32	r14d
+#define r14w_32	r14d
+#define r14b_32	r14d
+#define r14_64	r14
+#define r14d_64	r14
+#define r14w_64	r14
+#define r14b_64	r14
+#define r15_8	r15b
+#define r15d_8	r15b
+#define r15w_8	r15b
+#define r15b_8	r15b
+#define r15_16	r15w
+#define r15d_16	r15w
+#define r15w_16	r15w
+#define r15b_16	r15w
+#define r15_32	r15d
+#define r15d_32	r15d
+#define r15w_32	r15d
+#define r15b_32	r15d
+#define r15_64	r15
+#define r15d_64	r15
+#define r15w_64	r15
+#define r15b_64	r15
+
+#define VRAX	VGPR(rax)
+#define VRBX	VGPR(rbx)
+#define VRCX	VGPR(rcx)
+#define VRDX	VGPR(rdx)
+#define VRBP	VGPR(rbp)
+#define VRSP	VGPR(rsp)
+#define VRSI	VGPR(rsi)
+#define VRDI	VGPR(rdi)
+#define VR8	VGPR(r8)
+#define VR9	VGPR(r9)
+#define VR10	VGPR(r10)
+#define VR11	VGPR(r11)
+#define VR12	VGPR(r12)
+#define VR13	VGPR(r13)
+#define VR14	VGPR(r14)
+#define VR15	VGPR(r15)
+
+#define kmov_8	kmovb
+#define kmov_16	kmovw
+#define kmov_32	kmovd
+#define kmov_64	kmovq
+#define kortest_8	kortestb
+#define kortest_16	kortestw
+#define kortest_32	kortestd
+#define kortest_64	kortestq
+#define kor_8	korb
+#define kor_16	korw
+#define kor_32	kord
+#define kor_64	korq
+#define ktest_8	ktestb
+#define ktest_16	ktestw
+#define ktest_32	ktestd
+#define ktest_64	ktestq
+#define kand_8	kandb
+#define kand_16	kandw
+#define kand_32	kandd
+#define kand_64	kandq
+#define kxor_8	kxorb
+#define kxor_16	kxorw
+#define kxor_32	kxord
+#define kxor_64	kxorq
+#define knot_8	knotb
+#define knot_16	knotw
+#define knot_32	knotd
+#define knot_64	knotq
+#define kxnor_8	kxnorb
+#define kxnor_16	kxnorw
+#define kxnor_32	kxnord
+#define kxnor_64	kxnorq
+#define kunpack_8	kunpackbw
+#define kunpack_16	kunpackwd
+#define kunpack_32	kunpackdq
+
+#define kmovV 	VKINSN_SZ(kmov, REG_WIDTH)
+#define kortestV 	VKINSN_SZ(kortest, REG_WIDTH)
+#define korV 	VKINSN_SZ(kor, REG_WIDTH)
+#define ktestV 	VKINSN_SZ(ktest, REG_WIDTH)
+#define kandV 	VKINSN_SZ(kand, REG_WIDTH)
+#define kxorV 	VKINSN_SZ(kxor, REG_WIDTH)
+#define knotV 	VKINSN_SZ(knot, REG_WIDTH)
+#define kxnorV 	VKINSN_SZ(kxnor, REG_WIDTH)
+#define kunpackV 	VKINSN_SZ(kunpack, REG_WIDTH)
+
+#ifndef REG_WIDTH
+# define REG_WIDTH VEC_SIZE
+#endif
+#define PRIM_VGPR_SZ(reg_name, reg_size)	reg_name##_##reg_size
+#define VGPR_SZ(reg_name, reg_size)	PRIM_VGPR_SZ(reg_name, reg_size)
+#define VGPR(reg_name)	VGPR_SZ(reg_name, REG_WIDTH)
+#define VKINSN_SZ(insn, reg_size)	PRIM_VGPR_SZ(insn, reg_size)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
new file mode 100644
index 0000000000..95bd0e708f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
@@ -0,0 +1,112 @@
+#!/usr/bin/python3
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+"""Generate macros for getting GPR name of a certain size
+
+Inputs: None
+Output: Prints header file to stdout
+
+API:
+    VGPR(reg_name)
+        - Get register name VEC_SIZE component of `reg_name`
+    VGPR_SZ(reg_name, reg_size)
+        - Get register name `reg_size` component of `reg_name`
+"""
+
+import sys
+from datetime import datetime
+
+registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
+             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
+             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
+             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
+             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
+             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
+             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
+             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
+
+mask_insns = [
+    "kmov",
+    "kortest",
+    "kor",
+    "ktest",
+    "kand",
+    "kxor",
+    "knot",
+    "kxnor",
+]
+mask_insns_ext = ["b", "w", "d", "q"]
+
+cr = """
+   Copyright (C) {} Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+"""
+
+print("/* This file was generated by: {}.".format(sys.argv[0]))
+print(cr.format(datetime.today().year))
+
+print("#ifndef _REG_MACROS_H")
+print("#define _REG_MACROS_H\t1\n")
+for reg in registers:
+    for i in range(0, 4):
+        for j in range(0, 4):
+            print("#define {}_{}\t{}".format(reg[j], 8 << i, reg[3 - i]))
+
+print("")
+for reg in registers:
+    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
+
+print("")
+for mask_insn in mask_insns:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
+                                           mask_insns_ext[i]))
+for i in range(0, 3):
+    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
+                                                   mask_insns_ext[i + 1]))
+mask_insns.append("kunpack")
+
+print("")
+
+for mask_insn in mask_insns:
+    print("#define {}V \tVKINSN_SZ({}, REG_WIDTH)".format(
+        mask_insn, mask_insn))
+print("")
+
+print("#ifndef REG_WIDTH")
+print("# define REG_WIDTH VEC_SIZE")
+print("#endif")
+print("#define PRIM_VGPR_SZ(reg_name, reg_size)\treg_name##_##reg_size")
+print("#define VGPR_SZ(reg_name, reg_size)\tPRIM_VGPR_SZ(reg_name, reg_size)")
+print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
+print("#define VKINSN_SZ(insn, reg_size)\tPRIM_VGPR_SZ(insn, reg_size)")
+
+print("\n#endif")
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v2 3/3] x86: Update strlen-evex-base to use new reg/vec macros.
  2022-10-14 18:01 ` [PATCH v2 " Noah Goldstein
  2022-10-14 18:01   ` [PATCH v2 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
@ 2022-10-14 18:01   ` Noah Goldstein
  1 sibling, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 18:01 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

To avoid duplicating the VMM / GPR / mask insn macros in all incoming
evex512 files, use the macros defined in 'reg-macros.h' and
'{vec}-macros.h'.

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 118 +++++++-------------
 sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
 2 files changed, 45 insertions(+), 77 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 418e9f8411..1be36ac48e 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -36,42 +36,10 @@
 #  define CHAR_SIZE	1
 # endif
 
-# define XMM0		xmm16
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# if VEC_SIZE == 64
-#  define KMOV		kmovq
-#  define KORTEST	kortestq
-#  define RAX		rax
-#  define RCX		rcx
-#  define RDX		rdx
-#  define SHR		shrq
-#  define TEXTSUFFIX	evex512
-#  define VMM0		zmm16
-#  define VMM1		zmm17
-#  define VMM2		zmm18
-#  define VMM3		zmm19
-#  define VMM4		zmm20
-#  define VMOVA		vmovdqa64
-# elif VEC_SIZE == 32
-/* Currently Unused.  */
-#  define KMOV		kmovd
-#  define KORTEST	kortestd
-#  define RAX		eax
-#  define RCX		ecx
-#  define RDX		edx
-#  define SHR		shrl
-#  define TEXTSUFFIX	evex256
-#  define VMM0		ymm16
-#  define VMM1		ymm17
-#  define VMM2		ymm18
-#  define VMM3		ymm19
-#  define VMM4		ymm20
-#  define VMOVA		vmovdqa32
-# endif
-
-	.section .text.TEXTSUFFIX, "ax", @progbits
+	.section SECTION(.text),"ax",@progbits
 /* Aligning entry point to 64 byte, provides better performance for
    one vector length string.  */
 ENTRY_P2ALIGN (STRLEN, 6)
@@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
 # endif
 
 	movl	%edi, %eax
-	vpxorq	%XMM0, %XMM0, %XMM0
+	vpxorq	%VEC_xmm(0), %VEC_xmm(0), %VEC_xmm(0)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(page_cross)
 
 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM0, %k0
-	KMOV	%k0, %RAX
-	test	%RAX, %RAX
+	VPCMP	$0, (%rdi), %VEC(0), %k0
+	kmovV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
@@ -120,7 +88,7 @@ L(align_more):
 	movq	%rax, %rdx
 	subq	%rdi, %rdx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RDX
+	shr	$2, %VRDX
 #  endif
 	/* At this point rdx contains [w]chars already compared.  */
 	subq	%rsi, %rdx
@@ -131,9 +99,9 @@ L(align_more):
 # endif
 
 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (%rax), %VEC(0), %k0
+	kmovV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
 # ifdef USE_AS_STRNLEN
@@ -141,9 +109,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, VEC_SIZE(%rax), %VEC(0), %k0
+	kmovV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
 # ifdef USE_AS_STRNLEN
@@ -151,9 +119,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VEC(0), %k0
+	kmovV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 
 # ifdef USE_AS_STRNLEN
@@ -161,9 +129,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VEC(0), %k0
+	kmovV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x4)
 
 # ifdef USE_AS_STRNLEN
@@ -179,7 +147,7 @@ L(align_more):
 # ifdef USE_AS_STRNLEN
 	subq	%rax, %rcx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RCX
+	shr	$2, %VRCX
 #  endif
 	/* rcx contains number of [w]char will be recompared due to
 	   alignment fixes.  rdx must be incremented by rcx to offset
@@ -199,42 +167,42 @@ L(loop_entry):
 # endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+	VMOVA	(VEC_SIZE * 4)(%rax), %VEC(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VEC(1), %VEC(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VEC(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VEC(3), %VEC(4)
 
-	VPTESTN	%VMM2, %VMM2, %k0
-	VPTESTN	%VMM4, %VMM4, %k1
+	VPTESTN	%VEC(2), %VEC(2), %k0
+	VPTESTN	%VEC(4), %VEC(4), %k1
 
 	subq	$-(VEC_SIZE * 4), %rax
-	KORTEST	%k0, %k1
+	kortestV	%k0, %k1
 	jz	L(loop)
 
-	VPTESTN	%VMM1, %VMM1, %k2
-	KMOV	%k2, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VEC(1), %VEC(1), %k2
+	kmovV	%k2, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
-	KMOV	%k0, %RCX
+	kmovV	%k0, %VRCX
 	/* At this point, if k0 is non zero, null char must be in the
 	   second vector.  */
-	test	%RCX, %RCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
-	VPTESTN	%VMM3, %VMM3, %k3
-	KMOV	%k3, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VEC(3), %VEC(3), %k3
+	kmovV	%k3, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 	/* At this point null [w]char must be in the fourth vector so no
 	   need to check.  */
-	KMOV	%k1, %RCX
+	kmovV	%k1, %VRCX
 
 	/* Fourth, third, second vector terminating are pretty much
 	   same, implemented this way to avoid branching and reuse code
 	   from pre loop exit condition.  */
 L(ret_vec_x4):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 3), %rax
@@ -250,7 +218,7 @@ L(ret_vec_x4):
 	ret
 
 L(ret_vec_x3):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 2), %rax
@@ -268,7 +236,7 @@ L(ret_vec_x3):
 L(ret_vec_x2):
 	subq	$-VEC_SIZE, %rax
 L(ret_vec_x1):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
@@ -289,13 +257,13 @@ L(page_cross):
 	/* ecx contains number of w[char] to be skipped as a result
 	   of address alignment.  */
 	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
-	KMOV	%k0, %RAX
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VEC(0), %k0
+	kmovV	%k0, %VRAX
 	/* Ignore number of character for alignment adjustment.  */
-	SHR	%cl, %RAX
+	shr	%cl, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
index 116f8981c8..dfd0a7821b 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -2,6 +2,6 @@
 # define STRLEN		__strlen_evex512
 #endif
 
-#define VEC_SIZE	64
-
+#include "evex512-vecs.h"
+#include "reg-macros.h"
 #include "strlen-evex-base.S"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 16:40 ` [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
@ 2022-10-14 18:02   ` H.J. Lu
  2022-10-14 18:26     ` Noah Goldstein
  0 siblings, 1 reply; 72+ messages in thread
From: H.J. Lu @ 2022-10-14 18:02 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This is to make it easier to do things like:
> ```
> vpcmpb %VEC(0), %VEC(1), %k0
> kmov{d|q} %k0, %{eax|rax}
> test %{eax|rax}
> ```

Since all these register macros are based on VEC_SIZE which is either 32
bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are
needed.  8-bit and 16-bit macros aren't needed.

> It adds macros s.t. any GPR can get the proper width with:
>     `V{upper_case_GPR_name}`
>
> and any mask insn can get the proper width with:
>     `{mask_insn_without_postfix}V`

All macros should be in upper case.

> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/reg-macros.h         | 337 ++++++++++++++++++
>  .../multiarch/scripts/gen-reg-map-macros.py   |  97 +++++
>  2 files changed, 434 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
>  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
>
> diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> new file mode 100644
> index 0000000000..c4d7f57b66
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/reg-macros.h

vreg-macros.h to indicate macros based on vector size.   Please
add comments to indicate that register macros are expanded based
on vector size.

> @@ -0,0 +1,337 @@
> +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py.
> +
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _REG_MAP_MACROS_H
> +#define _REG_MAP_MACROS_H      1
> +
> +#define rax_8  al
> +#define eax_8  al
> +#define ax_8   al
> +#define al_8   al
> +#define rax_16 ax
> +#define eax_16 ax
> +#define ax_16  ax
> +#define al_16  ax
> +#define rax_32 eax
> +#define eax_32 eax
> +#define ax_32  eax
> +#define al_32  eax
> +#define rax_64 rax
> +#define eax_64 rax
> +#define ax_64  rax
> +#define al_64  rax

Only rax_32 and rax_64 are needed.

> +#define rbx_8  bl
> +#define ebx_8  bl
> +#define bx_8   bl
> +#define bl_8   bl
> +#define rbx_16 bx
> +#define ebx_16 bx
> +#define bx_16  bx
> +#define bl_16  bx
> +#define rbx_32 ebx
> +#define ebx_32 ebx
> +#define bx_32  ebx
> +#define bl_32  ebx
> +#define rbx_64 rbx
> +#define ebx_64 rbx
> +#define bx_64  rbx
> +#define bl_64  rbx
> +#define rcx_8  cl
> +#define ecx_8  cl
> +#define cx_8   cl
> +#define cl_8   cl
> +#define rcx_16 cx
> +#define ecx_16 cx
> +#define cx_16  cx
> +#define cl_16  cx
> +#define rcx_32 ecx
> +#define ecx_32 ecx
> +#define cx_32  ecx
> +#define cl_32  ecx
> +#define rcx_64 rcx
> +#define ecx_64 rcx
> +#define cx_64  rcx
> +#define cl_64  rcx
> +#define rdx_8  dl
> +#define edx_8  dl
> +#define dx_8   dl
> +#define dl_8   dl
> +#define rdx_16 dx
> +#define edx_16 dx
> +#define dx_16  dx
> +#define dl_16  dx
> +#define rdx_32 edx
> +#define edx_32 edx
> +#define dx_32  edx
> +#define dl_32  edx
> +#define rdx_64 rdx
> +#define edx_64 rdx
> +#define dx_64  rdx
> +#define dl_64  rdx
> +#define rbp_8  bpl
> +#define ebp_8  bpl
> +#define bp_8   bpl
> +#define bpl_8  bpl
> +#define rbp_16 bp
> +#define ebp_16 bp
> +#define bp_16  bp
> +#define bpl_16 bp
> +#define rbp_32 ebp
> +#define ebp_32 ebp
> +#define bp_32  ebp
> +#define bpl_32 ebp
> +#define rbp_64 rbp
> +#define ebp_64 rbp
> +#define bp_64  rbp
> +#define bpl_64 rbp
> +#define rsp_8  spl
> +#define esp_8  spl
> +#define sp_8   spl
> +#define spl_8  spl
> +#define rsp_16 sp
> +#define esp_16 sp
> +#define sp_16  sp
> +#define spl_16 sp
> +#define rsp_32 esp
> +#define esp_32 esp
> +#define sp_32  esp
> +#define spl_32 esp
> +#define rsp_64 rsp
> +#define esp_64 rsp
> +#define sp_64  rsp
> +#define spl_64 rsp
> +#define rsi_8  sil
> +#define esi_8  sil
> +#define si_8   sil
> +#define sil_8  sil
> +#define rsi_16 si
> +#define esi_16 si
> +#define si_16  si
> +#define sil_16 si
> +#define rsi_32 esi
> +#define esi_32 esi
> +#define si_32  esi
> +#define sil_32 esi
> +#define rsi_64 rsi
> +#define esi_64 rsi
> +#define si_64  rsi
> +#define sil_64 rsi
> +#define rdi_8  dil
> +#define edi_8  dil
> +#define di_8   dil
> +#define dil_8  dil
> +#define rdi_16 di
> +#define edi_16 di
> +#define di_16  di
> +#define dil_16 di
> +#define rdi_32 edi
> +#define edi_32 edi
> +#define di_32  edi
> +#define dil_32 edi
> +#define rdi_64 rdi
> +#define edi_64 rdi
> +#define di_64  rdi
> +#define dil_64 rdi
> +#define r8_8   r8b
> +#define r8d_8  r8b
> +#define r8w_8  r8b
> +#define r8b_8  r8b
> +#define r8_16  r8w
> +#define r8d_16 r8w
> +#define r8w_16 r8w
> +#define r8b_16 r8w
> +#define r8_32  r8d
> +#define r8d_32 r8d
> +#define r8w_32 r8d
> +#define r8b_32 r8d
> +#define r8_64  r8
> +#define r8d_64 r8
> +#define r8w_64 r8
> +#define r8b_64 r8
> +#define r9_8   r9b
> +#define r9d_8  r9b
> +#define r9w_8  r9b
> +#define r9b_8  r9b
> +#define r9_16  r9w
> +#define r9d_16 r9w
> +#define r9w_16 r9w
> +#define r9b_16 r9w
> +#define r9_32  r9d
> +#define r9d_32 r9d
> +#define r9w_32 r9d
> +#define r9b_32 r9d
> +#define r9_64  r9
> +#define r9d_64 r9
> +#define r9w_64 r9
> +#define r9b_64 r9
> +#define r10_8  r10b
> +#define r10d_8 r10b
> +#define r10w_8 r10b
> +#define r10b_8 r10b
> +#define r10_16 r10w
> +#define r10d_16        r10w
> +#define r10w_16        r10w
> +#define r10b_16        r10w
> +#define r10_32 r10d
> +#define r10d_32        r10d
> +#define r10w_32        r10d
> +#define r10b_32        r10d
> +#define r10_64 r10
> +#define r10d_64        r10
> +#define r10w_64        r10
> +#define r10b_64        r10
> +#define r11_8  r11b
> +#define r11d_8 r11b
> +#define r11w_8 r11b
> +#define r11b_8 r11b
> +#define r11_16 r11w
> +#define r11d_16        r11w
> +#define r11w_16        r11w
> +#define r11b_16        r11w
> +#define r11_32 r11d
> +#define r11d_32        r11d
> +#define r11w_32        r11d
> +#define r11b_32        r11d
> +#define r11_64 r11
> +#define r11d_64        r11
> +#define r11w_64        r11
> +#define r11b_64        r11
> +#define r12_8  r12b
> +#define r12d_8 r12b
> +#define r12w_8 r12b
> +#define r12b_8 r12b
> +#define r12_16 r12w
> +#define r12d_16        r12w
> +#define r12w_16        r12w
> +#define r12b_16        r12w
> +#define r12_32 r12d
> +#define r12d_32        r12d
> +#define r12w_32        r12d
> +#define r12b_32        r12d
> +#define r12_64 r12
> +#define r12d_64        r12
> +#define r12w_64        r12
> +#define r12b_64        r12
> +#define r13_8  r13b
> +#define r13d_8 r13b
> +#define r13w_8 r13b
> +#define r13b_8 r13b
> +#define r13_16 r13w
> +#define r13d_16        r13w
> +#define r13w_16        r13w
> +#define r13b_16        r13w
> +#define r13_32 r13d
> +#define r13d_32        r13d
> +#define r13w_32        r13d
> +#define r13b_32        r13d
> +#define r13_64 r13
> +#define r13d_64        r13
> +#define r13w_64        r13
> +#define r13b_64        r13
> +#define r14_8  r14b
> +#define r14d_8 r14b
> +#define r14w_8 r14b
> +#define r14b_8 r14b
> +#define r14_16 r14w
> +#define r14d_16        r14w
> +#define r14w_16        r14w
> +#define r14b_16        r14w
> +#define r14_32 r14d
> +#define r14d_32        r14d
> +#define r14w_32        r14d
> +#define r14b_32        r14d
> +#define r14_64 r14
> +#define r14d_64        r14
> +#define r14w_64        r14
> +#define r14b_64        r14
> +#define r15_8  r15b
> +#define r15d_8 r15b
> +#define r15w_8 r15b
> +#define r15b_8 r15b
> +#define r15_16 r15w
> +#define r15d_16        r15w
> +#define r15w_16        r15w
> +#define r15b_16        r15w
> +#define r15_32 r15d
> +#define r15d_32        r15d
> +#define r15w_32        r15d
> +#define r15b_32        r15d
> +#define r15_64 r15
> +#define r15d_64        r15
> +#define r15w_64        r15
> +#define r15b_64        r15
> +
> +#define VRAX   VGPR(rax)
> +#define VRBX   VGPR(rbx)
> +#define VRCX   VGPR(rcx)
> +#define VRDX   VGPR(rdx)
> +#define VRBP   VGPR(rbp)
> +#define VRSP   VGPR(rsp)
> +#define VRSI   VGPR(rsi)
> +#define VRDI   VGPR(rdi)
> +#define VR8    VGPR(r8)
> +#define VR9    VGPR(r9)
> +#define VR10   VGPR(r10)
> +#define VR11   VGPR(r11)
> +#define VR12   VGPR(r12)
> +#define VR13   VGPR(r13)
> +#define VR14   VGPR(r14)
> +#define VR15   VGPR(r15)
> +
> +#define kmov_8 kmovb
> +#define kmov_16        kmovw
> +#define kmov_32        kmovd
> +#define kmov_64        kmovq

Only 32 and 64 are needed.

> +#define kortest_8      kortestb
> +#define kortest_16     kortestw
> +#define kortest_32     kortestd
> +#define kortest_64     kortestq
> +#define kor_8  korb
> +#define kor_16 korw
> +#define kor_32 kord
> +#define kor_64 korq
> +#define ktest_8        ktestb
> +#define ktest_16       ktestw
> +#define ktest_32       ktestd
> +#define ktest_64       ktestq
> +#define kand_8 kandb
> +#define kand_16        kandw
> +#define kand_32        kandd
> +#define kand_64        kandq
> +#define kxor_8 kxorb
> +#define kxor_16        kxorw
> +#define kxor_32        kxord
> +#define kxor_64        kxorq
> +
> +#define kmovV  VKINSN_SZ(kmov, REG_WIDTH)
> +#define kortestV       VKINSN_SZ(kortest, REG_WIDTH)
> +#define korV   VKINSN_SZ(kor, REG_WIDTH)
> +#define ktestV         VKINSN_SZ(ktest, REG_WIDTH)
> +#define kandV  VKINSN_SZ(kand, REG_WIDTH)
> +#define kxorV  VKINSN_SZ(kxor, REG_WIDTH)

#define VKINSN(op) VKINSN_SZ(op, REG_WIDTH)

> +
> +#ifndef REG_WIDTH
> +#define REG_WIDTH VEC_SIZE

Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH
can be dropped.

> +#endif
> +#define PRIM_VGPR_SZ(reg_name, reg_size)       reg_name##_##reg_size

This is used for both register and instruction.  How about

#define VPASTER(x,y) x##_##y


> +#define VGPR_SZ(reg_name, reg_size)    PRIM_VGPR_SZ(reg_name, reg_size)
> +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> +#define VKINSN_SZ(insn, reg_size)      PRIM_VGPR_SZ(insn, reg_size)

No need for both VGPR_SZ and VKINSN_SZ.  How about

#define VEVALUATOR(x,y) VPASTER(x,y)
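
A self-contained C sketch (illustrative only, not part of the review) of why
the extra evaluation level matters; REG_WIDTH is pinned to 64 here just so
the expansion can be printed:

```
#include <stdio.h>

#define REG_WIDTH 64

#define VPASTER(x,y) x##_##y
#define VEVALUATOR(x,y) VPASTER(x,y)

/* Stringizing helpers, only here to make the expansion visible.  */
#define STR(x) #x
#define XSTR(x) STR(x)

int main (void)
{
  /* Pasting directly glues the literal token REG_WIDTH...  */
  puts (XSTR (VPASTER (rax, REG_WIDTH)));	/* prints rax_REG_WIDTH */
  /* ...while the evaluator expands REG_WIDTH first.  */
  puts (XSTR (VEVALUATOR (rax, REG_WIDTH)));	/* prints rax_64 */
  return 0;
}
```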

> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> new file mode 100644
> index 0000000000..5b04e89ecb
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> @@ -0,0 +1,97 @@
> +#!/usr/bin/python3
> +# Copyright (C) 2022 Free Software Foundation, Inc.
> +# This file is part of the GNU C Library.
> +#
> +# The GNU C Library is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU Lesser General Public
> +# License as published by the Free Software Foundation; either
> +# version 2.1 of the License, or (at your option) any later version.
> +#
> +# The GNU C Library is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +# Lesser General Public License for more details.
> +#
> +# You should have received a copy of the GNU Lesser General Public
> +# License along with the GNU C Library; if not, see
> +# <https://www.gnu.org/licenses/>.
> +"""Generate macros for getting GPR name of a certain size
> +
> +Inputs: None
> +Output: Prints header fill to stdout
> +
> +API:
> +    VGPR(reg_name)
> +        - Get register name VEC_SIZE component of `reg_name`
> +    VGPR_SZ(reg_name, reg_size)
> +        - Get register name `reg_size` component of `reg_name`
> +"""
> +
> +import sys
> +from datetime import datetime
> +
> +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> +
> +mask_insns = ["kmov", "kortest", "kor", "ktest", "kand", "kxor"]
> +mask_insns_ext = ["b", "w", "d", "q"]
> +
> +cr = """
> +   Copyright (C) {} Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +"""
> +
> +print("/* This file was generated by: {}.".format(sys.argv[0]))
> +print(cr.format(datetime.today().year))
> +
> +print("#ifndef _REG_MAP_MACROS_H")
> +print("#define _REG_MAP_MACROS_H\t1\n")
> +for reg in registers:
> +    for i in range(0, 4):
> +        for j in range(0, 4):
> +            print("#define {}_{}\t{}".format(reg[j], 8 << i, reg[3 - i]))
> +
> +print("")
> +for reg in registers:
> +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> +
> +print("")
> +for mask_insn in mask_insns:
> +    for i in range(0, 4):
> +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> +                                           mask_insns_ext[i]))
> +
> +print("")
> +for mask_insn in mask_insns:
> +    print("#define {}V \tVKINSN_SZ({}, REG_WIDTH)".format(mask_insn, mask_insn))
> +print("")
> +
> +print("#ifndef REG_WIDTH")
> +print("#define REG_WIDTH VEC_SIZE")
> +print("#endif")
> +print("#define PRIM_VGPR_SZ(reg_name, reg_size)\treg_name##_##reg_size")
> +print("#define VGPR_SZ(reg_name, reg_size)\tPRIM_VGPR_SZ(reg_name, reg_size)")
> +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> +print("#define VKINSN_SZ(insn, reg_size)\tPRIM_VGPR_SZ(insn, reg_size)")
> +
> +print("\n#endif")
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v3 1/3] x86: Update evex256/512 vec macros
  2022-10-14 16:40 [PATCH v1 1/3] x86: Update evex256/512 vec macros Noah Goldstein
                   ` (3 preceding siblings ...)
  2022-10-14 18:01 ` [PATCH v2 " Noah Goldstein
@ 2022-10-14 18:22 ` Noah Goldstein
  2022-10-14 18:22   ` [PATCH v3 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
  2022-10-14 18:22   ` [PATCH v3 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
  2022-10-14 18:41 ` [PATCH v4 1/3] x86: Update evex256/512 vec macros Noah Goldstein
                   ` (5 subsequent siblings)
  10 siblings, 2 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 18:22 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

1) Make section only define if there is not a previous definition
2) Add `VEC_lo` definition for proper reg-width but in the
   ymm/zmm0-15 range.

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/evex256-vecs.h | 7 +++++--
 sysdeps/x86_64/multiarch/evex512-vecs.h | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
index 222ba46dc7..4fccabd4b8 100644
--- a/sysdeps/x86_64/multiarch/evex256-vecs.h
+++ b/sysdeps/x86_64/multiarch/evex256-vecs.h
@@ -28,8 +28,11 @@
 #include "evex-vecs-common.h"
 
 #define USE_WITH_EVEX256	1
-#define SECTION(p)			p##.evex
 
-#define VEC					VEC_ymm
+#ifndef SECTION
+# define SECTION(p)			p##.evex
+#endif
 
+#define VEC					VEC_ymm
+#define VEC_lo				VEC_any_ymm
 #endif
diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
index d1784d5368..fecc2d3925 100644
--- a/sysdeps/x86_64/multiarch/evex512-vecs.h
+++ b/sysdeps/x86_64/multiarch/evex512-vecs.h
@@ -28,8 +28,11 @@
 #include "evex-vecs-common.h"
 
 #define USE_WITH_EVEX512	1
-#define SECTION(p)			p##.evex512
 
-#define VEC					VEC_zmm
+#ifndef SECTION
+# define SECTION(p)			p##.evex512
+#endif
 
+#define VEC					VEC_zmm
+#define VEC_lo				VEC_any_zmm
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v3 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 18:22 ` [PATCH v3 1/3] x86: Update evex256/512 vec macros Noah Goldstein
@ 2022-10-14 18:22   ` Noah Goldstein
  2022-10-14 18:22   ` [PATCH v3 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
  1 sibling, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 18:22 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

This is to make it easier to do things like:
```
vpcmpb %VEC(0), %VEC(1), %k0
kmov{d|q} %k0, %{eax|rax}
test %{eax|rax}
```

It adds macros s.t. any GPR can get the proper width with:
    `V{upper_case_GPR_name}`

and any mask insn can get the proper width with:
    `{upper_case_mask_insn_name}`
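
For illustration only (not part of the patch): a minimal C sketch of the
intended expansion, with string stand-ins for the real assembler
register/insn tokens and REG_WIDTH pinned to 64 where the header would
derive it from VEC_SIZE:

```
#include <stdio.h>

#define REG_WIDTH 64	/* the header defaults this to VEC_SIZE */

/* Strings stand in for assembler tokens so the result can be printed.  */
#define rax_32	"eax"
#define rax_64	"rax"
#define kmov_32	"kmovd"
#define kmov_64	"kmovq"

#define PRIM_VGPR_SZ(reg_name, reg_size)	reg_name##_##reg_size
#define VGPR_SZ(reg_name, reg_size)	PRIM_VGPR_SZ (reg_name, reg_size)
#define VGPR(reg_name)	VGPR_SZ (reg_name, REG_WIDTH)
#define VKINSN_SZ(insn, reg_size)	PRIM_VGPR_SZ (insn, reg_size)

#define VRAX	VGPR (rax)
#define KMOV	VKINSN_SZ (kmov, REG_WIDTH)

int main (void)
{
  /* Prints "kmovq %k0, %rax"; with REG_WIDTH == 32 it would print
     "kmovd %k0, %eax".  */
  printf ("%s %%k0, %%%s\n", KMOV, VRAX);
  return 0;
}
```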

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/reg-macros.h         | 351 ++++++++++++++++++
 .../multiarch/scripts/gen-reg-macros.py       | 112 ++++++
 2 files changed, 463 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
 create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py

diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
new file mode 100644
index 0000000000..2b6bf417d1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/reg-macros.h
@@ -0,0 +1,351 @@
+/* This file was generated by: gen-reg-macros.py.
+
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _REG_MACROS_H
+#define _REG_MACROS_H	1
+
+#define rax_8	al
+#define eax_8	al
+#define ax_8	al
+#define al_8	al
+#define rax_16	ax
+#define eax_16	ax
+#define ax_16	ax
+#define al_16	ax
+#define rax_32	eax
+#define eax_32	eax
+#define ax_32	eax
+#define al_32	eax
+#define rax_64	rax
+#define eax_64	rax
+#define ax_64	rax
+#define al_64	rax
+#define rbx_8	bl
+#define ebx_8	bl
+#define bx_8	bl
+#define bl_8	bl
+#define rbx_16	bx
+#define ebx_16	bx
+#define bx_16	bx
+#define bl_16	bx
+#define rbx_32	ebx
+#define ebx_32	ebx
+#define bx_32	ebx
+#define bl_32	ebx
+#define rbx_64	rbx
+#define ebx_64	rbx
+#define bx_64	rbx
+#define bl_64	rbx
+#define rcx_8	cl
+#define ecx_8	cl
+#define cx_8	cl
+#define cl_8	cl
+#define rcx_16	cx
+#define ecx_16	cx
+#define cx_16	cx
+#define cl_16	cx
+#define rcx_32	ecx
+#define ecx_32	ecx
+#define cx_32	ecx
+#define cl_32	ecx
+#define rcx_64	rcx
+#define ecx_64	rcx
+#define cx_64	rcx
+#define cl_64	rcx
+#define rdx_8	dl
+#define edx_8	dl
+#define dx_8	dl
+#define dl_8	dl
+#define rdx_16	dx
+#define edx_16	dx
+#define dx_16	dx
+#define dl_16	dx
+#define rdx_32	edx
+#define edx_32	edx
+#define dx_32	edx
+#define dl_32	edx
+#define rdx_64	rdx
+#define edx_64	rdx
+#define dx_64	rdx
+#define dl_64	rdx
+#define rbp_8	bpl
+#define ebp_8	bpl
+#define bp_8	bpl
+#define bpl_8	bpl
+#define rbp_16	bp
+#define ebp_16	bp
+#define bp_16	bp
+#define bpl_16	bp
+#define rbp_32	ebp
+#define ebp_32	ebp
+#define bp_32	ebp
+#define bpl_32	ebp
+#define rbp_64	rbp
+#define ebp_64	rbp
+#define bp_64	rbp
+#define bpl_64	rbp
+#define rsp_8	spl
+#define esp_8	spl
+#define sp_8	spl
+#define spl_8	spl
+#define rsp_16	sp
+#define esp_16	sp
+#define sp_16	sp
+#define spl_16	sp
+#define rsp_32	esp
+#define esp_32	esp
+#define sp_32	esp
+#define spl_32	esp
+#define rsp_64	rsp
+#define esp_64	rsp
+#define sp_64	rsp
+#define spl_64	rsp
+#define rsi_8	sil
+#define esi_8	sil
+#define si_8	sil
+#define sil_8	sil
+#define rsi_16	si
+#define esi_16	si
+#define si_16	si
+#define sil_16	si
+#define rsi_32	esi
+#define esi_32	esi
+#define si_32	esi
+#define sil_32	esi
+#define rsi_64	rsi
+#define esi_64	rsi
+#define si_64	rsi
+#define sil_64	rsi
+#define rdi_8	dil
+#define edi_8	dil
+#define di_8	dil
+#define dil_8	dil
+#define rdi_16	di
+#define edi_16	di
+#define di_16	di
+#define dil_16	di
+#define rdi_32	edi
+#define edi_32	edi
+#define di_32	edi
+#define dil_32	edi
+#define rdi_64	rdi
+#define edi_64	rdi
+#define di_64	rdi
+#define dil_64	rdi
+#define r8_8	r8b
+#define r8d_8	r8b
+#define r8w_8	r8b
+#define r8b_8	r8b
+#define r8_16	r8w
+#define r8d_16	r8w
+#define r8w_16	r8w
+#define r8b_16	r8w
+#define r8_32	r8d
+#define r8d_32	r8d
+#define r8w_32	r8d
+#define r8b_32	r8d
+#define r8_64	r8
+#define r8d_64	r8
+#define r8w_64	r8
+#define r8b_64	r8
+#define r9_8	r9b
+#define r9d_8	r9b
+#define r9w_8	r9b
+#define r9b_8	r9b
+#define r9_16	r9w
+#define r9d_16	r9w
+#define r9w_16	r9w
+#define r9b_16	r9w
+#define r9_32	r9d
+#define r9d_32	r9d
+#define r9w_32	r9d
+#define r9b_32	r9d
+#define r9_64	r9
+#define r9d_64	r9
+#define r9w_64	r9
+#define r9b_64	r9
+#define r10_8	r10b
+#define r10d_8	r10b
+#define r10w_8	r10b
+#define r10b_8	r10b
+#define r10_16	r10w
+#define r10d_16	r10w
+#define r10w_16	r10w
+#define r10b_16	r10w
+#define r10_32	r10d
+#define r10d_32	r10d
+#define r10w_32	r10d
+#define r10b_32	r10d
+#define r10_64	r10
+#define r10d_64	r10
+#define r10w_64	r10
+#define r10b_64	r10
+#define r11_8	r11b
+#define r11d_8	r11b
+#define r11w_8	r11b
+#define r11b_8	r11b
+#define r11_16	r11w
+#define r11d_16	r11w
+#define r11w_16	r11w
+#define r11b_16	r11w
+#define r11_32	r11d
+#define r11d_32	r11d
+#define r11w_32	r11d
+#define r11b_32	r11d
+#define r11_64	r11
+#define r11d_64	r11
+#define r11w_64	r11
+#define r11b_64	r11
+#define r12_8	r12b
+#define r12d_8	r12b
+#define r12w_8	r12b
+#define r12b_8	r12b
+#define r12_16	r12w
+#define r12d_16	r12w
+#define r12w_16	r12w
+#define r12b_16	r12w
+#define r12_32	r12d
+#define r12d_32	r12d
+#define r12w_32	r12d
+#define r12b_32	r12d
+#define r12_64	r12
+#define r12d_64	r12
+#define r12w_64	r12
+#define r12b_64	r12
+#define r13_8	r13b
+#define r13d_8	r13b
+#define r13w_8	r13b
+#define r13b_8	r13b
+#define r13_16	r13w
+#define r13d_16	r13w
+#define r13w_16	r13w
+#define r13b_16	r13w
+#define r13_32	r13d
+#define r13d_32	r13d
+#define r13w_32	r13d
+#define r13b_32	r13d
+#define r13_64	r13
+#define r13d_64	r13
+#define r13w_64	r13
+#define r13b_64	r13
+#define r14_8	r14b
+#define r14d_8	r14b
+#define r14w_8	r14b
+#define r14b_8	r14b
+#define r14_16	r14w
+#define r14d_16	r14w
+#define r14w_16	r14w
+#define r14b_16	r14w
+#define r14_32	r14d
+#define r14d_32	r14d
+#define r14w_32	r14d
+#define r14b_32	r14d
+#define r14_64	r14
+#define r14d_64	r14
+#define r14w_64	r14
+#define r14b_64	r14
+#define r15_8	r15b
+#define r15d_8	r15b
+#define r15w_8	r15b
+#define r15b_8	r15b
+#define r15_16	r15w
+#define r15d_16	r15w
+#define r15w_16	r15w
+#define r15b_16	r15w
+#define r15_32	r15d
+#define r15d_32	r15d
+#define r15w_32	r15d
+#define r15b_32	r15d
+#define r15_64	r15
+#define r15d_64	r15
+#define r15w_64	r15
+#define r15b_64	r15
+
+#define VRAX	VGPR(rax)
+#define VRBX	VGPR(rbx)
+#define VRCX	VGPR(rcx)
+#define VRDX	VGPR(rdx)
+#define VRBP	VGPR(rbp)
+#define VRSP	VGPR(rsp)
+#define VRSI	VGPR(rsi)
+#define VRDI	VGPR(rdi)
+#define VR8	VGPR(r8)
+#define VR9	VGPR(r9)
+#define VR10	VGPR(r10)
+#define VR11	VGPR(r11)
+#define VR12	VGPR(r12)
+#define VR13	VGPR(r13)
+#define VR14	VGPR(r14)
+#define VR15	VGPR(r15)
+
+#define kmov_8	kmovb
+#define kmov_16	kmovw
+#define kmov_32	kmovd
+#define kmov_64	kmovq
+#define kortest_8	kortestb
+#define kortest_16	kortestw
+#define kortest_32	kortestd
+#define kortest_64	kortestq
+#define kor_8	korb
+#define kor_16	korw
+#define kor_32	kord
+#define kor_64	korq
+#define ktest_8	ktestb
+#define ktest_16	ktestw
+#define ktest_32	ktestd
+#define ktest_64	ktestq
+#define kand_8	kandb
+#define kand_16	kandw
+#define kand_32	kandd
+#define kand_64	kandq
+#define kxor_8	kxorb
+#define kxor_16	kxorw
+#define kxor_32	kxord
+#define kxor_64	kxorq
+#define knot_8	knotb
+#define knot_16	knotw
+#define knot_32	knotd
+#define knot_64	knotq
+#define kxnor_8	kxnorb
+#define kxnor_16	kxnorw
+#define kxnor_32	kxnord
+#define kxnor_64	kxnorq
+#define kunpack_8	kunpackbw
+#define kunpack_16	kunpackwd
+#define kunpack_32	kunpackdq
+
+#define KMOV 	VKINSN_SZ(kmov, REG_WIDTH)
+#define KORTEST 	VKINSN_SZ(kortest, REG_WIDTH)
+#define KOR 	VKINSN_SZ(kor, REG_WIDTH)
+#define KTEST 	VKINSN_SZ(ktest, REG_WIDTH)
+#define KAND 	VKINSN_SZ(kand, REG_WIDTH)
+#define KXOR 	VKINSN_SZ(kxor, REG_WIDTH)
+#define KNOT 	VKINSN_SZ(knot, REG_WIDTH)
+#define KXNOR 	VKINSN_SZ(kxnor, REG_WIDTH)
+#define KUNPACK 	VKINSN_SZ(kunpack, REG_WIDTH)
+
+#ifndef REG_WIDTH
+# define REG_WIDTH VEC_SIZE
+#endif
+#define PRIM_VGPR_SZ(reg_name, reg_size)	reg_name##_##reg_size
+#define VGPR_SZ(reg_name, reg_size)	PRIM_VGPR_SZ(reg_name, reg_size)
+#define VGPR(reg_name)	VGPR_SZ(reg_name, REG_WIDTH)
+#define VKINSN_SZ(insn, reg_size)	PRIM_VGPR_SZ(insn, reg_size)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
new file mode 100644
index 0000000000..cf65c9fb8d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
@@ -0,0 +1,112 @@
+#!/usr/bin/python3
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+"""Generate macros for getting GPR name of a certain size
+
+Inputs: None
+Output: Prints header file to stdout
+
+API:
+    VGPR(reg_name)
+        - Get register name VEC_SIZE component of `reg_name`
+    VGPR_SZ(reg_name, reg_size)
+        - Get register name `reg_size` component of `reg_name`
+"""
+
+import sys
+from datetime import datetime
+
+registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
+             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
+             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
+             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
+             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
+             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
+             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
+             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
+
+mask_insns = [
+    "kmov",
+    "kortest",
+    "kor",
+    "ktest",
+    "kand",
+    "kxor",
+    "knot",
+    "kxnor",
+]
+mask_insns_ext = ["b", "w", "d", "q"]
+
+cr = """
+   Copyright (C) {} Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+"""
+
+print("/* This file was generated by: {}.".format(sys.argv[0]))
+print(cr.format(datetime.today().year))
+
+print("#ifndef _REG_MACROS_H")
+print("#define _REG_MACROS_H\t1\n")
+for reg in registers:
+    for i in range(0, 4):
+        for j in range(0, 4):
+            print("#define {}_{}\t{}".format(reg[j], 8 << i, reg[3 - i]))
+
+print("")
+for reg in registers:
+    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
+
+print("")
+for mask_insn in mask_insns:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
+                                           mask_insns_ext[i]))
+for i in range(0, 3):
+    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
+                                                   mask_insns_ext[i + 1]))
+mask_insns.append("kunpack")
+
+print("")
+
+for mask_insn in mask_insns:
+    print("#define {} \tVKINSN_SZ({}, REG_WIDTH)".format(
+        mask_insn.upper(), mask_insn))
+print("")
+
+print("#ifndef REG_WIDTH")
+print("# define REG_WIDTH VEC_SIZE")
+print("#endif")
+print("#define PRIM_VGPR_SZ(reg_name, reg_size)\treg_name##_##reg_size")
+print("#define VGPR_SZ(reg_name, reg_size)\tPRIM_VGPR_SZ(reg_name, reg_size)")
+print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
+print("#define VKINSN_SZ(insn, reg_size)\tPRIM_VGPR_SZ(insn, reg_size)")
+
+print("\n#endif")
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v3 3/3] x86: Update strlen-evex-base to use new reg/vec macros.
  2022-10-14 18:22 ` [PATCH v3 1/3] x86: Update evex256/512 vec macros Noah Goldstein
  2022-10-14 18:22   ` [PATCH v3 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
@ 2022-10-14 18:22   ` Noah Goldstein
  1 sibling, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 18:22 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

To avoid duplicating the VMM / GPR / mask insn macros in all incoming
evex512 files, use the macros defined in 'reg-macros.h' and
'{vec}-macros.h'.

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 116 +++++++-------------
 sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
 2 files changed, 44 insertions(+), 76 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 418e9f8411..8af9791e92 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -36,42 +36,10 @@
 #  define CHAR_SIZE	1
 # endif
 
-# define XMM0		xmm16
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# if VEC_SIZE == 64
-#  define KMOV		kmovq
-#  define KORTEST	kortestq
-#  define RAX		rax
-#  define RCX		rcx
-#  define RDX		rdx
-#  define SHR		shrq
-#  define TEXTSUFFIX	evex512
-#  define VMM0		zmm16
-#  define VMM1		zmm17
-#  define VMM2		zmm18
-#  define VMM3		zmm19
-#  define VMM4		zmm20
-#  define VMOVA		vmovdqa64
-# elif VEC_SIZE == 32
-/* Currently Unused.  */
-#  define KMOV		kmovd
-#  define KORTEST	kortestd
-#  define RAX		eax
-#  define RCX		ecx
-#  define RDX		edx
-#  define SHR		shrl
-#  define TEXTSUFFIX	evex256
-#  define VMM0		ymm16
-#  define VMM1		ymm17
-#  define VMM2		ymm18
-#  define VMM3		ymm19
-#  define VMM4		ymm20
-#  define VMOVA		vmovdqa32
-# endif
-
-	.section .text.TEXTSUFFIX, "ax", @progbits
+	.section SECTION(.text),"ax",@progbits
 /* Aligning entry point to 64 byte, provides better performance for
    one vector length string.  */
 ENTRY_P2ALIGN (STRLEN, 6)
@@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
 # endif
 
 	movl	%edi, %eax
-	vpxorq	%XMM0, %XMM0, %XMM0
+	vpxorq	%VEC_xmm(0), %VEC_xmm(0), %VEC_xmm(0)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(page_cross)
 
 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM0, %k0
-	KMOV	%k0, %RAX
-	test	%RAX, %RAX
+	VPCMP	$0, (%rdi), %VEC(0), %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
@@ -120,7 +88,7 @@ L(align_more):
 	movq	%rax, %rdx
 	subq	%rdi, %rdx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RDX
+	shr	$2, %VRDX
 #  endif
 	/* At this point rdx contains [w]chars already compared.  */
 	subq	%rsi, %rdx
@@ -131,9 +99,9 @@ L(align_more):
 # endif
 
 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (%rax), %VEC(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
 # ifdef USE_AS_STRNLEN
@@ -141,9 +109,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, VEC_SIZE(%rax), %VEC(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
 # ifdef USE_AS_STRNLEN
@@ -151,9 +119,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VEC(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 
 # ifdef USE_AS_STRNLEN
@@ -161,9 +129,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VEC(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x4)
 
 # ifdef USE_AS_STRNLEN
@@ -179,7 +147,7 @@ L(align_more):
 # ifdef USE_AS_STRNLEN
 	subq	%rax, %rcx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RCX
+	shr	$2, %VRCX
 #  endif
 	/* rcx contains number of [w]char will be recompared due to
 	   alignment fixes.  rdx must be incremented by rcx to offset
@@ -199,42 +167,42 @@ L(loop_entry):
 # endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+	VMOVA	(VEC_SIZE * 4)(%rax), %VEC(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VEC(1), %VEC(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VEC(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VEC(3), %VEC(4)
 
-	VPTESTN	%VMM2, %VMM2, %k0
-	VPTESTN	%VMM4, %VMM4, %k1
+	VPTESTN	%VEC(2), %VEC(2), %k0
+	VPTESTN	%VEC(4), %VEC(4), %k1
 
 	subq	$-(VEC_SIZE * 4), %rax
 	KORTEST	%k0, %k1
 	jz	L(loop)
 
-	VPTESTN	%VMM1, %VMM1, %k2
-	KMOV	%k2, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VEC(1), %VEC(1), %k2
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
-	KMOV	%k0, %RCX
+	KMOV	%k0, %VRCX
 	/* At this point, if k0 is non zero, null char must be in the
 	   second vector.  */
-	test	%RCX, %RCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
-	VPTESTN	%VMM3, %VMM3, %k3
-	KMOV	%k3, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VEC(3), %VEC(3), %k3
+	KMOV	%k3, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 	/* At this point null [w]char must be in the fourth vector so no
 	   need to check.  */
-	KMOV	%k1, %RCX
+	KMOV	%k1, %VRCX
 
 	/* Fourth, third, second vector terminating are pretty much
 	   same, implemented this way to avoid branching and reuse code
 	   from pre loop exit condition.  */
 L(ret_vec_x4):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 3), %rax
@@ -250,7 +218,7 @@ L(ret_vec_x4):
 	ret
 
 L(ret_vec_x3):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 2), %rax
@@ -268,7 +236,7 @@ L(ret_vec_x3):
 L(ret_vec_x2):
 	subq	$-VEC_SIZE, %rax
 L(ret_vec_x1):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
@@ -289,13 +257,13 @@ L(page_cross):
 	/* ecx contains number of w[char] to be skipped as a result
 	   of address alignment.  */
 	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
-	KMOV	%k0, %RAX
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VEC(0), %k0
+	KMOV	%k0, %VRAX
 	/* Ignore number of character for alignment adjustment.  */
-	SHR	%cl, %RAX
+	shr	%cl, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
index 116f8981c8..dfd0a7821b 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -2,6 +2,6 @@
 # define STRLEN		__strlen_evex512
 #endif
 
-#define VEC_SIZE	64
-
+#include "evex512-vecs.h"
+#include "reg-macros.h"
 #include "strlen-evex-base.S"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 18:02   ` H.J. Lu
@ 2022-10-14 18:26     ` Noah Goldstein
  2022-10-14 18:35       ` H.J. Lu
  0 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 18:26 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
>  On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > This is to make it easier to do think like:
> > ```
> > vpcmpb %VEC(0), %VEC(1), %k0
> > kmov{d|q} %k0, %{eax|rax}
> > test %{eax|rax}
> > ```
>
> Since all these register macros are based on VEC_SIZE which is either 32
> bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are
> needed.  8-bit and 16-bit macros aren't needed.
>
> > It adds macro s.t any GPR can get the proper width with:
> >     `V{upper_case_GPR_name}`
> >
> > and any mask insn can get the proper width with:
> >     `{mask_insn_without_postfix}V`
>
> All macros should be in upper cases.
>
> > This commit does not change libc.so
> >
> > Tested build on x86-64
> > ---
> >  sysdeps/x86_64/multiarch/reg-macros.h         | 337 ++++++++++++++++++
> >  .../multiarch/scripts/gen-reg-map-macros.py   |  97 +++++
> >  2 files changed, 434 insertions(+)
> >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> >
> > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > new file mode 100644
> > index 0000000000..c4d7f57b66
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
>
> vreg-macros.h to indicate macros based on vector size.   Please
> add comments to indicate that register macros are expanded based
> on vector size.
>
> > @@ -0,0 +1,337 @@
> > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py.
> > +
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _REG_MAP_MACROS_H
> > +#define _REG_MAP_MACROS_H      1
> > +
> > +#define rax_8  al
> > +#define eax_8  al
> > +#define ax_8   al
> > +#define al_8   al
> > +#define rax_16 ax
> > +#define eax_16 ax
> > +#define ax_16  ax
> > +#define al_16  ax
> > +#define rax_32 eax
> > +#define eax_32 eax
> > +#define ax_32  eax
> > +#define al_32  eax
> > +#define rax_64 rax
> > +#define eax_64 rax
> > +#define ax_64  rax
> > +#define al_64  rax
>
> Only rax_32 and rax_64 are needed.
>
> > +#define rbx_8  bl
> > +#define ebx_8  bl
> > +#define bx_8   bl
> > +#define bl_8   bl
> > +#define rbx_16 bx
> > +#define ebx_16 bx
> > +#define bx_16  bx
> > +#define bl_16  bx
> > +#define rbx_32 ebx
> > +#define ebx_32 ebx
> > +#define bx_32  ebx
> > +#define bl_32  ebx
> > +#define rbx_64 rbx
> > +#define ebx_64 rbx
> > +#define bx_64  rbx
> > +#define bl_64  rbx
> > +#define rcx_8  cl
> > +#define ecx_8  cl
> > +#define cx_8   cl
> > +#define cl_8   cl
> > +#define rcx_16 cx
> > +#define ecx_16 cx
> > +#define cx_16  cx
> > +#define cl_16  cx
> > +#define rcx_32 ecx
> > +#define ecx_32 ecx
> > +#define cx_32  ecx
> > +#define cl_32  ecx
> > +#define rcx_64 rcx
> > +#define ecx_64 rcx
> > +#define cx_64  rcx
> > +#define cl_64  rcx
> > +#define rdx_8  dl
> > +#define edx_8  dl
> > +#define dx_8   dl
> > +#define dl_8   dl
> > +#define rdx_16 dx
> > +#define edx_16 dx
> > +#define dx_16  dx
> > +#define dl_16  dx
> > +#define rdx_32 edx
> > +#define edx_32 edx
> > +#define dx_32  edx
> > +#define dl_32  edx
> > +#define rdx_64 rdx
> > +#define edx_64 rdx
> > +#define dx_64  rdx
> > +#define dl_64  rdx
> > +#define rbp_8  bpl
> > +#define ebp_8  bpl
> > +#define bp_8   bpl
> > +#define bpl_8  bpl
> > +#define rbp_16 bp
> > +#define ebp_16 bp
> > +#define bp_16  bp
> > +#define bpl_16 bp
> > +#define rbp_32 ebp
> > +#define ebp_32 ebp
> > +#define bp_32  ebp
> > +#define bpl_32 ebp
> > +#define rbp_64 rbp
> > +#define ebp_64 rbp
> > +#define bp_64  rbp
> > +#define bpl_64 rbp
> > +#define rsp_8  spl
> > +#define esp_8  spl
> > +#define sp_8   spl
> > +#define spl_8  spl
> > +#define rsp_16 sp
> > +#define esp_16 sp
> > +#define sp_16  sp
> > +#define spl_16 sp
> > +#define rsp_32 esp
> > +#define esp_32 esp
> > +#define sp_32  esp
> > +#define spl_32 esp
> > +#define rsp_64 rsp
> > +#define esp_64 rsp
> > +#define sp_64  rsp
> > +#define spl_64 rsp
> > +#define rsi_8  sil
> > +#define esi_8  sil
> > +#define si_8   sil
> > +#define sil_8  sil
> > +#define rsi_16 si
> > +#define esi_16 si
> > +#define si_16  si
> > +#define sil_16 si
> > +#define rsi_32 esi
> > +#define esi_32 esi
> > +#define si_32  esi
> > +#define sil_32 esi
> > +#define rsi_64 rsi
> > +#define esi_64 rsi
> > +#define si_64  rsi
> > +#define sil_64 rsi
> > +#define rdi_8  dil
> > +#define edi_8  dil
> > +#define di_8   dil
> > +#define dil_8  dil
> > +#define rdi_16 di
> > +#define edi_16 di
> > +#define di_16  di
> > +#define dil_16 di
> > +#define rdi_32 edi
> > +#define edi_32 edi
> > +#define di_32  edi
> > +#define dil_32 edi
> > +#define rdi_64 rdi
> > +#define edi_64 rdi
> > +#define di_64  rdi
> > +#define dil_64 rdi
> > +#define r8_8   r8b
> > +#define r8d_8  r8b
> > +#define r8w_8  r8b
> > +#define r8b_8  r8b
> > +#define r8_16  r8w
> > +#define r8d_16 r8w
> > +#define r8w_16 r8w
> > +#define r8b_16 r8w
> > +#define r8_32  r8d
> > +#define r8d_32 r8d
> > +#define r8w_32 r8d
> > +#define r8b_32 r8d
> > +#define r8_64  r8
> > +#define r8d_64 r8
> > +#define r8w_64 r8
> > +#define r8b_64 r8
> > +#define r9_8   r9b
> > +#define r9d_8  r9b
> > +#define r9w_8  r9b
> > +#define r9b_8  r9b
> > +#define r9_16  r9w
> > +#define r9d_16 r9w
> > +#define r9w_16 r9w
> > +#define r9b_16 r9w
> > +#define r9_32  r9d
> > +#define r9d_32 r9d
> > +#define r9w_32 r9d
> > +#define r9b_32 r9d
> > +#define r9_64  r9
> > +#define r9d_64 r9
> > +#define r9w_64 r9
> > +#define r9b_64 r9
> > +#define r10_8  r10b
> > +#define r10d_8 r10b
> > +#define r10w_8 r10b
> > +#define r10b_8 r10b
> > +#define r10_16 r10w
> > +#define r10d_16        r10w
> > +#define r10w_16        r10w
> > +#define r10b_16        r10w
> > +#define r10_32 r10d
> > +#define r10d_32        r10d
> > +#define r10w_32        r10d
> > +#define r10b_32        r10d
> > +#define r10_64 r10
> > +#define r10d_64        r10
> > +#define r10w_64        r10
> > +#define r10b_64        r10
> > +#define r11_8  r11b
> > +#define r11d_8 r11b
> > +#define r11w_8 r11b
> > +#define r11b_8 r11b
> > +#define r11_16 r11w
> > +#define r11d_16        r11w
> > +#define r11w_16        r11w
> > +#define r11b_16        r11w
> > +#define r11_32 r11d
> > +#define r11d_32        r11d
> > +#define r11w_32        r11d
> > +#define r11b_32        r11d
> > +#define r11_64 r11
> > +#define r11d_64        r11
> > +#define r11w_64        r11
> > +#define r11b_64        r11
> > +#define r12_8  r12b
> > +#define r12d_8 r12b
> > +#define r12w_8 r12b
> > +#define r12b_8 r12b
> > +#define r12_16 r12w
> > +#define r12d_16        r12w
> > +#define r12w_16        r12w
> > +#define r12b_16        r12w
> > +#define r12_32 r12d
> > +#define r12d_32        r12d
> > +#define r12w_32        r12d
> > +#define r12b_32        r12d
> > +#define r12_64 r12
> > +#define r12d_64        r12
> > +#define r12w_64        r12
> > +#define r12b_64        r12
> > +#define r13_8  r13b
> > +#define r13d_8 r13b
> > +#define r13w_8 r13b
> > +#define r13b_8 r13b
> > +#define r13_16 r13w
> > +#define r13d_16        r13w
> > +#define r13w_16        r13w
> > +#define r13b_16        r13w
> > +#define r13_32 r13d
> > +#define r13d_32        r13d
> > +#define r13w_32        r13d
> > +#define r13b_32        r13d
> > +#define r13_64 r13
> > +#define r13d_64        r13
> > +#define r13w_64        r13
> > +#define r13b_64        r13
> > +#define r14_8  r14b
> > +#define r14d_8 r14b
> > +#define r14w_8 r14b
> > +#define r14b_8 r14b
> > +#define r14_16 r14w
> > +#define r14d_16        r14w
> > +#define r14w_16        r14w
> > +#define r14b_16        r14w
> > +#define r14_32 r14d
> > +#define r14d_32        r14d
> > +#define r14w_32        r14d
> > +#define r14b_32        r14d
> > +#define r14_64 r14
> > +#define r14d_64        r14
> > +#define r14w_64        r14
> > +#define r14b_64        r14
> > +#define r15_8  r15b
> > +#define r15d_8 r15b
> > +#define r15w_8 r15b
> > +#define r15b_8 r15b
> > +#define r15_16 r15w
> > +#define r15d_16        r15w
> > +#define r15w_16        r15w
> > +#define r15b_16        r15w
> > +#define r15_32 r15d
> > +#define r15d_32        r15d
> > +#define r15w_32        r15d
> > +#define r15b_32        r15d
> > +#define r15_64 r15
> > +#define r15d_64        r15
> > +#define r15w_64        r15
> > +#define r15b_64        r15
> > +
> > +#define VRAX   VGPR(rax)
> > +#define VRBX   VGPR(rbx)
> > +#define VRCX   VGPR(rcx)
> > +#define VRDX   VGPR(rdx)
> > +#define VRBP   VGPR(rbp)
> > +#define VRSP   VGPR(rsp)
> > +#define VRSI   VGPR(rsi)
> > +#define VRDI   VGPR(rdi)
> > +#define VR8    VGPR(r8)
> > +#define VR9    VGPR(r9)
> > +#define VR10   VGPR(r10)
> > +#define VR11   VGPR(r11)
> > +#define VR12   VGPR(r12)
> > +#define VR13   VGPR(r13)
> > +#define VR14   VGPR(r14)
> > +#define VR15   VGPR(r15)
> > +
> > +#define kmov_8 kmovb
> > +#define kmov_16        kmovw
> > +#define kmov_32        kmovd
> > +#define kmov_64        kmovq
>
> Only 32 and 64 are needed.

That's not entirely true for the wide-char impls.
>
> > +#define kortest_8      kortestb
> > +#define kortest_16     kortestw
> > +#define kortest_32     kortestd
> > +#define kortest_64     kortestq
> > +#define kor_8  korb
> > +#define kor_16 korw
> > +#define kor_32 kord
> > +#define kor_64 korq
> > +#define ktest_8        ktestb
> > +#define ktest_16       ktestw
> > +#define ktest_32       ktestd
> > +#define ktest_64       ktestq
> > +#define kand_8 kandb
> > +#define kand_16        kandw
> > +#define kand_32        kandd
> > +#define kand_64        kandq
> > +#define kxor_8 kxorb
> > +#define kxor_16        kxorw
> > +#define kxor_32        kxord
> > +#define kxor_64        kxorq
> > +
> > +#define kmovV  VKINSN_SZ(kmov, REG_WIDTH)
> > +#define kortestV       VKINSN_SZ(kortest, REG_WIDTH)
> > +#define korV   VKINSN_SZ(kor, REG_WIDTH)
> > +#define ktestV         VKINSN_SZ(ktest, REG_WIDTH)
> > +#define kandV  VKINSN_SZ(kand, REG_WIDTH)
> > +#define kxorV  VKINSN_SZ(kxor, REG_WIDTH)
>
> #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH)

Will fix for V5.
>
> > +
> > +#ifndef REG_WIDTH
> > +#define REG_WIDTH VEC_SIZE
>
> Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH
> can be dropped.

That's not quite true.

For wide-char impls REG_WIDTH != VEC_SIZE.
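
A minimal sketch of the arithmetic behind that (assuming 4-byte wchar_t on
x86-64; values illustrative, not taken from the patch): a 64-byte vector
compared per wchar yields only 16 mask bits, so a 32-bit mask move / GPR
already covers it even though VEC_SIZE is 64.

```
#include <stdio.h>

int main (void)
{
  int vec_size = 64;		/* zmm width in bytes */
  int char_size = 4;		/* sizeof (wchar_t) on x86-64 */
  int mask_bits = vec_size / char_size;
  /* Prints "16 mask bits -> 32-bit kmovd/eax is wide enough".  */
  printf ("%d mask bits -> 32-bit kmovd/eax is wide enough\n", mask_bits);
  return 0;
}
```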
>
> > +#endif
> > +#define PRIM_VGPR_SZ(reg_name, reg_size)       reg_name##_##reg_size
>
> This is used for both register and instruction.  How about
>
> #define VPASTER(x,y) x##_##y

Will fix for V5.
>
>
> > +#define VGPR_SZ(reg_name, reg_size)    PRIM_VGPR_SZ(reg_name, reg_size)
> > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> > +#define VKINSN_SZ(insn, reg_size)      PRIM_VGPR_SZ(insn, reg_size)
>
> No need for both VGPR_SZ and VKINSN_SZ.  How about
>
> #define VEVALUATOR(x,y) VPASTER(x,y)

Will change for V5.
>
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> > new file mode 100644
> > index 0000000000..5b04e89ecb
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> > @@ -0,0 +1,97 @@
> > +#!/usr/bin/python3
> > +# Copyright (C) 2022 Free Software Foundation, Inc.
> > +# This file is part of the GNU C Library.
> > +#
> > +# The GNU C Library is free software; you can redistribute it and/or
> > +# modify it under the terms of the GNU Lesser General Public
> > +# License as published by the Free Software Foundation; either
> > +# version 2.1 of the License, or (at your option) any later version.
> > +#
> > +# The GNU C Library is distributed in the hope that it will be useful,
> > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +# Lesser General Public License for more details.
> > +#
> > +# You should have received a copy of the GNU Lesser General Public
> > +# License along with the GNU C Library; if not, see
> > +# <https://www.gnu.org/licenses/>.
> > +"""Generate macros for getting GPR name of a certain size
> > +
> > +Inputs: None
> > +Output: Prints header fill to stdout
> > +
> > +API:
> > +    VGPR(reg_name)
> > +        - Get register name VEC_SIZE component of `reg_name`
> > +    VGPR_SZ(reg_name, reg_size)
> > +        - Get register name `reg_size` component of `reg_name`
> > +"""
> > +
> > +import sys
> > +from datetime import datetime
> > +
> > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> > +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> > +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> > +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> > +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> > +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> > +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> > +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> > +
> > +mask_insns = ["kmov", "kortest", "kor", "ktest", "kand", "kxor"]
> > +mask_insns_ext = ["b", "w", "d", "q"]
> > +
> > +cr = """
> > +   Copyright (C) {} Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +"""
> > +
> > +print("/* This file was generated by: {}.".format(sys.argv[0]))
> > +print(cr.format(datetime.today().year))
> > +
> > +print("#ifndef _REG_MAP_MACROS_H")
> > +print("#define _REG_MAP_MACROS_H\t1\n")
> > +for reg in registers:
> > +    for i in range(0, 4):
> > +        for j in range(0, 4):
> > +            print("#define {}_{}\t{}".format(reg[j], 8 << i, reg[3 - i]))
> > +
> > +print("")
> > +for reg in registers:
> > +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> > +
> > +print("")
> > +for mask_insn in mask_insns:
> > +    for i in range(0, 4):
> > +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> > +                                           mask_insns_ext[i]))
> > +
> > +print("")
> > +for mask_insn in mask_insns:
> > +    print("#define {}V \tVKINSN_SZ({}, REG_WIDTH)".format(mask_insn, mask_insn))
> > +print("")
> > +
> > +print("#ifndef REG_WIDTH")
> > +print("#define REG_WIDTH VEC_SIZE")
> > +print("#endif")
> > +print("#define PRIM_VGPR_SZ(reg_name, reg_size)\treg_name##_##reg_size")
> > +print("#define VGPR_SZ(reg_name, reg_size)\tPRIM_VGPR_SZ(reg_name, reg_size)")
> > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> > +print("#define VKINSN_SZ(insn, reg_size)\tPRIM_VGPR_SZ(insn, reg_size)")
> > +
> > +print("\n#endif")
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 18:26     ` Noah Goldstein
@ 2022-10-14 18:35       ` H.J. Lu
  2022-10-14 18:38         ` Noah Goldstein
  0 siblings, 1 reply; 72+ messages in thread
From: H.J. Lu @ 2022-10-14 18:35 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 11:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> >  On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > This is to make it easier to do think like:
> > > ```
> > > vpcmpb %VEC(0), %VEC(1), %k0
> > > kmov{d|q} %k0, %{eax|rax}
> > > test %{eax|rax}
> > > ```
> >
> > Since all these register macros are based on VEC_SIZE which is either 32
> > bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are
> > needed.  8-bit and 16-bit macros aren't needed.
> >
> > > It adds macro s.t any GPR can get the proper width with:
> > >     `V{upper_case_GPR_name}`
> > >
> > > and any mask insn can get the proper width with:
> > >     `{mask_insn_without_postfix}V`
> >
> > All macros should be in upper cases.
> >
> > > This commit does not change libc.so
> > >
> > > Tested build on x86-64
> > > ---
> > >  sysdeps/x86_64/multiarch/reg-macros.h         | 337 ++++++++++++++++++
> > >  .../multiarch/scripts/gen-reg-map-macros.py   |  97 +++++
> > >  2 files changed, 434 insertions(+)
> > >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> > >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > > new file mode 100644
> > > index 0000000000..c4d7f57b66
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> >
> > vreg-macros.h to indicate macros based on vector size.   Please
> > add comments to indicate that register macros are expanded based
> > on vector size.
> >
> > > @@ -0,0 +1,337 @@
> > > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py.
> > > +
> > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#ifndef _REG_MAP_MACROS_H
> > > +#define _REG_MAP_MACROS_H      1
> > > +
> > > +#define rax_8  al
> > > +#define eax_8  al
> > > +#define ax_8   al
> > > +#define al_8   al
> > > +#define rax_16 ax
> > > +#define eax_16 ax
> > > +#define ax_16  ax
> > > +#define al_16  ax
> > > +#define rax_32 eax
> > > +#define eax_32 eax
> > > +#define ax_32  eax
> > > +#define al_32  eax
> > > +#define rax_64 rax
> > > +#define eax_64 rax
> > > +#define ax_64  rax
> > > +#define al_64  rax
> >
> > Only rax_32 and rax_64 are needed.
> >
> > > +#define rbx_8  bl
> > > +#define ebx_8  bl
> > > +#define bx_8   bl
> > > +#define bl_8   bl
> > > +#define rbx_16 bx
> > > +#define ebx_16 bx
> > > +#define bx_16  bx
> > > +#define bl_16  bx
> > > +#define rbx_32 ebx
> > > +#define ebx_32 ebx
> > > +#define bx_32  ebx
> > > +#define bl_32  ebx
> > > +#define rbx_64 rbx
> > > +#define ebx_64 rbx
> > > +#define bx_64  rbx
> > > +#define bl_64  rbx
> > > +#define rcx_8  cl
> > > +#define ecx_8  cl
> > > +#define cx_8   cl
> > > +#define cl_8   cl
> > > +#define rcx_16 cx
> > > +#define ecx_16 cx
> > > +#define cx_16  cx
> > > +#define cl_16  cx
> > > +#define rcx_32 ecx
> > > +#define ecx_32 ecx
> > > +#define cx_32  ecx
> > > +#define cl_32  ecx
> > > +#define rcx_64 rcx
> > > +#define ecx_64 rcx
> > > +#define cx_64  rcx
> > > +#define cl_64  rcx
> > > +#define rdx_8  dl
> > > +#define edx_8  dl
> > > +#define dx_8   dl
> > > +#define dl_8   dl
> > > +#define rdx_16 dx
> > > +#define edx_16 dx
> > > +#define dx_16  dx
> > > +#define dl_16  dx
> > > +#define rdx_32 edx
> > > +#define edx_32 edx
> > > +#define dx_32  edx
> > > +#define dl_32  edx
> > > +#define rdx_64 rdx
> > > +#define edx_64 rdx
> > > +#define dx_64  rdx
> > > +#define dl_64  rdx
> > > +#define rbp_8  bpl
> > > +#define ebp_8  bpl
> > > +#define bp_8   bpl
> > > +#define bpl_8  bpl
> > > +#define rbp_16 bp
> > > +#define ebp_16 bp
> > > +#define bp_16  bp
> > > +#define bpl_16 bp
> > > +#define rbp_32 ebp
> > > +#define ebp_32 ebp
> > > +#define bp_32  ebp
> > > +#define bpl_32 ebp
> > > +#define rbp_64 rbp
> > > +#define ebp_64 rbp
> > > +#define bp_64  rbp
> > > +#define bpl_64 rbp
> > > +#define rsp_8  spl
> > > +#define esp_8  spl
> > > +#define sp_8   spl
> > > +#define spl_8  spl
> > > +#define rsp_16 sp
> > > +#define esp_16 sp
> > > +#define sp_16  sp
> > > +#define spl_16 sp
> > > +#define rsp_32 esp
> > > +#define esp_32 esp
> > > +#define sp_32  esp
> > > +#define spl_32 esp
> > > +#define rsp_64 rsp
> > > +#define esp_64 rsp
> > > +#define sp_64  rsp
> > > +#define spl_64 rsp
> > > +#define rsi_8  sil
> > > +#define esi_8  sil
> > > +#define si_8   sil
> > > +#define sil_8  sil
> > > +#define rsi_16 si
> > > +#define esi_16 si
> > > +#define si_16  si
> > > +#define sil_16 si
> > > +#define rsi_32 esi
> > > +#define esi_32 esi
> > > +#define si_32  esi
> > > +#define sil_32 esi
> > > +#define rsi_64 rsi
> > > +#define esi_64 rsi
> > > +#define si_64  rsi
> > > +#define sil_64 rsi
> > > +#define rdi_8  dil
> > > +#define edi_8  dil
> > > +#define di_8   dil
> > > +#define dil_8  dil
> > > +#define rdi_16 di
> > > +#define edi_16 di
> > > +#define di_16  di
> > > +#define dil_16 di
> > > +#define rdi_32 edi
> > > +#define edi_32 edi
> > > +#define di_32  edi
> > > +#define dil_32 edi
> > > +#define rdi_64 rdi
> > > +#define edi_64 rdi
> > > +#define di_64  rdi
> > > +#define dil_64 rdi
> > > +#define r8_8   r8b
> > > +#define r8d_8  r8b
> > > +#define r8w_8  r8b
> > > +#define r8b_8  r8b
> > > +#define r8_16  r8w
> > > +#define r8d_16 r8w
> > > +#define r8w_16 r8w
> > > +#define r8b_16 r8w
> > > +#define r8_32  r8d
> > > +#define r8d_32 r8d
> > > +#define r8w_32 r8d
> > > +#define r8b_32 r8d
> > > +#define r8_64  r8
> > > +#define r8d_64 r8
> > > +#define r8w_64 r8
> > > +#define r8b_64 r8
> > > +#define r9_8   r9b
> > > +#define r9d_8  r9b
> > > +#define r9w_8  r9b
> > > +#define r9b_8  r9b
> > > +#define r9_16  r9w
> > > +#define r9d_16 r9w
> > > +#define r9w_16 r9w
> > > +#define r9b_16 r9w
> > > +#define r9_32  r9d
> > > +#define r9d_32 r9d
> > > +#define r9w_32 r9d
> > > +#define r9b_32 r9d
> > > +#define r9_64  r9
> > > +#define r9d_64 r9
> > > +#define r9w_64 r9
> > > +#define r9b_64 r9
> > > +#define r10_8  r10b
> > > +#define r10d_8 r10b
> > > +#define r10w_8 r10b
> > > +#define r10b_8 r10b
> > > +#define r10_16 r10w
> > > +#define r10d_16        r10w
> > > +#define r10w_16        r10w
> > > +#define r10b_16        r10w
> > > +#define r10_32 r10d
> > > +#define r10d_32        r10d
> > > +#define r10w_32        r10d
> > > +#define r10b_32        r10d
> > > +#define r10_64 r10
> > > +#define r10d_64        r10
> > > +#define r10w_64        r10
> > > +#define r10b_64        r10
> > > +#define r11_8  r11b
> > > +#define r11d_8 r11b
> > > +#define r11w_8 r11b
> > > +#define r11b_8 r11b
> > > +#define r11_16 r11w
> > > +#define r11d_16        r11w
> > > +#define r11w_16        r11w
> > > +#define r11b_16        r11w
> > > +#define r11_32 r11d
> > > +#define r11d_32        r11d
> > > +#define r11w_32        r11d
> > > +#define r11b_32        r11d
> > > +#define r11_64 r11
> > > +#define r11d_64        r11
> > > +#define r11w_64        r11
> > > +#define r11b_64        r11
> > > +#define r12_8  r12b
> > > +#define r12d_8 r12b
> > > +#define r12w_8 r12b
> > > +#define r12b_8 r12b
> > > +#define r12_16 r12w
> > > +#define r12d_16        r12w
> > > +#define r12w_16        r12w
> > > +#define r12b_16        r12w
> > > +#define r12_32 r12d
> > > +#define r12d_32        r12d
> > > +#define r12w_32        r12d
> > > +#define r12b_32        r12d
> > > +#define r12_64 r12
> > > +#define r12d_64        r12
> > > +#define r12w_64        r12
> > > +#define r12b_64        r12
> > > +#define r13_8  r13b
> > > +#define r13d_8 r13b
> > > +#define r13w_8 r13b
> > > +#define r13b_8 r13b
> > > +#define r13_16 r13w
> > > +#define r13d_16        r13w
> > > +#define r13w_16        r13w
> > > +#define r13b_16        r13w
> > > +#define r13_32 r13d
> > > +#define r13d_32        r13d
> > > +#define r13w_32        r13d
> > > +#define r13b_32        r13d
> > > +#define r13_64 r13
> > > +#define r13d_64        r13
> > > +#define r13w_64        r13
> > > +#define r13b_64        r13
> > > +#define r14_8  r14b
> > > +#define r14d_8 r14b
> > > +#define r14w_8 r14b
> > > +#define r14b_8 r14b
> > > +#define r14_16 r14w
> > > +#define r14d_16        r14w
> > > +#define r14w_16        r14w
> > > +#define r14b_16        r14w
> > > +#define r14_32 r14d
> > > +#define r14d_32        r14d
> > > +#define r14w_32        r14d
> > > +#define r14b_32        r14d
> > > +#define r14_64 r14
> > > +#define r14d_64        r14
> > > +#define r14w_64        r14
> > > +#define r14b_64        r14
> > > +#define r15_8  r15b
> > > +#define r15d_8 r15b
> > > +#define r15w_8 r15b
> > > +#define r15b_8 r15b
> > > +#define r15_16 r15w
> > > +#define r15d_16        r15w
> > > +#define r15w_16        r15w
> > > +#define r15b_16        r15w
> > > +#define r15_32 r15d
> > > +#define r15d_32        r15d
> > > +#define r15w_32        r15d
> > > +#define r15b_32        r15d
> > > +#define r15_64 r15
> > > +#define r15d_64        r15
> > > +#define r15w_64        r15
> > > +#define r15b_64        r15
> > > +
> > > +#define VRAX   VGPR(rax)
> > > +#define VRBX   VGPR(rbx)
> > > +#define VRCX   VGPR(rcx)
> > > +#define VRDX   VGPR(rdx)
> > > +#define VRBP   VGPR(rbp)
> > > +#define VRSP   VGPR(rsp)
> > > +#define VRSI   VGPR(rsi)
> > > +#define VRDI   VGPR(rdi)
> > > +#define VR8    VGPR(r8)
> > > +#define VR9    VGPR(r9)
> > > +#define VR10   VGPR(r10)
> > > +#define VR11   VGPR(r11)
> > > +#define VR12   VGPR(r12)
> > > +#define VR13   VGPR(r13)
> > > +#define VR14   VGPR(r14)
> > > +#define VR15   VGPR(r15)
> > > +
> > > +#define kmov_8 kmovb
> > > +#define kmov_16        kmovw
> > > +#define kmov_32        kmovd
> > > +#define kmov_64        kmovq
> >
> > Only 32 and 64 are needed.
>
> Thats not entirely true for the wide-char impls.
> >
> > > +#define kortest_8      kortestb
> > > +#define kortest_16     kortestw
> > > +#define kortest_32     kortestd
> > > +#define kortest_64     kortestq
> > > +#define kor_8  korb
> > > +#define kor_16 korw
> > > +#define kor_32 kord
> > > +#define kor_64 korq
> > > +#define ktest_8        ktestb
> > > +#define ktest_16       ktestw
> > > +#define ktest_32       ktestd
> > > +#define ktest_64       ktestq
> > > +#define kand_8 kandb
> > > +#define kand_16        kandw
> > > +#define kand_32        kandd
> > > +#define kand_64        kandq
> > > +#define kxor_8 kxorb
> > > +#define kxor_16        kxorw
> > > +#define kxor_32        kxord
> > > +#define kxor_64        kxorq
> > > +
> > > +#define kmovV  VKINSN_SZ(kmov, REG_WIDTH)
> > > +#define kortestV       VKINSN_SZ(kortest, REG_WIDTH)
> > > +#define korV   VKINSN_SZ(kor, REG_WIDTH)
> > > +#define ktestV         VKINSN_SZ(ktest, REG_WIDTH)
> > > +#define kandV  VKINSN_SZ(kand, REG_WIDTH)
> > > +#define kxorV  VKINSN_SZ(kxor, REG_WIDTH)
> >
> > #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH)
>
> Will fix for V5.
> >
> > > +
> > > +#ifndef REG_WIDTH
> > > +#define REG_WIDTH VEC_SIZE
> >
> > Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH
> > can be dropped.
>
> Thats not quite true.
>
> For wide-char impls REG_WIDTH != VEC_SIZE.

These register macros are used to operate on vectors.  Do you have
an example of REG_WIDTH != VEC_SIZE?

> >
> > > +#endif
> > > +#define PRIM_VGPR_SZ(reg_name, reg_size)       reg_name##_##reg_size
> >
> > This is used for both register and instruction.  How about
> >
> > #define VPASTER(x,y) x##_##y
>
> Will fix for V5.
> >
> >
> > > +#define VGPR_SZ(reg_name, reg_size)    PRIM_VGPR_SZ(reg_name, reg_size)
> > > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> > > +#define VKINSN_SZ(insn, reg_size)      PRIM_VGPR_SZ(insn, reg_size)
> >
> > No need for both VGPR_SZ and VKINSN_SZ.  How about
> >
> > #define VEVALUATOR(x,y) VPASTER(x,y)
>
> Will change for V5.
> >
> > > +
> > > +#endif
> > > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> > > new file mode 100644
> > > index 0000000000..5b04e89ecb
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> > > @@ -0,0 +1,97 @@
> > > +#!/usr/bin/python3
> > > +# Copyright (C) 2022 Free Software Foundation, Inc.
> > > +# This file is part of the GNU C Library.
> > > +#
> > > +# The GNU C Library is free software; you can redistribute it and/or
> > > +# modify it under the terms of the GNU Lesser General Public
> > > +# License as published by the Free Software Foundation; either
> > > +# version 2.1 of the License, or (at your option) any later version.
> > > +#
> > > +# The GNU C Library is distributed in the hope that it will be useful,
> > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +# Lesser General Public License for more details.
> > > +#
> > > +# You should have received a copy of the GNU Lesser General Public
> > > +# License along with the GNU C Library; if not, see
> > > +# <https://www.gnu.org/licenses/>.
> > > +"""Generate macros for getting GPR name of a certain size
> > > +
> > > +Inputs: None
> > > +Output: Prints header fill to stdout
> > > +
> > > +API:
> > > +    VGPR(reg_name)
> > > +        - Get register name VEC_SIZE component of `reg_name`
> > > +    VGPR_SZ(reg_name, reg_size)
> > > +        - Get register name `reg_size` component of `reg_name`
> > > +"""
> > > +
> > > +import sys
> > > +from datetime import datetime
> > > +
> > > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> > > +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> > > +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> > > +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> > > +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> > > +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> > > +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> > > +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> > > +
> > > +mask_insns = ["kmov", "kortest", "kor", "ktest", "kand", "kxor"]
> > > +mask_insns_ext = ["b", "w", "d", "q"]
> > > +
> > > +cr = """
> > > +   Copyright (C) {} Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +"""
> > > +
> > > +print("/* This file was generated by: {}.".format(sys.argv[0]))
> > > +print(cr.format(datetime.today().year))
> > > +
> > > +print("#ifndef _REG_MAP_MACROS_H")
> > > +print("#define _REG_MAP_MACROS_H\t1\n")
> > > +for reg in registers:
> > > +    for i in range(0, 4):
> > > +        for j in range(0, 4):
> > > +            print("#define {}_{}\t{}".format(reg[j], 8 << i, reg[3 - i]))
> > > +
> > > +print("")
> > > +for reg in registers:
> > > +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> > > +
> > > +print("")
> > > +for mask_insn in mask_insns:
> > > +    for i in range(0, 4):
> > > +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> > > +                                           mask_insns_ext[i]))
> > > +
> > > +print("")
> > > +for mask_insn in mask_insns:
> > > +    print("#define {}V \tVKINSN_SZ({}, REG_WIDTH)".format(mask_insn, mask_insn))
> > > +print("")
> > > +
> > > +print("#ifndef REG_WIDTH")
> > > +print("#define REG_WIDTH VEC_SIZE")
> > > +print("#endif")
> > > +print("#define PRIM_VGPR_SZ(reg_name, reg_size)\treg_name##_##reg_size")
> > > +print("#define VGPR_SZ(reg_name, reg_size)\tPRIM_VGPR_SZ(reg_name, reg_size)")
> > > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> > > +print("#define VKINSN_SZ(insn, reg_size)\tPRIM_VGPR_SZ(insn, reg_size)")
> > > +
> > > +print("\n#endif")
> > > --
> > > 2.34.1
> > >
> >
> >
> > --
> > H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 18:35       ` H.J. Lu
@ 2022-10-14 18:38         ` Noah Goldstein
  2022-10-14 18:53           ` H.J. Lu
  0 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 18:38 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 1:35 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Oct 14, 2022 at 11:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > >  On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > This is to make it easier to do think like:
> > > > ```
> > > > vpcmpb %VEC(0), %VEC(1), %k0
> > > > kmov{d|q} %k0, %{eax|rax}
> > > > test %{eax|rax}
> > > > ```
> > >
> > > Since all these register macros are based on VEC_SIZE which is either 32
> > > bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are
> > > needed.  8-bit and 16-bit macros aren't needed.
> > >
> > > > It adds macro s.t any GPR can get the proper width with:
> > > >     `V{upper_case_GPR_name}`
> > > >
> > > > and any mask insn can get the proper width with:
> > > >     `{mask_insn_without_postfix}V`
> > >
> > > All macros should be in upper cases.
> > >
> > > > This commit does not change libc.so
> > > >
> > > > Tested build on x86-64
> > > > ---
> > > >  sysdeps/x86_64/multiarch/reg-macros.h         | 337 ++++++++++++++++++
> > > >  .../multiarch/scripts/gen-reg-map-macros.py   |  97 +++++
> > > >  2 files changed, 434 insertions(+)
> > > >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> > > >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > new file mode 100644
> > > > index 0000000000..c4d7f57b66
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> > >
> > > vreg-macros.h to indicate macros based on vector size.   Please
> > > add comments to indicate that register macros are expanded based
> > > on vector size.
> > >
> > > > @@ -0,0 +1,337 @@
> > > > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py.
> > > > +
> > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#ifndef _REG_MAP_MACROS_H
> > > > +#define _REG_MAP_MACROS_H      1
> > > > +
> > > > +#define rax_8  al
> > > > +#define eax_8  al
> > > > +#define ax_8   al
> > > > +#define al_8   al
> > > > +#define rax_16 ax
> > > > +#define eax_16 ax
> > > > +#define ax_16  ax
> > > > +#define al_16  ax
> > > > +#define rax_32 eax
> > > > +#define eax_32 eax
> > > > +#define ax_32  eax
> > > > +#define al_32  eax
> > > > +#define rax_64 rax
> > > > +#define eax_64 rax
> > > > +#define ax_64  rax
> > > > +#define al_64  rax
> > >
> > > Only rax_32 and rax_64 are needed.
> > >
> > > > +#define rbx_8  bl
> > > > +#define ebx_8  bl
> > > > +#define bx_8   bl
> > > > +#define bl_8   bl
> > > > +#define rbx_16 bx
> > > > +#define ebx_16 bx
> > > > +#define bx_16  bx
> > > > +#define bl_16  bx
> > > > +#define rbx_32 ebx
> > > > +#define ebx_32 ebx
> > > > +#define bx_32  ebx
> > > > +#define bl_32  ebx
> > > > +#define rbx_64 rbx
> > > > +#define ebx_64 rbx
> > > > +#define bx_64  rbx
> > > > +#define bl_64  rbx
> > > > +#define rcx_8  cl
> > > > +#define ecx_8  cl
> > > > +#define cx_8   cl
> > > > +#define cl_8   cl
> > > > +#define rcx_16 cx
> > > > +#define ecx_16 cx
> > > > +#define cx_16  cx
> > > > +#define cl_16  cx
> > > > +#define rcx_32 ecx
> > > > +#define ecx_32 ecx
> > > > +#define cx_32  ecx
> > > > +#define cl_32  ecx
> > > > +#define rcx_64 rcx
> > > > +#define ecx_64 rcx
> > > > +#define cx_64  rcx
> > > > +#define cl_64  rcx
> > > > +#define rdx_8  dl
> > > > +#define edx_8  dl
> > > > +#define dx_8   dl
> > > > +#define dl_8   dl
> > > > +#define rdx_16 dx
> > > > +#define edx_16 dx
> > > > +#define dx_16  dx
> > > > +#define dl_16  dx
> > > > +#define rdx_32 edx
> > > > +#define edx_32 edx
> > > > +#define dx_32  edx
> > > > +#define dl_32  edx
> > > > +#define rdx_64 rdx
> > > > +#define edx_64 rdx
> > > > +#define dx_64  rdx
> > > > +#define dl_64  rdx
> > > > +#define rbp_8  bpl
> > > > +#define ebp_8  bpl
> > > > +#define bp_8   bpl
> > > > +#define bpl_8  bpl
> > > > +#define rbp_16 bp
> > > > +#define ebp_16 bp
> > > > +#define bp_16  bp
> > > > +#define bpl_16 bp
> > > > +#define rbp_32 ebp
> > > > +#define ebp_32 ebp
> > > > +#define bp_32  ebp
> > > > +#define bpl_32 ebp
> > > > +#define rbp_64 rbp
> > > > +#define ebp_64 rbp
> > > > +#define bp_64  rbp
> > > > +#define bpl_64 rbp
> > > > +#define rsp_8  spl
> > > > +#define esp_8  spl
> > > > +#define sp_8   spl
> > > > +#define spl_8  spl
> > > > +#define rsp_16 sp
> > > > +#define esp_16 sp
> > > > +#define sp_16  sp
> > > > +#define spl_16 sp
> > > > +#define rsp_32 esp
> > > > +#define esp_32 esp
> > > > +#define sp_32  esp
> > > > +#define spl_32 esp
> > > > +#define rsp_64 rsp
> > > > +#define esp_64 rsp
> > > > +#define sp_64  rsp
> > > > +#define spl_64 rsp
> > > > +#define rsi_8  sil
> > > > +#define esi_8  sil
> > > > +#define si_8   sil
> > > > +#define sil_8  sil
> > > > +#define rsi_16 si
> > > > +#define esi_16 si
> > > > +#define si_16  si
> > > > +#define sil_16 si
> > > > +#define rsi_32 esi
> > > > +#define esi_32 esi
> > > > +#define si_32  esi
> > > > +#define sil_32 esi
> > > > +#define rsi_64 rsi
> > > > +#define esi_64 rsi
> > > > +#define si_64  rsi
> > > > +#define sil_64 rsi
> > > > +#define rdi_8  dil
> > > > +#define edi_8  dil
> > > > +#define di_8   dil
> > > > +#define dil_8  dil
> > > > +#define rdi_16 di
> > > > +#define edi_16 di
> > > > +#define di_16  di
> > > > +#define dil_16 di
> > > > +#define rdi_32 edi
> > > > +#define edi_32 edi
> > > > +#define di_32  edi
> > > > +#define dil_32 edi
> > > > +#define rdi_64 rdi
> > > > +#define edi_64 rdi
> > > > +#define di_64  rdi
> > > > +#define dil_64 rdi
> > > > +#define r8_8   r8b
> > > > +#define r8d_8  r8b
> > > > +#define r8w_8  r8b
> > > > +#define r8b_8  r8b
> > > > +#define r8_16  r8w
> > > > +#define r8d_16 r8w
> > > > +#define r8w_16 r8w
> > > > +#define r8b_16 r8w
> > > > +#define r8_32  r8d
> > > > +#define r8d_32 r8d
> > > > +#define r8w_32 r8d
> > > > +#define r8b_32 r8d
> > > > +#define r8_64  r8
> > > > +#define r8d_64 r8
> > > > +#define r8w_64 r8
> > > > +#define r8b_64 r8
> > > > +#define r9_8   r9b
> > > > +#define r9d_8  r9b
> > > > +#define r9w_8  r9b
> > > > +#define r9b_8  r9b
> > > > +#define r9_16  r9w
> > > > +#define r9d_16 r9w
> > > > +#define r9w_16 r9w
> > > > +#define r9b_16 r9w
> > > > +#define r9_32  r9d
> > > > +#define r9d_32 r9d
> > > > +#define r9w_32 r9d
> > > > +#define r9b_32 r9d
> > > > +#define r9_64  r9
> > > > +#define r9d_64 r9
> > > > +#define r9w_64 r9
> > > > +#define r9b_64 r9
> > > > +#define r10_8  r10b
> > > > +#define r10d_8 r10b
> > > > +#define r10w_8 r10b
> > > > +#define r10b_8 r10b
> > > > +#define r10_16 r10w
> > > > +#define r10d_16        r10w
> > > > +#define r10w_16        r10w
> > > > +#define r10b_16        r10w
> > > > +#define r10_32 r10d
> > > > +#define r10d_32        r10d
> > > > +#define r10w_32        r10d
> > > > +#define r10b_32        r10d
> > > > +#define r10_64 r10
> > > > +#define r10d_64        r10
> > > > +#define r10w_64        r10
> > > > +#define r10b_64        r10
> > > > +#define r11_8  r11b
> > > > +#define r11d_8 r11b
> > > > +#define r11w_8 r11b
> > > > +#define r11b_8 r11b
> > > > +#define r11_16 r11w
> > > > +#define r11d_16        r11w
> > > > +#define r11w_16        r11w
> > > > +#define r11b_16        r11w
> > > > +#define r11_32 r11d
> > > > +#define r11d_32        r11d
> > > > +#define r11w_32        r11d
> > > > +#define r11b_32        r11d
> > > > +#define r11_64 r11
> > > > +#define r11d_64        r11
> > > > +#define r11w_64        r11
> > > > +#define r11b_64        r11
> > > > +#define r12_8  r12b
> > > > +#define r12d_8 r12b
> > > > +#define r12w_8 r12b
> > > > +#define r12b_8 r12b
> > > > +#define r12_16 r12w
> > > > +#define r12d_16        r12w
> > > > +#define r12w_16        r12w
> > > > +#define r12b_16        r12w
> > > > +#define r12_32 r12d
> > > > +#define r12d_32        r12d
> > > > +#define r12w_32        r12d
> > > > +#define r12b_32        r12d
> > > > +#define r12_64 r12
> > > > +#define r12d_64        r12
> > > > +#define r12w_64        r12
> > > > +#define r12b_64        r12
> > > > +#define r13_8  r13b
> > > > +#define r13d_8 r13b
> > > > +#define r13w_8 r13b
> > > > +#define r13b_8 r13b
> > > > +#define r13_16 r13w
> > > > +#define r13d_16        r13w
> > > > +#define r13w_16        r13w
> > > > +#define r13b_16        r13w
> > > > +#define r13_32 r13d
> > > > +#define r13d_32        r13d
> > > > +#define r13w_32        r13d
> > > > +#define r13b_32        r13d
> > > > +#define r13_64 r13
> > > > +#define r13d_64        r13
> > > > +#define r13w_64        r13
> > > > +#define r13b_64        r13
> > > > +#define r14_8  r14b
> > > > +#define r14d_8 r14b
> > > > +#define r14w_8 r14b
> > > > +#define r14b_8 r14b
> > > > +#define r14_16 r14w
> > > > +#define r14d_16        r14w
> > > > +#define r14w_16        r14w
> > > > +#define r14b_16        r14w
> > > > +#define r14_32 r14d
> > > > +#define r14d_32        r14d
> > > > +#define r14w_32        r14d
> > > > +#define r14b_32        r14d
> > > > +#define r14_64 r14
> > > > +#define r14d_64        r14
> > > > +#define r14w_64        r14
> > > > +#define r14b_64        r14
> > > > +#define r15_8  r15b
> > > > +#define r15d_8 r15b
> > > > +#define r15w_8 r15b
> > > > +#define r15b_8 r15b
> > > > +#define r15_16 r15w
> > > > +#define r15d_16        r15w
> > > > +#define r15w_16        r15w
> > > > +#define r15b_16        r15w
> > > > +#define r15_32 r15d
> > > > +#define r15d_32        r15d
> > > > +#define r15w_32        r15d
> > > > +#define r15b_32        r15d
> > > > +#define r15_64 r15
> > > > +#define r15d_64        r15
> > > > +#define r15w_64        r15
> > > > +#define r15b_64        r15
> > > > +
> > > > +#define VRAX   VGPR(rax)
> > > > +#define VRBX   VGPR(rbx)
> > > > +#define VRCX   VGPR(rcx)
> > > > +#define VRDX   VGPR(rdx)
> > > > +#define VRBP   VGPR(rbp)
> > > > +#define VRSP   VGPR(rsp)
> > > > +#define VRSI   VGPR(rsi)
> > > > +#define VRDI   VGPR(rdi)
> > > > +#define VR8    VGPR(r8)
> > > > +#define VR9    VGPR(r9)
> > > > +#define VR10   VGPR(r10)
> > > > +#define VR11   VGPR(r11)
> > > > +#define VR12   VGPR(r12)
> > > > +#define VR13   VGPR(r13)
> > > > +#define VR14   VGPR(r14)
> > > > +#define VR15   VGPR(r15)
> > > > +
> > > > +#define kmov_8 kmovb
> > > > +#define kmov_16        kmovw
> > > > +#define kmov_32        kmovd
> > > > +#define kmov_64        kmovq
> > >
> > > Only 32 and 64 are needed.
> >
> > Thats not entirely true for the wide-char impls.
> > >
> > > > +#define kortest_8      kortestb
> > > > +#define kortest_16     kortestw
> > > > +#define kortest_32     kortestd
> > > > +#define kortest_64     kortestq
> > > > +#define kor_8  korb
> > > > +#define kor_16 korw
> > > > +#define kor_32 kord
> > > > +#define kor_64 korq
> > > > +#define ktest_8        ktestb
> > > > +#define ktest_16       ktestw
> > > > +#define ktest_32       ktestd
> > > > +#define ktest_64       ktestq
> > > > +#define kand_8 kandb
> > > > +#define kand_16        kandw
> > > > +#define kand_32        kandd
> > > > +#define kand_64        kandq
> > > > +#define kxor_8 kxorb
> > > > +#define kxor_16        kxorw
> > > > +#define kxor_32        kxord
> > > > +#define kxor_64        kxorq
> > > > +
> > > > +#define kmovV  VKINSN_SZ(kmov, REG_WIDTH)
> > > > +#define kortestV       VKINSN_SZ(kortest, REG_WIDTH)
> > > > +#define korV   VKINSN_SZ(kor, REG_WIDTH)
> > > > +#define ktestV         VKINSN_SZ(ktest, REG_WIDTH)
> > > > +#define kandV  VKINSN_SZ(kand, REG_WIDTH)
> > > > +#define kxorV  VKINSN_SZ(kxor, REG_WIDTH)
> > >
> > > #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH)
> >
> > Will fix for V5.
> > >
> > > > +
> > > > +#ifndef REG_WIDTH
> > > > +#define REG_WIDTH VEC_SIZE
> > >
> > > Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH
> > > can be dropped.
> >
> > Thats not quite true.
> >
> > For wide-char impls REG_WIDTH != VEC_SIZE.
>
> These register macros are used to operate vectors.  Do you have
> an example of REG_WIDTH != VEC_SIZE?

But since the wide-char implementations use 32-bit comparisons, the
resulting mask is < 64 bits, i.e.:

vpcmpd %zmm16, %zmm17, %k0
kmovd %k0, %eax

will collect all the necessary bits and is preferred.

The next version of Sunil's memchr-evex512 should have it.
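
For instance, a wide-char evex512 file could pick the narrower GPR
width by defining REG_WIDTH before the includes.  A rough sketch (not
from any posted patch; macro and file names as in this version of the
series):

```
#define REG_WIDTH 32	/* vpcmpd on a 64-byte vector sets at most
			   16 mask bits, so 32-bit GPRs suffice.  */
#include "evex512-vecs.h"	/* assumed to provide VEC_SIZE == 64 */
#include "reg-macros.h"

	vpcmpd	$0, (%rdi), %VEC(0), %k0
	kmovV	%k0, %VRAX	/* expands to: kmovd %k0, %eax */
```

So VEC_SIZE stays 64 for the vector work while the mask/GPR side is
32-bit.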

>
> > >
> > > > +#endif
> > > > +#define PRIM_VGPR_SZ(reg_name, reg_size)       reg_name##_##reg_size
> > >
> > > This is used for both register and instruction.  How about
> > >
> > > #define VPASTER(x,y) x##_##y
> >
> > Will fix for V5.
> > >
> > >
> > > > +#define VGPR_SZ(reg_name, reg_size)    PRIM_VGPR_SZ(reg_name, reg_size)
> > > > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> > > > +#define VKINSN_SZ(insn, reg_size)      PRIM_VGPR_SZ(insn, reg_size)
> > >
> > > No need for both VGPR_SZ and VKINSN_SZ.  How about
> > >
> > > #define VEVALUATOR(x,y) VPASTER(x,y)
> >
> > Will change for V5.
> > >
> > > > +
> > > > +#endif
> > > > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> > > > new file mode 100644
> > > > index 0000000000..5b04e89ecb
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> > > > @@ -0,0 +1,97 @@
> > > > +#!/usr/bin/python3
> > > > +# Copyright (C) 2022 Free Software Foundation, Inc.
> > > > +# This file is part of the GNU C Library.
> > > > +#
> > > > +# The GNU C Library is free software; you can redistribute it and/or
> > > > +# modify it under the terms of the GNU Lesser General Public
> > > > +# License as published by the Free Software Foundation; either
> > > > +# version 2.1 of the License, or (at your option) any later version.
> > > > +#
> > > > +# The GNU C Library is distributed in the hope that it will be useful,
> > > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +# Lesser General Public License for more details.
> > > > +#
> > > > +# You should have received a copy of the GNU Lesser General Public
> > > > +# License along with the GNU C Library; if not, see
> > > > +# <https://www.gnu.org/licenses/>.
> > > > +"""Generate macros for getting GPR name of a certain size
> > > > +
> > > > +Inputs: None
> > > > +Output: Prints header fill to stdout
> > > > +
> > > > +API:
> > > > +    VGPR(reg_name)
> > > > +        - Get register name VEC_SIZE component of `reg_name`
> > > > +    VGPR_SZ(reg_name, reg_size)
> > > > +        - Get register name `reg_size` component of `reg_name`
> > > > +"""
> > > > +
> > > > +import sys
> > > > +from datetime import datetime
> > > > +
> > > > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> > > > +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> > > > +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> > > > +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> > > > +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> > > > +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> > > > +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> > > > +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> > > > +
> > > > +mask_insns = ["kmov", "kortest", "kor", "ktest", "kand", "kxor"]
> > > > +mask_insns_ext = ["b", "w", "d", "q"]
> > > > +
> > > > +cr = """
> > > > +   Copyright (C) {} Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +"""
> > > > +
> > > > +print("/* This file was generated by: {}.".format(sys.argv[0]))
> > > > +print(cr.format(datetime.today().year))
> > > > +
> > > > +print("#ifndef _REG_MAP_MACROS_H")
> > > > +print("#define _REG_MAP_MACROS_H\t1\n")
> > > > +for reg in registers:
> > > > +    for i in range(0, 4):
> > > > +        for j in range(0, 4):
> > > > +            print("#define {}_{}\t{}".format(reg[j], 8 << i, reg[3 - i]))
> > > > +
> > > > +print("")
> > > > +for reg in registers:
> > > > +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> > > > +
> > > > +print("")
> > > > +for mask_insn in mask_insns:
> > > > +    for i in range(0, 4):
> > > > +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> > > > +                                           mask_insns_ext[i]))
> > > > +
> > > > +print("")
> > > > +for mask_insn in mask_insns:
> > > > +    print("#define {}V \tVKINSN_SZ({}, REG_WIDTH)".format(mask_insn, mask_insn))
> > > > +print("")
> > > > +
> > > > +print("#ifndef REG_WIDTH")
> > > > +print("#define REG_WIDTH VEC_SIZE")
> > > > +print("#endif")
> > > > +print("#define PRIM_VGPR_SZ(reg_name, reg_size)\treg_name##_##reg_size")
> > > > +print("#define VGPR_SZ(reg_name, reg_size)\tPRIM_VGPR_SZ(reg_name, reg_size)")
> > > > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> > > > +print("#define VKINSN_SZ(insn, reg_size)\tPRIM_VGPR_SZ(insn, reg_size)")
> > > > +
> > > > +print("\n#endif")
> > > > --
> > > > 2.34.1
> > > >
> > >
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v4 1/3] x86: Update evex256/512 vec macros
  2022-10-14 16:40 [PATCH v1 1/3] x86: Update evex256/512 vec macros Noah Goldstein
                   ` (4 preceding siblings ...)
  2022-10-14 18:22 ` [PATCH v3 1/3] x86: Update evex256/512 vec macros Noah Goldstein
@ 2022-10-14 18:41 ` Noah Goldstein
  2022-10-14 18:41   ` [PATCH v4 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
  2022-10-14 18:41   ` [PATCH v4 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
  2022-10-14 21:14 ` [PATCH v5 1/3] x86: Update evex256/512 vec macros Noah Goldstein
                   ` (4 subsequent siblings)
  10 siblings, 2 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 18:41 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

1) Only define SECTION if there is not a previous definition.
2) Add a `VEC_lo` definition for the proper reg-width but in the
   ymm/zmm0-15 range.
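
For illustration, a sketch of the intended expansions (the exact
register numbers are an assumption based on evex-vecs-common.h):
```
vmovdqa64 (%rdi), %VEC(1)	/* with evex512-vecs.h: zmm17, EVEX-only 16-31 range */
vmovdqa64 (%rdi), %VEC_lo(1)	/* e.g. zmm1: same width, ymm/zmm0-15 range */
```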

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/evex256-vecs.h | 7 +++++--
 sysdeps/x86_64/multiarch/evex512-vecs.h | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
index 222ba46dc7..4fccabd4b8 100644
--- a/sysdeps/x86_64/multiarch/evex256-vecs.h
+++ b/sysdeps/x86_64/multiarch/evex256-vecs.h
@@ -28,8 +28,11 @@
 #include "evex-vecs-common.h"
 
 #define USE_WITH_EVEX256	1
-#define SECTION(p)			p##.evex
 
-#define VEC					VEC_ymm
+#ifndef SECTION
+# define SECTION(p)			p##.evex
+#endif
 
+#define VEC					VEC_ymm
+#define VEC_lo				VEC_any_ymm
 #endif
diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
index d1784d5368..fecc2d3925 100644
--- a/sysdeps/x86_64/multiarch/evex512-vecs.h
+++ b/sysdeps/x86_64/multiarch/evex512-vecs.h
@@ -28,8 +28,11 @@
 #include "evex-vecs-common.h"
 
 #define USE_WITH_EVEX512	1
-#define SECTION(p)			p##.evex512
 
-#define VEC					VEC_zmm
+#ifndef SECTION
+# define SECTION(p)			p##.evex512
+#endif
 
+#define VEC					VEC_zmm
+#define VEC_lo				VEC_any_zmm
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v4 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 18:41 ` [PATCH v4 1/3] x86: Update evex256/512 vec macros Noah Goldstein
@ 2022-10-14 18:41   ` Noah Goldstein
  2022-10-14 18:41   ` [PATCH v4 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
  1 sibling, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 18:41 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

This is to make it easier to do things like:
```
vpcmpb %VEC(0), %VEC(1), %k0
kmov{d|q} %k0, %{eax|rax}
test %{eax|rax}
```

It adds macros s.t. any GPR can get the proper width with:
    `V{upper_case_GPR_name}`

and any mask insn can get the proper width with:
    `{upper_case_mask_insn_without_postfix}`
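
For example, the snippet above can then be written size-agnostically
(a sketch; VEC_SIZE / REG_WIDTH are supplied by the including file):
```
vpcmpb %VEC(0), %VEC(1), %k0
KMOV %k0, %VRAX
test %VRAX, %VRAX
```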

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/reg-macros.h         | 358 ++++++++++++++++++
 .../multiarch/scripts/gen-reg-macros.py       | 124 ++++++
 2 files changed, 482 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
 create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py

diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
new file mode 100644
index 0000000000..e5337bbd40
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/reg-macros.h
@@ -0,0 +1,358 @@
+/* This file was generated by: gen-reg-macros.py.
+
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _REG_MACROS_H
+#define _REG_MACROS_H	1
+
+#define rax_8	al
+#define eax_8	al
+#define ax_8	al
+#define al_8	al
+#define rax_16	ax
+#define eax_16	ax
+#define ax_16	ax
+#define al_16	ax
+#define rax_32	eax
+#define eax_32	eax
+#define ax_32	eax
+#define al_32	eax
+#define rax_64	rax
+#define eax_64	rax
+#define ax_64	rax
+#define al_64	rax
+#define rbx_8	bl
+#define ebx_8	bl
+#define bx_8	bl
+#define bl_8	bl
+#define rbx_16	bx
+#define ebx_16	bx
+#define bx_16	bx
+#define bl_16	bx
+#define rbx_32	ebx
+#define ebx_32	ebx
+#define bx_32	ebx
+#define bl_32	ebx
+#define rbx_64	rbx
+#define ebx_64	rbx
+#define bx_64	rbx
+#define bl_64	rbx
+#define rcx_8	cl
+#define ecx_8	cl
+#define cx_8	cl
+#define cl_8	cl
+#define rcx_16	cx
+#define ecx_16	cx
+#define cx_16	cx
+#define cl_16	cx
+#define rcx_32	ecx
+#define ecx_32	ecx
+#define cx_32	ecx
+#define cl_32	ecx
+#define rcx_64	rcx
+#define ecx_64	rcx
+#define cx_64	rcx
+#define cl_64	rcx
+#define rdx_8	dl
+#define edx_8	dl
+#define dx_8	dl
+#define dl_8	dl
+#define rdx_16	dx
+#define edx_16	dx
+#define dx_16	dx
+#define dl_16	dx
+#define rdx_32	edx
+#define edx_32	edx
+#define dx_32	edx
+#define dl_32	edx
+#define rdx_64	rdx
+#define edx_64	rdx
+#define dx_64	rdx
+#define dl_64	rdx
+#define rbp_8	bpl
+#define ebp_8	bpl
+#define bp_8	bpl
+#define bpl_8	bpl
+#define rbp_16	bp
+#define ebp_16	bp
+#define bp_16	bp
+#define bpl_16	bp
+#define rbp_32	ebp
+#define ebp_32	ebp
+#define bp_32	ebp
+#define bpl_32	ebp
+#define rbp_64	rbp
+#define ebp_64	rbp
+#define bp_64	rbp
+#define bpl_64	rbp
+#define rsp_8	spl
+#define esp_8	spl
+#define sp_8	spl
+#define spl_8	spl
+#define rsp_16	sp
+#define esp_16	sp
+#define sp_16	sp
+#define spl_16	sp
+#define rsp_32	esp
+#define esp_32	esp
+#define sp_32	esp
+#define spl_32	esp
+#define rsp_64	rsp
+#define esp_64	rsp
+#define sp_64	rsp
+#define spl_64	rsp
+#define rsi_8	sil
+#define esi_8	sil
+#define si_8	sil
+#define sil_8	sil
+#define rsi_16	si
+#define esi_16	si
+#define si_16	si
+#define sil_16	si
+#define rsi_32	esi
+#define esi_32	esi
+#define si_32	esi
+#define sil_32	esi
+#define rsi_64	rsi
+#define esi_64	rsi
+#define si_64	rsi
+#define sil_64	rsi
+#define rdi_8	dil
+#define edi_8	dil
+#define di_8	dil
+#define dil_8	dil
+#define rdi_16	di
+#define edi_16	di
+#define di_16	di
+#define dil_16	di
+#define rdi_32	edi
+#define edi_32	edi
+#define di_32	edi
+#define dil_32	edi
+#define rdi_64	rdi
+#define edi_64	rdi
+#define di_64	rdi
+#define dil_64	rdi
+#define r8_8	r8b
+#define r8d_8	r8b
+#define r8w_8	r8b
+#define r8b_8	r8b
+#define r8_16	r8w
+#define r8d_16	r8w
+#define r8w_16	r8w
+#define r8b_16	r8w
+#define r8_32	r8d
+#define r8d_32	r8d
+#define r8w_32	r8d
+#define r8b_32	r8d
+#define r8_64	r8
+#define r8d_64	r8
+#define r8w_64	r8
+#define r8b_64	r8
+#define r9_8	r9b
+#define r9d_8	r9b
+#define r9w_8	r9b
+#define r9b_8	r9b
+#define r9_16	r9w
+#define r9d_16	r9w
+#define r9w_16	r9w
+#define r9b_16	r9w
+#define r9_32	r9d
+#define r9d_32	r9d
+#define r9w_32	r9d
+#define r9b_32	r9d
+#define r9_64	r9
+#define r9d_64	r9
+#define r9w_64	r9
+#define r9b_64	r9
+#define r10_8	r10b
+#define r10d_8	r10b
+#define r10w_8	r10b
+#define r10b_8	r10b
+#define r10_16	r10w
+#define r10d_16	r10w
+#define r10w_16	r10w
+#define r10b_16	r10w
+#define r10_32	r10d
+#define r10d_32	r10d
+#define r10w_32	r10d
+#define r10b_32	r10d
+#define r10_64	r10
+#define r10d_64	r10
+#define r10w_64	r10
+#define r10b_64	r10
+#define r11_8	r11b
+#define r11d_8	r11b
+#define r11w_8	r11b
+#define r11b_8	r11b
+#define r11_16	r11w
+#define r11d_16	r11w
+#define r11w_16	r11w
+#define r11b_16	r11w
+#define r11_32	r11d
+#define r11d_32	r11d
+#define r11w_32	r11d
+#define r11b_32	r11d
+#define r11_64	r11
+#define r11d_64	r11
+#define r11w_64	r11
+#define r11b_64	r11
+#define r12_8	r12b
+#define r12d_8	r12b
+#define r12w_8	r12b
+#define r12b_8	r12b
+#define r12_16	r12w
+#define r12d_16	r12w
+#define r12w_16	r12w
+#define r12b_16	r12w
+#define r12_32	r12d
+#define r12d_32	r12d
+#define r12w_32	r12d
+#define r12b_32	r12d
+#define r12_64	r12
+#define r12d_64	r12
+#define r12w_64	r12
+#define r12b_64	r12
+#define r13_8	r13b
+#define r13d_8	r13b
+#define r13w_8	r13b
+#define r13b_8	r13b
+#define r13_16	r13w
+#define r13d_16	r13w
+#define r13w_16	r13w
+#define r13b_16	r13w
+#define r13_32	r13d
+#define r13d_32	r13d
+#define r13w_32	r13d
+#define r13b_32	r13d
+#define r13_64	r13
+#define r13d_64	r13
+#define r13w_64	r13
+#define r13b_64	r13
+#define r14_8	r14b
+#define r14d_8	r14b
+#define r14w_8	r14b
+#define r14b_8	r14b
+#define r14_16	r14w
+#define r14d_16	r14w
+#define r14w_16	r14w
+#define r14b_16	r14w
+#define r14_32	r14d
+#define r14d_32	r14d
+#define r14w_32	r14d
+#define r14b_32	r14d
+#define r14_64	r14
+#define r14d_64	r14
+#define r14w_64	r14
+#define r14b_64	r14
+#define r15_8	r15b
+#define r15d_8	r15b
+#define r15w_8	r15b
+#define r15b_8	r15b
+#define r15_16	r15w
+#define r15d_16	r15w
+#define r15w_16	r15w
+#define r15b_16	r15w
+#define r15_32	r15d
+#define r15d_32	r15d
+#define r15w_32	r15d
+#define r15b_32	r15d
+#define r15_64	r15
+#define r15d_64	r15
+#define r15w_64	r15
+#define r15b_64	r15
+
+#define kmov_8	kmovb
+#define kmov_16	kmovw
+#define kmov_32	kmovd
+#define kmov_64	kmovq
+#define kortest_8	kortestb
+#define kortest_16	kortestw
+#define kortest_32	kortestd
+#define kortest_64	kortestq
+#define kor_8	korb
+#define kor_16	korw
+#define kor_32	kord
+#define kor_64	korq
+#define ktest_8	ktestb
+#define ktest_16	ktestw
+#define ktest_32	ktestd
+#define ktest_64	ktestq
+#define kand_8	kandb
+#define kand_16	kandw
+#define kand_32	kandd
+#define kand_64	kandq
+#define kxor_8	kxorb
+#define kxor_16	kxorw
+#define kxor_32	kxord
+#define kxor_64	kxorq
+#define knot_8	knotb
+#define knot_16	knotw
+#define knot_32	knotd
+#define knot_64	knotq
+#define kxnor_8	kxnorb
+#define kxnor_16	kxnorw
+#define kxnor_32	kxnord
+#define kxnor_64	kxnorq
+#define kunpack_8	kunpackbw
+#define kunpack_16	kunpackwd
+#define kunpack_32	kunpackdq
+
+/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
+#define VRAX	VGPR(rax)
+#define VRBX	VGPR(rbx)
+#define VRCX	VGPR(rcx)
+#define VRDX	VGPR(rdx)
+#define VRBP	VGPR(rbp)
+#define VRSP	VGPR(rsp)
+#define VRSI	VGPR(rsi)
+#define VRDI	VGPR(rdi)
+#define VR8	VGPR(r8)
+#define VR9	VGPR(r9)
+#define VR10	VGPR(r10)
+#define VR11	VGPR(r11)
+#define VR12	VGPR(r12)
+#define VR13	VGPR(r13)
+#define VR14	VGPR(r14)
+#define VR15	VGPR(r15)
+
+/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
+#define KMOV 	VKINSN(kmov)
+#define KORTEST 	VKINSN(kortest)
+#define KOR 	VKINSN(kor)
+#define KTEST 	VKINSN(ktest)
+#define KAND 	VKINSN(kand)
+#define KXOR 	VKINSN(kxor)
+#define KNOT 	VKINSN(knot)
+#define KXNOR 	VKINSN(kxnor)
+#define KUNPACK 	VKINSN(kunpack)
+
+#ifndef REG_WIDTH
+# define REG_WIDTH VEC_SIZE
+#endif
+
+#define VPASTER(x, y)	x##_##y
+#define VEVALUATOR(x, y)	VPASTER(x, y)
+
+#define VGPR_SZ(reg_name, reg_size)	VEVALUATOR(reg_name, reg_size)
+#define VKINSN_SZ(insn, reg_size)	VEVALUATOR(insn, reg_size)
+
+#define VGPR(reg_name)	VGPR_SZ(reg_name, REG_WIDTH)
+#define VKINSN(mask_insn)	VKINSN_SZ(mask_insn, REG_WIDTH)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
new file mode 100644
index 0000000000..dd0439f087
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
@@ -0,0 +1,124 @@
+#!/usr/bin/python3
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+"""Generate macros for getting GPR name of a certain size
+
+Inputs: None
+Output: Prints header file to stdout
+
+API:
+    VGPR(reg_name)
+        - Get register name VEC_SIZE component of `reg_name`
+    VGPR_SZ(reg_name, reg_size)
+        - Get register name `reg_size` component of `reg_name`
+"""
+
+import sys
+import os
+from datetime import datetime
+
+registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
+             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
+             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
+             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
+             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
+             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
+             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
+             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
+
+mask_insns = [
+    "kmov",
+    "kortest",
+    "kor",
+    "ktest",
+    "kand",
+    "kxor",
+    "knot",
+    "kxnor",
+]
+mask_insns_ext = ["b", "w", "d", "q"]
+
+cr = """
+   Copyright (C) {} Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+"""
+
+print("/* This file was generated by: {}.".format(os.path.basename(
+    sys.argv[0])))
+print(cr.format(datetime.today().year))
+
+print("#ifndef _REG_MACROS_H")
+print("#define _REG_MACROS_H\t1")
+print("")
+for reg in registers:
+    for i in range(0, 4):
+        for j in range(0, 4):
+            print("#define {}_{}\t{}".format(reg[j], 8 << i, reg[3 - i]))
+
+print("")
+for mask_insn in mask_insns:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
+                                           mask_insns_ext[i]))
+for i in range(0, 3):
+    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
+                                                   mask_insns_ext[i + 1]))
+mask_insns.append("kunpack")
+
+print("")
+print(
+    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
+for reg in registers:
+    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
+
+print("")
+
+print(
+    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
+)
+for mask_insn in mask_insns:
+    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
+print("")
+
+print("#ifndef REG_WIDTH")
+print("# define REG_WIDTH VEC_SIZE")
+print("#endif")
+print("")
+print("#define VPASTER(x, y)\tx##_##y")
+print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
+print("")
+print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
+print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
+print("")
+print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
+print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
+
+print("\n#endif")
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v4 3/3] x86: Update strlen-evex-base to use new reg/vec macros.
  2022-10-14 18:41 ` [PATCH v4 1/3] x86: Update evex256/512 vec macros Noah Goldstein
  2022-10-14 18:41   ` [PATCH v4 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
@ 2022-10-14 18:41   ` Noah Goldstein
  1 sibling, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 18:41 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

To avoid duplicating the VMM / GPR / mask insn macros in all incoming
evex512 files, use the macros defined in 'reg-macros.h' and
'{vec}-macros.h'.
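
With VEC_SIZE == 64 the shared macros resolve to the names that were
previously hard-coded per file, e.g. (a sketch of the expansion):
```
KMOV %k0, %VRAX		/* VEC_SIZE == 64: kmovq %k0, %rax */
			/* VEC_SIZE == 32: kmovd %k0, %eax */
```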

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 116 +++++++-------------
 sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
 2 files changed, 44 insertions(+), 76 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 418e9f8411..8af9791e92 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -36,42 +36,10 @@
 #  define CHAR_SIZE	1
 # endif
 
-# define XMM0		xmm16
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# if VEC_SIZE == 64
-#  define KMOV		kmovq
-#  define KORTEST	kortestq
-#  define RAX		rax
-#  define RCX		rcx
-#  define RDX		rdx
-#  define SHR		shrq
-#  define TEXTSUFFIX	evex512
-#  define VMM0		zmm16
-#  define VMM1		zmm17
-#  define VMM2		zmm18
-#  define VMM3		zmm19
-#  define VMM4		zmm20
-#  define VMOVA		vmovdqa64
-# elif VEC_SIZE == 32
-/* Currently Unused.  */
-#  define KMOV		kmovd
-#  define KORTEST	kortestd
-#  define RAX		eax
-#  define RCX		ecx
-#  define RDX		edx
-#  define SHR		shrl
-#  define TEXTSUFFIX	evex256
-#  define VMM0		ymm16
-#  define VMM1		ymm17
-#  define VMM2		ymm18
-#  define VMM3		ymm19
-#  define VMM4		ymm20
-#  define VMOVA		vmovdqa32
-# endif
-
-	.section .text.TEXTSUFFIX, "ax", @progbits
+	.section SECTION(.text),"ax",@progbits
 /* Aligning entry point to 64 byte, provides better performance for
    one vector length string.  */
 ENTRY_P2ALIGN (STRLEN, 6)
@@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
 # endif
 
 	movl	%edi, %eax
-	vpxorq	%XMM0, %XMM0, %XMM0
+	vpxorq	%VEC_xmm(0), %VEC_xmm(0), %VEC_xmm(0)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(page_cross)
 
 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM0, %k0
-	KMOV	%k0, %RAX
-	test	%RAX, %RAX
+	VPCMP	$0, (%rdi), %VEC(0), %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
@@ -120,7 +88,7 @@ L(align_more):
 	movq	%rax, %rdx
 	subq	%rdi, %rdx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RDX
+	shr	$2, %VRDX
 #  endif
 	/* At this point rdx contains [w]chars already compared.  */
 	subq	%rsi, %rdx
@@ -131,9 +99,9 @@ L(align_more):
 # endif
 
 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (%rax), %VEC(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
 # ifdef USE_AS_STRNLEN
@@ -141,9 +109,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, VEC_SIZE(%rax), %VEC(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
 # ifdef USE_AS_STRNLEN
@@ -151,9 +119,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VEC(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 
 # ifdef USE_AS_STRNLEN
@@ -161,9 +129,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VEC(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x4)
 
 # ifdef USE_AS_STRNLEN
@@ -179,7 +147,7 @@ L(align_more):
 # ifdef USE_AS_STRNLEN
 	subq	%rax, %rcx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RCX
+	shr	$2, %VRCX
 #  endif
 	/* rcx contains number of [w]char will be recompared due to
 	   alignment fixes.  rdx must be incremented by rcx to offset
@@ -199,42 +167,42 @@ L(loop_entry):
 # endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+	VMOVA	(VEC_SIZE * 4)(%rax), %VEC(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VEC(1), %VEC(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VEC(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VEC(3), %VEC(4)
 
-	VPTESTN	%VMM2, %VMM2, %k0
-	VPTESTN	%VMM4, %VMM4, %k1
+	VPTESTN	%VEC(2), %VEC(2), %k0
+	VPTESTN	%VEC(4), %VEC(4), %k1
 
 	subq	$-(VEC_SIZE * 4), %rax
 	KORTEST	%k0, %k1
 	jz	L(loop)
 
-	VPTESTN	%VMM1, %VMM1, %k2
-	KMOV	%k2, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VEC(1), %VEC(1), %k2
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
-	KMOV	%k0, %RCX
+	KMOV	%k0, %VRCX
 	/* At this point, if k0 is non zero, null char must be in the
 	   second vector.  */
-	test	%RCX, %RCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
-	VPTESTN	%VMM3, %VMM3, %k3
-	KMOV	%k3, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VEC(3), %VEC(3), %k3
+	KMOV	%k3, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 	/* At this point null [w]char must be in the fourth vector so no
 	   need to check.  */
-	KMOV	%k1, %RCX
+	KMOV	%k1, %VRCX
 
 	/* Fourth, third, second vector terminating are pretty much
 	   same, implemented this way to avoid branching and reuse code
 	   from pre loop exit condition.  */
 L(ret_vec_x4):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 3), %rax
@@ -250,7 +218,7 @@ L(ret_vec_x4):
 	ret
 
 L(ret_vec_x3):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 2), %rax
@@ -268,7 +236,7 @@ L(ret_vec_x3):
 L(ret_vec_x2):
 	subq	$-VEC_SIZE, %rax
 L(ret_vec_x1):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
@@ -289,13 +257,13 @@ L(page_cross):
 	/* ecx contains number of w[char] to be skipped as a result
 	   of address alignment.  */
 	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
-	KMOV	%k0, %RAX
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VEC(0), %k0
+	KMOV	%k0, %VRAX
 	/* Ignore number of character for alignment adjustment.  */
-	SHR	%cl, %RAX
+	shr	%cl, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
index 116f8981c8..dfd0a7821b 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -2,6 +2,6 @@
 # define STRLEN		__strlen_evex512
 #endif
 
-#define VEC_SIZE	64
-
+#include "evex512-vecs.h"
+#include "reg-macros.h"
 #include "strlen-evex-base.S"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 18:38         ` Noah Goldstein
@ 2022-10-14 18:53           ` H.J. Lu
  2022-10-14 19:00             ` Noah Goldstein
  0 siblings, 1 reply; 72+ messages in thread
From: H.J. Lu @ 2022-10-14 18:53 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 11:38 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Oct 14, 2022 at 1:35 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Oct 14, 2022 at 11:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > >  On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > This is to make it easier to do think like:
> > > > > ```
> > > > > vpcmpb %VEC(0), %VEC(1), %k0
> > > > > kmov{d|q} %k0, %{eax|rax}
> > > > > test %{eax|rax}
> > > > > ```
> > > >
> > > > Since all these register macros are based on VEC_SIZE which is either 32
> > > > bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are
> > > > needed.  8-bit and 16-bit macros aren't needed.
> > > >
> > > > > It adds macro s.t any GPR can get the proper width with:
> > > > >     `V{upper_case_GPR_name}`
> > > > >
> > > > > and any mask insn can get the proper width with:
> > > > >     `{mask_insn_without_postfix}V`
> > > >
> > > > All macros should be in upper cases.
> > > >
> > > > > This commit does not change libc.so
> > > > >
> > > > > Tested build on x86-64
> > > > > ---
> > > > >  sysdeps/x86_64/multiarch/reg-macros.h         | 337 ++++++++++++++++++
> > > > >  .../multiarch/scripts/gen-reg-map-macros.py   |  97 +++++
> > > > >  2 files changed, 434 insertions(+)
> > > > >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> > > > >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> > > > >
> > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > new file mode 100644
> > > > > index 0000000000..c4d7f57b66
> > > > > --- /dev/null
> > > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> > > >
> > > > vreg-macros.h to indicate macros based on vector size.   Please
> > > > add comments to indicate that register macros are expanded based
> > > > on vector size.
> > > >
> > > > > @@ -0,0 +1,337 @@
> > > > > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py.
> > > > > +
> > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > +   This file is part of the GNU C Library.
> > > > > +
> > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > +   License as published by the Free Software Foundation; either
> > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > +
> > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > +   Lesser General Public License for more details.
> > > > > +
> > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > +   License along with the GNU C Library; if not, see
> > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > +
> > > > > +#ifndef _REG_MAP_MACROS_H
> > > > > +#define _REG_MAP_MACROS_H      1
> > > > > +
> > > > > +#define rax_8  al
> > > > > +#define eax_8  al
> > > > > +#define ax_8   al
> > > > > +#define al_8   al
> > > > > +#define rax_16 ax
> > > > > +#define eax_16 ax
> > > > > +#define ax_16  ax
> > > > > +#define al_16  ax
> > > > > +#define rax_32 eax
> > > > > +#define eax_32 eax
> > > > > +#define ax_32  eax
> > > > > +#define al_32  eax
> > > > > +#define rax_64 rax
> > > > > +#define eax_64 rax
> > > > > +#define ax_64  rax
> > > > > +#define al_64  rax
> > > >
> > > > Only rax_32 and rax_64 are needed.
> > > >
> > > > > +#define rbx_8  bl
> > > > > +#define ebx_8  bl
> > > > > +#define bx_8   bl
> > > > > +#define bl_8   bl
> > > > > +#define rbx_16 bx
> > > > > +#define ebx_16 bx
> > > > > +#define bx_16  bx
> > > > > +#define bl_16  bx
> > > > > +#define rbx_32 ebx
> > > > > +#define ebx_32 ebx
> > > > > +#define bx_32  ebx
> > > > > +#define bl_32  ebx
> > > > > +#define rbx_64 rbx
> > > > > +#define ebx_64 rbx
> > > > > +#define bx_64  rbx
> > > > > +#define bl_64  rbx
> > > > > +#define rcx_8  cl
> > > > > +#define ecx_8  cl
> > > > > +#define cx_8   cl
> > > > > +#define cl_8   cl
> > > > > +#define rcx_16 cx
> > > > > +#define ecx_16 cx
> > > > > +#define cx_16  cx
> > > > > +#define cl_16  cx
> > > > > +#define rcx_32 ecx
> > > > > +#define ecx_32 ecx
> > > > > +#define cx_32  ecx
> > > > > +#define cl_32  ecx
> > > > > +#define rcx_64 rcx
> > > > > +#define ecx_64 rcx
> > > > > +#define cx_64  rcx
> > > > > +#define cl_64  rcx
> > > > > +#define rdx_8  dl
> > > > > +#define edx_8  dl
> > > > > +#define dx_8   dl
> > > > > +#define dl_8   dl
> > > > > +#define rdx_16 dx
> > > > > +#define edx_16 dx
> > > > > +#define dx_16  dx
> > > > > +#define dl_16  dx
> > > > > +#define rdx_32 edx
> > > > > +#define edx_32 edx
> > > > > +#define dx_32  edx
> > > > > +#define dl_32  edx
> > > > > +#define rdx_64 rdx
> > > > > +#define edx_64 rdx
> > > > > +#define dx_64  rdx
> > > > > +#define dl_64  rdx
> > > > > +#define rbp_8  bpl
> > > > > +#define ebp_8  bpl
> > > > > +#define bp_8   bpl
> > > > > +#define bpl_8  bpl
> > > > > +#define rbp_16 bp
> > > > > +#define ebp_16 bp
> > > > > +#define bp_16  bp
> > > > > +#define bpl_16 bp
> > > > > +#define rbp_32 ebp
> > > > > +#define ebp_32 ebp
> > > > > +#define bp_32  ebp
> > > > > +#define bpl_32 ebp
> > > > > +#define rbp_64 rbp
> > > > > +#define ebp_64 rbp
> > > > > +#define bp_64  rbp
> > > > > +#define bpl_64 rbp
> > > > > +#define rsp_8  spl
> > > > > +#define esp_8  spl
> > > > > +#define sp_8   spl
> > > > > +#define spl_8  spl
> > > > > +#define rsp_16 sp
> > > > > +#define esp_16 sp
> > > > > +#define sp_16  sp
> > > > > +#define spl_16 sp
> > > > > +#define rsp_32 esp
> > > > > +#define esp_32 esp
> > > > > +#define sp_32  esp
> > > > > +#define spl_32 esp
> > > > > +#define rsp_64 rsp
> > > > > +#define esp_64 rsp
> > > > > +#define sp_64  rsp
> > > > > +#define spl_64 rsp
> > > > > +#define rsi_8  sil
> > > > > +#define esi_8  sil
> > > > > +#define si_8   sil
> > > > > +#define sil_8  sil
> > > > > +#define rsi_16 si
> > > > > +#define esi_16 si
> > > > > +#define si_16  si
> > > > > +#define sil_16 si
> > > > > +#define rsi_32 esi
> > > > > +#define esi_32 esi
> > > > > +#define si_32  esi
> > > > > +#define sil_32 esi
> > > > > +#define rsi_64 rsi
> > > > > +#define esi_64 rsi
> > > > > +#define si_64  rsi
> > > > > +#define sil_64 rsi
> > > > > +#define rdi_8  dil
> > > > > +#define edi_8  dil
> > > > > +#define di_8   dil
> > > > > +#define dil_8  dil
> > > > > +#define rdi_16 di
> > > > > +#define edi_16 di
> > > > > +#define di_16  di
> > > > > +#define dil_16 di
> > > > > +#define rdi_32 edi
> > > > > +#define edi_32 edi
> > > > > +#define di_32  edi
> > > > > +#define dil_32 edi
> > > > > +#define rdi_64 rdi
> > > > > +#define edi_64 rdi
> > > > > +#define di_64  rdi
> > > > > +#define dil_64 rdi
> > > > > +#define r8_8   r8b
> > > > > +#define r8d_8  r8b
> > > > > +#define r8w_8  r8b
> > > > > +#define r8b_8  r8b
> > > > > +#define r8_16  r8w
> > > > > +#define r8d_16 r8w
> > > > > +#define r8w_16 r8w
> > > > > +#define r8b_16 r8w
> > > > > +#define r8_32  r8d
> > > > > +#define r8d_32 r8d
> > > > > +#define r8w_32 r8d
> > > > > +#define r8b_32 r8d
> > > > > +#define r8_64  r8
> > > > > +#define r8d_64 r8
> > > > > +#define r8w_64 r8
> > > > > +#define r8b_64 r8
> > > > > +#define r9_8   r9b
> > > > > +#define r9d_8  r9b
> > > > > +#define r9w_8  r9b
> > > > > +#define r9b_8  r9b
> > > > > +#define r9_16  r9w
> > > > > +#define r9d_16 r9w
> > > > > +#define r9w_16 r9w
> > > > > +#define r9b_16 r9w
> > > > > +#define r9_32  r9d
> > > > > +#define r9d_32 r9d
> > > > > +#define r9w_32 r9d
> > > > > +#define r9b_32 r9d
> > > > > +#define r9_64  r9
> > > > > +#define r9d_64 r9
> > > > > +#define r9w_64 r9
> > > > > +#define r9b_64 r9
> > > > > +#define r10_8  r10b
> > > > > +#define r10d_8 r10b
> > > > > +#define r10w_8 r10b
> > > > > +#define r10b_8 r10b
> > > > > +#define r10_16 r10w
> > > > > +#define r10d_16        r10w
> > > > > +#define r10w_16        r10w
> > > > > +#define r10b_16        r10w
> > > > > +#define r10_32 r10d
> > > > > +#define r10d_32        r10d
> > > > > +#define r10w_32        r10d
> > > > > +#define r10b_32        r10d
> > > > > +#define r10_64 r10
> > > > > +#define r10d_64        r10
> > > > > +#define r10w_64        r10
> > > > > +#define r10b_64        r10
> > > > > +#define r11_8  r11b
> > > > > +#define r11d_8 r11b
> > > > > +#define r11w_8 r11b
> > > > > +#define r11b_8 r11b
> > > > > +#define r11_16 r11w
> > > > > +#define r11d_16        r11w
> > > > > +#define r11w_16        r11w
> > > > > +#define r11b_16        r11w
> > > > > +#define r11_32 r11d
> > > > > +#define r11d_32        r11d
> > > > > +#define r11w_32        r11d
> > > > > +#define r11b_32        r11d
> > > > > +#define r11_64 r11
> > > > > +#define r11d_64        r11
> > > > > +#define r11w_64        r11
> > > > > +#define r11b_64        r11
> > > > > +#define r12_8  r12b
> > > > > +#define r12d_8 r12b
> > > > > +#define r12w_8 r12b
> > > > > +#define r12b_8 r12b
> > > > > +#define r12_16 r12w
> > > > > +#define r12d_16        r12w
> > > > > +#define r12w_16        r12w
> > > > > +#define r12b_16        r12w
> > > > > +#define r12_32 r12d
> > > > > +#define r12d_32        r12d
> > > > > +#define r12w_32        r12d
> > > > > +#define r12b_32        r12d
> > > > > +#define r12_64 r12
> > > > > +#define r12d_64        r12
> > > > > +#define r12w_64        r12
> > > > > +#define r12b_64        r12
> > > > > +#define r13_8  r13b
> > > > > +#define r13d_8 r13b
> > > > > +#define r13w_8 r13b
> > > > > +#define r13b_8 r13b
> > > > > +#define r13_16 r13w
> > > > > +#define r13d_16        r13w
> > > > > +#define r13w_16        r13w
> > > > > +#define r13b_16        r13w
> > > > > +#define r13_32 r13d
> > > > > +#define r13d_32        r13d
> > > > > +#define r13w_32        r13d
> > > > > +#define r13b_32        r13d
> > > > > +#define r13_64 r13
> > > > > +#define r13d_64        r13
> > > > > +#define r13w_64        r13
> > > > > +#define r13b_64        r13
> > > > > +#define r14_8  r14b
> > > > > +#define r14d_8 r14b
> > > > > +#define r14w_8 r14b
> > > > > +#define r14b_8 r14b
> > > > > +#define r14_16 r14w
> > > > > +#define r14d_16        r14w
> > > > > +#define r14w_16        r14w
> > > > > +#define r14b_16        r14w
> > > > > +#define r14_32 r14d
> > > > > +#define r14d_32        r14d
> > > > > +#define r14w_32        r14d
> > > > > +#define r14b_32        r14d
> > > > > +#define r14_64 r14
> > > > > +#define r14d_64        r14
> > > > > +#define r14w_64        r14
> > > > > +#define r14b_64        r14
> > > > > +#define r15_8  r15b
> > > > > +#define r15d_8 r15b
> > > > > +#define r15w_8 r15b
> > > > > +#define r15b_8 r15b
> > > > > +#define r15_16 r15w
> > > > > +#define r15d_16        r15w
> > > > > +#define r15w_16        r15w
> > > > > +#define r15b_16        r15w
> > > > > +#define r15_32 r15d
> > > > > +#define r15d_32        r15d
> > > > > +#define r15w_32        r15d
> > > > > +#define r15b_32        r15d
> > > > > +#define r15_64 r15
> > > > > +#define r15d_64        r15
> > > > > +#define r15w_64        r15
> > > > > +#define r15b_64        r15
> > > > > +
> > > > > +#define VRAX   VGPR(rax)
> > > > > +#define VRBX   VGPR(rbx)
> > > > > +#define VRCX   VGPR(rcx)
> > > > > +#define VRDX   VGPR(rdx)
> > > > > +#define VRBP   VGPR(rbp)
> > > > > +#define VRSP   VGPR(rsp)
> > > > > +#define VRSI   VGPR(rsi)
> > > > > +#define VRDI   VGPR(rdi)
> > > > > +#define VR8    VGPR(r8)
> > > > > +#define VR9    VGPR(r9)
> > > > > +#define VR10   VGPR(r10)
> > > > > +#define VR11   VGPR(r11)
> > > > > +#define VR12   VGPR(r12)
> > > > > +#define VR13   VGPR(r13)
> > > > > +#define VR14   VGPR(r14)
> > > > > +#define VR15   VGPR(r15)
> > > > > +
> > > > > +#define kmov_8 kmovb
> > > > > +#define kmov_16        kmovw
> > > > > +#define kmov_32        kmovd
> > > > > +#define kmov_64        kmovq
> > > >
> > > > Only 32 and 64 are needed.
> > >
> > > Thats not entirely true for the wide-char impls.
> > > >
> > > > > +#define kortest_8      kortestb
> > > > > +#define kortest_16     kortestw
> > > > > +#define kortest_32     kortestd
> > > > > +#define kortest_64     kortestq
> > > > > +#define kor_8  korb
> > > > > +#define kor_16 korw
> > > > > +#define kor_32 kord
> > > > > +#define kor_64 korq
> > > > > +#define ktest_8        ktestb
> > > > > +#define ktest_16       ktestw
> > > > > +#define ktest_32       ktestd
> > > > > +#define ktest_64       ktestq
> > > > > +#define kand_8 kandb
> > > > > +#define kand_16        kandw
> > > > > +#define kand_32        kandd
> > > > > +#define kand_64        kandq
> > > > > +#define kxor_8 kxorb
> > > > > +#define kxor_16        kxorw
> > > > > +#define kxor_32        kxord
> > > > > +#define kxor_64        kxorq
> > > > > +
> > > > > +#define kmovV  VKINSN_SZ(kmov, REG_WIDTH)
> > > > > +#define kortestV       VKINSN_SZ(kortest, REG_WIDTH)
> > > > > +#define korV   VKINSN_SZ(kor, REG_WIDTH)
> > > > > +#define ktestV         VKINSN_SZ(ktest, REG_WIDTH)
> > > > > +#define kandV  VKINSN_SZ(kand, REG_WIDTH)
> > > > > +#define kxorV  VKINSN_SZ(kxor, REG_WIDTH)
> > > >
> > > > #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH)
> > >
> > > Will fix for V5.
> > > >
> > > > > +
> > > > > +#ifndef REG_WIDTH
> > > > > +#define REG_WIDTH VEC_SIZE
> > > >
> > > > Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH
> > > > can be dropped.
> > >
> > > Thats not quite true.
> > >
> > > For wide-char impls REG_WIDTH != VEC_SIZE.
> >
> > These register macros are used to operate vectors.  Do you have
> > an example of REG_WIDTH != VEC_SIZE?
>
> But since wide-char instructions use 32-bit comparison the resulting
> mask is < 64-bit i.e:
>
> vpcmpd %zmm16, %zmm17, %k0
> kmovd %k0, %eax
> will collect all the necessary bits and is prefered.
>
> Next version of Sunil's memchr-evex512 should have it.
>

So it is based on CHAR_PER_VEC.  When will 8-bit and 16-bit
registers be used?

H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread
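To make the wide-char case above concrete, here is a rough fragment assuming the macro scheme from the quoted patch (`VRCX`, `kmovV`, `REG_WIDTH`); the vector registers, label name, and the use of `vpcmpeqd` are illustrative rather than taken from any particular string function. With 64-byte vectors of 4-byte characters there are only 16 significant mask bits, so a 32-bit `kmovd`/`ecx` pair already captures the whole mask even though VEC_SIZE is 64:

```
/* Illustrative fragment, not part of the patch.  A wide-char file
   would set REG_WIDTH before including the vec/reg headers so the
   GPR and mask-insn macros pick the 32-bit forms.  */
#define VEC_SIZE	64
#define CHAR_PER_VEC	(VEC_SIZE / 4)	/* 16 wchar_t per zmm.  */
#define REG_WIDTH	32		/* mask has only 16 usable bits.  */

	vpcmpeqd %zmm17, %zmm16, %k0	/* one mask bit per dword.  */
	kmovV	%k0, %VRCX		/* intended expansion: kmovd %k0, %ecx.  */
	test	%VRCX, %VRCX		/* intended expansion: test %ecx, %ecx.  */
	jz	L(no_match)
```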

* Re: [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 18:53           ` H.J. Lu
@ 2022-10-14 19:00             ` Noah Goldstein
  2022-10-14 19:13               ` H.J. Lu
  0 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 19:00 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Oct 14, 2022 at 11:38 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Oct 14, 2022 at 1:35 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Oct 14, 2022 at 11:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > >  On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > This is to make it easier to do think like:
> > > > > > ```
> > > > > > vpcmpb %VEC(0), %VEC(1), %k0
> > > > > > kmov{d|q} %k0, %{eax|rax}
> > > > > > test %{eax|rax}
> > > > > > ```
> > > > >
> > > > > Since all these register macros are based on VEC_SIZE which is either 32
> > > > > bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are
> > > > > needed.  8-bit and 16-bit macros aren't needed.
> > > > >
> > > > > > It adds macro s.t any GPR can get the proper width with:
> > > > > >     `V{upper_case_GPR_name}`
> > > > > >
> > > > > > and any mask insn can get the proper width with:
> > > > > >     `{mask_insn_without_postfix}V`
> > > > >
> > > > > All macros should be in upper cases.
> > > > >
> > > > > > This commit does not change libc.so
> > > > > >
> > > > > > Tested build on x86-64
> > > > > > ---
> > > > > >  sysdeps/x86_64/multiarch/reg-macros.h         | 337 ++++++++++++++++++
> > > > > >  .../multiarch/scripts/gen-reg-map-macros.py   |  97 +++++
> > > > > >  2 files changed, 434 insertions(+)
> > > > > >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> > > > > >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> > > > > >
> > > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > new file mode 100644
> > > > > > index 0000000000..c4d7f57b66
> > > > > > --- /dev/null
> > > > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > >
> > > > > vreg-macros.h to indicate macros based on vector size.   Please
> > > > > add comments to indicate that register macros are expanded based
> > > > > on vector size.
> > > > >
> > > > > > @@ -0,0 +1,337 @@
> > > > > > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py.
> > > > > > +
> > > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > +   This file is part of the GNU C Library.
> > > > > > +
> > > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > > +   License as published by the Free Software Foundation; either
> > > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > > +
> > > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > +   Lesser General Public License for more details.
> > > > > > +
> > > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > > +   License along with the GNU C Library; if not, see
> > > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > > +
> > > > > > +#ifndef _REG_MAP_MACROS_H
> > > > > > +#define _REG_MAP_MACROS_H      1
> > > > > > +
> > > > > > +#define rax_8  al
> > > > > > +#define eax_8  al
> > > > > > +#define ax_8   al
> > > > > > +#define al_8   al
> > > > > > +#define rax_16 ax
> > > > > > +#define eax_16 ax
> > > > > > +#define ax_16  ax
> > > > > > +#define al_16  ax
> > > > > > +#define rax_32 eax
> > > > > > +#define eax_32 eax
> > > > > > +#define ax_32  eax
> > > > > > +#define al_32  eax
> > > > > > +#define rax_64 rax
> > > > > > +#define eax_64 rax
> > > > > > +#define ax_64  rax
> > > > > > +#define al_64  rax
> > > > >
> > > > > Only rax_32 and rax_64 are needed.
> > > > >
> > > > > > +#define rbx_8  bl
> > > > > > +#define ebx_8  bl
> > > > > > +#define bx_8   bl
> > > > > > +#define bl_8   bl
> > > > > > +#define rbx_16 bx
> > > > > > +#define ebx_16 bx
> > > > > > +#define bx_16  bx
> > > > > > +#define bl_16  bx
> > > > > > +#define rbx_32 ebx
> > > > > > +#define ebx_32 ebx
> > > > > > +#define bx_32  ebx
> > > > > > +#define bl_32  ebx
> > > > > > +#define rbx_64 rbx
> > > > > > +#define ebx_64 rbx
> > > > > > +#define bx_64  rbx
> > > > > > +#define bl_64  rbx
> > > > > > +#define rcx_8  cl
> > > > > > +#define ecx_8  cl
> > > > > > +#define cx_8   cl
> > > > > > +#define cl_8   cl
> > > > > > +#define rcx_16 cx
> > > > > > +#define ecx_16 cx
> > > > > > +#define cx_16  cx
> > > > > > +#define cl_16  cx
> > > > > > +#define rcx_32 ecx
> > > > > > +#define ecx_32 ecx
> > > > > > +#define cx_32  ecx
> > > > > > +#define cl_32  ecx
> > > > > > +#define rcx_64 rcx
> > > > > > +#define ecx_64 rcx
> > > > > > +#define cx_64  rcx
> > > > > > +#define cl_64  rcx
> > > > > > +#define rdx_8  dl
> > > > > > +#define edx_8  dl
> > > > > > +#define dx_8   dl
> > > > > > +#define dl_8   dl
> > > > > > +#define rdx_16 dx
> > > > > > +#define edx_16 dx
> > > > > > +#define dx_16  dx
> > > > > > +#define dl_16  dx
> > > > > > +#define rdx_32 edx
> > > > > > +#define edx_32 edx
> > > > > > +#define dx_32  edx
> > > > > > +#define dl_32  edx
> > > > > > +#define rdx_64 rdx
> > > > > > +#define edx_64 rdx
> > > > > > +#define dx_64  rdx
> > > > > > +#define dl_64  rdx
> > > > > > +#define rbp_8  bpl
> > > > > > +#define ebp_8  bpl
> > > > > > +#define bp_8   bpl
> > > > > > +#define bpl_8  bpl
> > > > > > +#define rbp_16 bp
> > > > > > +#define ebp_16 bp
> > > > > > +#define bp_16  bp
> > > > > > +#define bpl_16 bp
> > > > > > +#define rbp_32 ebp
> > > > > > +#define ebp_32 ebp
> > > > > > +#define bp_32  ebp
> > > > > > +#define bpl_32 ebp
> > > > > > +#define rbp_64 rbp
> > > > > > +#define ebp_64 rbp
> > > > > > +#define bp_64  rbp
> > > > > > +#define bpl_64 rbp
> > > > > > +#define rsp_8  spl
> > > > > > +#define esp_8  spl
> > > > > > +#define sp_8   spl
> > > > > > +#define spl_8  spl
> > > > > > +#define rsp_16 sp
> > > > > > +#define esp_16 sp
> > > > > > +#define sp_16  sp
> > > > > > +#define spl_16 sp
> > > > > > +#define rsp_32 esp
> > > > > > +#define esp_32 esp
> > > > > > +#define sp_32  esp
> > > > > > +#define spl_32 esp
> > > > > > +#define rsp_64 rsp
> > > > > > +#define esp_64 rsp
> > > > > > +#define sp_64  rsp
> > > > > > +#define spl_64 rsp
> > > > > > +#define rsi_8  sil
> > > > > > +#define esi_8  sil
> > > > > > +#define si_8   sil
> > > > > > +#define sil_8  sil
> > > > > > +#define rsi_16 si
> > > > > > +#define esi_16 si
> > > > > > +#define si_16  si
> > > > > > +#define sil_16 si
> > > > > > +#define rsi_32 esi
> > > > > > +#define esi_32 esi
> > > > > > +#define si_32  esi
> > > > > > +#define sil_32 esi
> > > > > > +#define rsi_64 rsi
> > > > > > +#define esi_64 rsi
> > > > > > +#define si_64  rsi
> > > > > > +#define sil_64 rsi
> > > > > > +#define rdi_8  dil
> > > > > > +#define edi_8  dil
> > > > > > +#define di_8   dil
> > > > > > +#define dil_8  dil
> > > > > > +#define rdi_16 di
> > > > > > +#define edi_16 di
> > > > > > +#define di_16  di
> > > > > > +#define dil_16 di
> > > > > > +#define rdi_32 edi
> > > > > > +#define edi_32 edi
> > > > > > +#define di_32  edi
> > > > > > +#define dil_32 edi
> > > > > > +#define rdi_64 rdi
> > > > > > +#define edi_64 rdi
> > > > > > +#define di_64  rdi
> > > > > > +#define dil_64 rdi
> > > > > > +#define r8_8   r8b
> > > > > > +#define r8d_8  r8b
> > > > > > +#define r8w_8  r8b
> > > > > > +#define r8b_8  r8b
> > > > > > +#define r8_16  r8w
> > > > > > +#define r8d_16 r8w
> > > > > > +#define r8w_16 r8w
> > > > > > +#define r8b_16 r8w
> > > > > > +#define r8_32  r8d
> > > > > > +#define r8d_32 r8d
> > > > > > +#define r8w_32 r8d
> > > > > > +#define r8b_32 r8d
> > > > > > +#define r8_64  r8
> > > > > > +#define r8d_64 r8
> > > > > > +#define r8w_64 r8
> > > > > > +#define r8b_64 r8
> > > > > > +#define r9_8   r9b
> > > > > > +#define r9d_8  r9b
> > > > > > +#define r9w_8  r9b
> > > > > > +#define r9b_8  r9b
> > > > > > +#define r9_16  r9w
> > > > > > +#define r9d_16 r9w
> > > > > > +#define r9w_16 r9w
> > > > > > +#define r9b_16 r9w
> > > > > > +#define r9_32  r9d
> > > > > > +#define r9d_32 r9d
> > > > > > +#define r9w_32 r9d
> > > > > > +#define r9b_32 r9d
> > > > > > +#define r9_64  r9
> > > > > > +#define r9d_64 r9
> > > > > > +#define r9w_64 r9
> > > > > > +#define r9b_64 r9
> > > > > > +#define r10_8  r10b
> > > > > > +#define r10d_8 r10b
> > > > > > +#define r10w_8 r10b
> > > > > > +#define r10b_8 r10b
> > > > > > +#define r10_16 r10w
> > > > > > +#define r10d_16        r10w
> > > > > > +#define r10w_16        r10w
> > > > > > +#define r10b_16        r10w
> > > > > > +#define r10_32 r10d
> > > > > > +#define r10d_32        r10d
> > > > > > +#define r10w_32        r10d
> > > > > > +#define r10b_32        r10d
> > > > > > +#define r10_64 r10
> > > > > > +#define r10d_64        r10
> > > > > > +#define r10w_64        r10
> > > > > > +#define r10b_64        r10
> > > > > > +#define r11_8  r11b
> > > > > > +#define r11d_8 r11b
> > > > > > +#define r11w_8 r11b
> > > > > > +#define r11b_8 r11b
> > > > > > +#define r11_16 r11w
> > > > > > +#define r11d_16        r11w
> > > > > > +#define r11w_16        r11w
> > > > > > +#define r11b_16        r11w
> > > > > > +#define r11_32 r11d
> > > > > > +#define r11d_32        r11d
> > > > > > +#define r11w_32        r11d
> > > > > > +#define r11b_32        r11d
> > > > > > +#define r11_64 r11
> > > > > > +#define r11d_64        r11
> > > > > > +#define r11w_64        r11
> > > > > > +#define r11b_64        r11
> > > > > > +#define r12_8  r12b
> > > > > > +#define r12d_8 r12b
> > > > > > +#define r12w_8 r12b
> > > > > > +#define r12b_8 r12b
> > > > > > +#define r12_16 r12w
> > > > > > +#define r12d_16        r12w
> > > > > > +#define r12w_16        r12w
> > > > > > +#define r12b_16        r12w
> > > > > > +#define r12_32 r12d
> > > > > > +#define r12d_32        r12d
> > > > > > +#define r12w_32        r12d
> > > > > > +#define r12b_32        r12d
> > > > > > +#define r12_64 r12
> > > > > > +#define r12d_64        r12
> > > > > > +#define r12w_64        r12
> > > > > > +#define r12b_64        r12
> > > > > > +#define r13_8  r13b
> > > > > > +#define r13d_8 r13b
> > > > > > +#define r13w_8 r13b
> > > > > > +#define r13b_8 r13b
> > > > > > +#define r13_16 r13w
> > > > > > +#define r13d_16        r13w
> > > > > > +#define r13w_16        r13w
> > > > > > +#define r13b_16        r13w
> > > > > > +#define r13_32 r13d
> > > > > > +#define r13d_32        r13d
> > > > > > +#define r13w_32        r13d
> > > > > > +#define r13b_32        r13d
> > > > > > +#define r13_64 r13
> > > > > > +#define r13d_64        r13
> > > > > > +#define r13w_64        r13
> > > > > > +#define r13b_64        r13
> > > > > > +#define r14_8  r14b
> > > > > > +#define r14d_8 r14b
> > > > > > +#define r14w_8 r14b
> > > > > > +#define r14b_8 r14b
> > > > > > +#define r14_16 r14w
> > > > > > +#define r14d_16        r14w
> > > > > > +#define r14w_16        r14w
> > > > > > +#define r14b_16        r14w
> > > > > > +#define r14_32 r14d
> > > > > > +#define r14d_32        r14d
> > > > > > +#define r14w_32        r14d
> > > > > > +#define r14b_32        r14d
> > > > > > +#define r14_64 r14
> > > > > > +#define r14d_64        r14
> > > > > > +#define r14w_64        r14
> > > > > > +#define r14b_64        r14
> > > > > > +#define r15_8  r15b
> > > > > > +#define r15d_8 r15b
> > > > > > +#define r15w_8 r15b
> > > > > > +#define r15b_8 r15b
> > > > > > +#define r15_16 r15w
> > > > > > +#define r15d_16        r15w
> > > > > > +#define r15w_16        r15w
> > > > > > +#define r15b_16        r15w
> > > > > > +#define r15_32 r15d
> > > > > > +#define r15d_32        r15d
> > > > > > +#define r15w_32        r15d
> > > > > > +#define r15b_32        r15d
> > > > > > +#define r15_64 r15
> > > > > > +#define r15d_64        r15
> > > > > > +#define r15w_64        r15
> > > > > > +#define r15b_64        r15
> > > > > > +
> > > > > > +#define VRAX   VGPR(rax)
> > > > > > +#define VRBX   VGPR(rbx)
> > > > > > +#define VRCX   VGPR(rcx)
> > > > > > +#define VRDX   VGPR(rdx)
> > > > > > +#define VRBP   VGPR(rbp)
> > > > > > +#define VRSP   VGPR(rsp)
> > > > > > +#define VRSI   VGPR(rsi)
> > > > > > +#define VRDI   VGPR(rdi)
> > > > > > +#define VR8    VGPR(r8)
> > > > > > +#define VR9    VGPR(r9)
> > > > > > +#define VR10   VGPR(r10)
> > > > > > +#define VR11   VGPR(r11)
> > > > > > +#define VR12   VGPR(r12)
> > > > > > +#define VR13   VGPR(r13)
> > > > > > +#define VR14   VGPR(r14)
> > > > > > +#define VR15   VGPR(r15)
> > > > > > +
> > > > > > +#define kmov_8 kmovb
> > > > > > +#define kmov_16        kmovw
> > > > > > +#define kmov_32        kmovd
> > > > > > +#define kmov_64        kmovq
> > > > >
> > > > > Only 32 and 64 are needed.
> > > >
> > > > Thats not entirely true for the wide-char impls.
> > > > >
> > > > > > +#define kortest_8      kortestb
> > > > > > +#define kortest_16     kortestw
> > > > > > +#define kortest_32     kortestd
> > > > > > +#define kortest_64     kortestq
> > > > > > +#define kor_8  korb
> > > > > > +#define kor_16 korw
> > > > > > +#define kor_32 kord
> > > > > > +#define kor_64 korq
> > > > > > +#define ktest_8        ktestb
> > > > > > +#define ktest_16       ktestw
> > > > > > +#define ktest_32       ktestd
> > > > > > +#define ktest_64       ktestq
> > > > > > +#define kand_8 kandb
> > > > > > +#define kand_16        kandw
> > > > > > +#define kand_32        kandd
> > > > > > +#define kand_64        kandq
> > > > > > +#define kxor_8 kxorb
> > > > > > +#define kxor_16        kxorw
> > > > > > +#define kxor_32        kxord
> > > > > > +#define kxor_64        kxorq
> > > > > > +
> > > > > > +#define kmovV  VKINSN_SZ(kmov, REG_WIDTH)
> > > > > > +#define kortestV       VKINSN_SZ(kortest, REG_WIDTH)
> > > > > > +#define korV   VKINSN_SZ(kor, REG_WIDTH)
> > > > > > +#define ktestV         VKINSN_SZ(ktest, REG_WIDTH)
> > > > > > +#define kandV  VKINSN_SZ(kand, REG_WIDTH)
> > > > > > +#define kxorV  VKINSN_SZ(kxor, REG_WIDTH)
> > > > >
> > > > > #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH)
> > > >
> > > > Will fix for V5.
> > > > >
> > > > > > +
> > > > > > +#ifndef REG_WIDTH
> > > > > > +#define REG_WIDTH VEC_SIZE
> > > > >
> > > > > Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH
> > > > > can be dropped.
> > > >
> > > > Thats not quite true.
> > > >
> > > > For wide-char impls REG_WIDTH != VEC_SIZE.
> > >
> > > These register macros are used to operate vectors.  Do you have
> > > an example of REG_WIDTH != VEC_SIZE?
> >
> > But since wide-char instructions use 32-bit comparison the resulting
> > mask is < 64-bit i.e:
> >
> > vpcmpd %zmm16, %zmm17, %k0
> > kmovd %k0, %eax
> > will collect all the necessary bits and is prefered.
> >
> > Next version of Sunil's memchr-evex512 should have it.
> >
>
> So it is based on CHAR_PER_VEC.  When will 8-bit and 16-bit
> registers be used?

In a sense. Generally, even if CHAR_PER_VEC < 32 it's better to use
32, but in some cases where you want to use `inc{b|w}` to test for
all 1s it's useful for `VGPR_SZ(rax, CHAR_PER_VEC)` to work.
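As a minimal sketch of that `inc{b|w}` trick (assuming CHAR_PER_VEC == 8, e.g. wchar_t in a 32-byte vector, and assuming `VGPR_SZ` pastes the 64-bit register name with a bit width as described here): incrementing the 8 significant mask bits wraps 0xff to zero, so the ZF test alone says "every character matched" without comparing against a constant.

```
/* Illustrative only; register choice and label are made up.  Assumes
   VGPR_SZ(rax, CHAR_PER_VEC) expands to al when CHAR_PER_VEC == 8.  */
	vpcmpeqd %ymm17, %ymm16, %k0
	kmovd	%k0, %eax			/* only bits 0..7 can be set.  */
	incb	%VGPR_SZ(rax, CHAR_PER_VEC)	/* incb %al: ZF iff mask == 0xff.  */
	jz	L(all_chars_match)
```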


>
> H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 19:00             ` Noah Goldstein
@ 2022-10-14 19:13               ` H.J. Lu
  2022-10-14 19:15                 ` Noah Goldstein
  0 siblings, 1 reply; 72+ messages in thread
From: H.J. Lu @ 2022-10-14 19:13 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 12:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Oct 14, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Oct 14, 2022 at 11:38 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Fri, Oct 14, 2022 at 1:35 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Fri, Oct 14, 2022 at 11:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > >  On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > > >
> > > > > > > This is to make it easier to do think like:
> > > > > > > ```
> > > > > > > vpcmpb %VEC(0), %VEC(1), %k0
> > > > > > > kmov{d|q} %k0, %{eax|rax}
> > > > > > > test %{eax|rax}
> > > > > > > ```
> > > > > >
> > > > > > Since all these register macros are based on VEC_SIZE which is either 32
> > > > > > bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are
> > > > > > needed.  8-bit and 16-bit macros aren't needed.
> > > > > >
> > > > > > > It adds macro s.t any GPR can get the proper width with:
> > > > > > >     `V{upper_case_GPR_name}`
> > > > > > >
> > > > > > > and any mask insn can get the proper width with:
> > > > > > >     `{mask_insn_without_postfix}V`
> > > > > >
> > > > > > All macros should be in upper cases.
> > > > > >
> > > > > > > This commit does not change libc.so
> > > > > > >
> > > > > > > Tested build on x86-64
> > > > > > > ---
> > > > > > >  sysdeps/x86_64/multiarch/reg-macros.h         | 337 ++++++++++++++++++
> > > > > > >  .../multiarch/scripts/gen-reg-map-macros.py   |  97 +++++
> > > > > > >  2 files changed, 434 insertions(+)
> > > > > > >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> > > > > > >
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > > new file mode 100644
> > > > > > > index 0000000000..c4d7f57b66
> > > > > > > --- /dev/null
> > > > > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > >
> > > > > > vreg-macros.h to indicate macros based on vector size.   Please
> > > > > > add comments to indicate that register macros are expanded based
> > > > > > on vector size.
> > > > > >
> > > > > > > @@ -0,0 +1,337 @@
> > > > > > > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py.
> > > > > > > +
> > > > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > +   This file is part of the GNU C Library.
> > > > > > > +
> > > > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > > > +   License as published by the Free Software Foundation; either
> > > > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > > > +
> > > > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > +   Lesser General Public License for more details.
> > > > > > > +
> > > > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > > > +   License along with the GNU C Library; if not, see
> > > > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > > > +
> > > > > > > +#ifndef _REG_MAP_MACROS_H
> > > > > > > +#define _REG_MAP_MACROS_H      1
> > > > > > > +
> > > > > > > +#define rax_8  al
> > > > > > > +#define eax_8  al
> > > > > > > +#define ax_8   al
> > > > > > > +#define al_8   al
> > > > > > > +#define rax_16 ax
> > > > > > > +#define eax_16 ax
> > > > > > > +#define ax_16  ax
> > > > > > > +#define al_16  ax
> > > > > > > +#define rax_32 eax
> > > > > > > +#define eax_32 eax
> > > > > > > +#define ax_32  eax
> > > > > > > +#define al_32  eax
> > > > > > > +#define rax_64 rax
> > > > > > > +#define eax_64 rax
> > > > > > > +#define ax_64  rax
> > > > > > > +#define al_64  rax
> > > > > >
> > > > > > Only rax_32 and rax_64 are needed.
> > > > > >
> > > > > > > +#define rbx_8  bl
> > > > > > > +#define ebx_8  bl
> > > > > > > +#define bx_8   bl
> > > > > > > +#define bl_8   bl
> > > > > > > +#define rbx_16 bx
> > > > > > > +#define ebx_16 bx
> > > > > > > +#define bx_16  bx
> > > > > > > +#define bl_16  bx
> > > > > > > +#define rbx_32 ebx
> > > > > > > +#define ebx_32 ebx
> > > > > > > +#define bx_32  ebx
> > > > > > > +#define bl_32  ebx
> > > > > > > +#define rbx_64 rbx
> > > > > > > +#define ebx_64 rbx
> > > > > > > +#define bx_64  rbx
> > > > > > > +#define bl_64  rbx
> > > > > > > +#define rcx_8  cl
> > > > > > > +#define ecx_8  cl
> > > > > > > +#define cx_8   cl
> > > > > > > +#define cl_8   cl
> > > > > > > +#define rcx_16 cx
> > > > > > > +#define ecx_16 cx
> > > > > > > +#define cx_16  cx
> > > > > > > +#define cl_16  cx
> > > > > > > +#define rcx_32 ecx
> > > > > > > +#define ecx_32 ecx
> > > > > > > +#define cx_32  ecx
> > > > > > > +#define cl_32  ecx
> > > > > > > +#define rcx_64 rcx
> > > > > > > +#define ecx_64 rcx
> > > > > > > +#define cx_64  rcx
> > > > > > > +#define cl_64  rcx
> > > > > > > +#define rdx_8  dl
> > > > > > > +#define edx_8  dl
> > > > > > > +#define dx_8   dl
> > > > > > > +#define dl_8   dl
> > > > > > > +#define rdx_16 dx
> > > > > > > +#define edx_16 dx
> > > > > > > +#define dx_16  dx
> > > > > > > +#define dl_16  dx
> > > > > > > +#define rdx_32 edx
> > > > > > > +#define edx_32 edx
> > > > > > > +#define dx_32  edx
> > > > > > > +#define dl_32  edx
> > > > > > > +#define rdx_64 rdx
> > > > > > > +#define edx_64 rdx
> > > > > > > +#define dx_64  rdx
> > > > > > > +#define dl_64  rdx
> > > > > > > +#define rbp_8  bpl
> > > > > > > +#define ebp_8  bpl
> > > > > > > +#define bp_8   bpl
> > > > > > > +#define bpl_8  bpl
> > > > > > > +#define rbp_16 bp
> > > > > > > +#define ebp_16 bp
> > > > > > > +#define bp_16  bp
> > > > > > > +#define bpl_16 bp
> > > > > > > +#define rbp_32 ebp
> > > > > > > +#define ebp_32 ebp
> > > > > > > +#define bp_32  ebp
> > > > > > > +#define bpl_32 ebp
> > > > > > > +#define rbp_64 rbp
> > > > > > > +#define ebp_64 rbp
> > > > > > > +#define bp_64  rbp
> > > > > > > +#define bpl_64 rbp
> > > > > > > +#define rsp_8  spl
> > > > > > > +#define esp_8  spl
> > > > > > > +#define sp_8   spl
> > > > > > > +#define spl_8  spl
> > > > > > > +#define rsp_16 sp
> > > > > > > +#define esp_16 sp
> > > > > > > +#define sp_16  sp
> > > > > > > +#define spl_16 sp
> > > > > > > +#define rsp_32 esp
> > > > > > > +#define esp_32 esp
> > > > > > > +#define sp_32  esp
> > > > > > > +#define spl_32 esp
> > > > > > > +#define rsp_64 rsp
> > > > > > > +#define esp_64 rsp
> > > > > > > +#define sp_64  rsp
> > > > > > > +#define spl_64 rsp
> > > > > > > +#define rsi_8  sil
> > > > > > > +#define esi_8  sil
> > > > > > > +#define si_8   sil
> > > > > > > +#define sil_8  sil
> > > > > > > +#define rsi_16 si
> > > > > > > +#define esi_16 si
> > > > > > > +#define si_16  si
> > > > > > > +#define sil_16 si
> > > > > > > +#define rsi_32 esi
> > > > > > > +#define esi_32 esi
> > > > > > > +#define si_32  esi
> > > > > > > +#define sil_32 esi
> > > > > > > +#define rsi_64 rsi
> > > > > > > +#define esi_64 rsi
> > > > > > > +#define si_64  rsi
> > > > > > > +#define sil_64 rsi
> > > > > > > +#define rdi_8  dil
> > > > > > > +#define edi_8  dil
> > > > > > > +#define di_8   dil
> > > > > > > +#define dil_8  dil
> > > > > > > +#define rdi_16 di
> > > > > > > +#define edi_16 di
> > > > > > > +#define di_16  di
> > > > > > > +#define dil_16 di
> > > > > > > +#define rdi_32 edi
> > > > > > > +#define edi_32 edi
> > > > > > > +#define di_32  edi
> > > > > > > +#define dil_32 edi
> > > > > > > +#define rdi_64 rdi
> > > > > > > +#define edi_64 rdi
> > > > > > > +#define di_64  rdi
> > > > > > > +#define dil_64 rdi
> > > > > > > +#define r8_8   r8b
> > > > > > > +#define r8d_8  r8b
> > > > > > > +#define r8w_8  r8b
> > > > > > > +#define r8b_8  r8b
> > > > > > > +#define r8_16  r8w
> > > > > > > +#define r8d_16 r8w
> > > > > > > +#define r8w_16 r8w
> > > > > > > +#define r8b_16 r8w
> > > > > > > +#define r8_32  r8d
> > > > > > > +#define r8d_32 r8d
> > > > > > > +#define r8w_32 r8d
> > > > > > > +#define r8b_32 r8d
> > > > > > > +#define r8_64  r8
> > > > > > > +#define r8d_64 r8
> > > > > > > +#define r8w_64 r8
> > > > > > > +#define r8b_64 r8
> > > > > > > +#define r9_8   r9b
> > > > > > > +#define r9d_8  r9b
> > > > > > > +#define r9w_8  r9b
> > > > > > > +#define r9b_8  r9b
> > > > > > > +#define r9_16  r9w
> > > > > > > +#define r9d_16 r9w
> > > > > > > +#define r9w_16 r9w
> > > > > > > +#define r9b_16 r9w
> > > > > > > +#define r9_32  r9d
> > > > > > > +#define r9d_32 r9d
> > > > > > > +#define r9w_32 r9d
> > > > > > > +#define r9b_32 r9d
> > > > > > > +#define r9_64  r9
> > > > > > > +#define r9d_64 r9
> > > > > > > +#define r9w_64 r9
> > > > > > > +#define r9b_64 r9
> > > > > > > +#define r10_8  r10b
> > > > > > > +#define r10d_8 r10b
> > > > > > > +#define r10w_8 r10b
> > > > > > > +#define r10b_8 r10b
> > > > > > > +#define r10_16 r10w
> > > > > > > +#define r10d_16        r10w
> > > > > > > +#define r10w_16        r10w
> > > > > > > +#define r10b_16        r10w
> > > > > > > +#define r10_32 r10d
> > > > > > > +#define r10d_32        r10d
> > > > > > > +#define r10w_32        r10d
> > > > > > > +#define r10b_32        r10d
> > > > > > > +#define r10_64 r10
> > > > > > > +#define r10d_64        r10
> > > > > > > +#define r10w_64        r10
> > > > > > > +#define r10b_64        r10
> > > > > > > +#define r11_8  r11b
> > > > > > > +#define r11d_8 r11b
> > > > > > > +#define r11w_8 r11b
> > > > > > > +#define r11b_8 r11b
> > > > > > > +#define r11_16 r11w
> > > > > > > +#define r11d_16        r11w
> > > > > > > +#define r11w_16        r11w
> > > > > > > +#define r11b_16        r11w
> > > > > > > +#define r11_32 r11d
> > > > > > > +#define r11d_32        r11d
> > > > > > > +#define r11w_32        r11d
> > > > > > > +#define r11b_32        r11d
> > > > > > > +#define r11_64 r11
> > > > > > > +#define r11d_64        r11
> > > > > > > +#define r11w_64        r11
> > > > > > > +#define r11b_64        r11
> > > > > > > +#define r12_8  r12b
> > > > > > > +#define r12d_8 r12b
> > > > > > > +#define r12w_8 r12b
> > > > > > > +#define r12b_8 r12b
> > > > > > > +#define r12_16 r12w
> > > > > > > +#define r12d_16        r12w
> > > > > > > +#define r12w_16        r12w
> > > > > > > +#define r12b_16        r12w
> > > > > > > +#define r12_32 r12d
> > > > > > > +#define r12d_32        r12d
> > > > > > > +#define r12w_32        r12d
> > > > > > > +#define r12b_32        r12d
> > > > > > > +#define r12_64 r12
> > > > > > > +#define r12d_64        r12
> > > > > > > +#define r12w_64        r12
> > > > > > > +#define r12b_64        r12
> > > > > > > +#define r13_8  r13b
> > > > > > > +#define r13d_8 r13b
> > > > > > > +#define r13w_8 r13b
> > > > > > > +#define r13b_8 r13b
> > > > > > > +#define r13_16 r13w
> > > > > > > +#define r13d_16        r13w
> > > > > > > +#define r13w_16        r13w
> > > > > > > +#define r13b_16        r13w
> > > > > > > +#define r13_32 r13d
> > > > > > > +#define r13d_32        r13d
> > > > > > > +#define r13w_32        r13d
> > > > > > > +#define r13b_32        r13d
> > > > > > > +#define r13_64 r13
> > > > > > > +#define r13d_64        r13
> > > > > > > +#define r13w_64        r13
> > > > > > > +#define r13b_64        r13
> > > > > > > +#define r14_8  r14b
> > > > > > > +#define r14d_8 r14b
> > > > > > > +#define r14w_8 r14b
> > > > > > > +#define r14b_8 r14b
> > > > > > > +#define r14_16 r14w
> > > > > > > +#define r14d_16        r14w
> > > > > > > +#define r14w_16        r14w
> > > > > > > +#define r14b_16        r14w
> > > > > > > +#define r14_32 r14d
> > > > > > > +#define r14d_32        r14d
> > > > > > > +#define r14w_32        r14d
> > > > > > > +#define r14b_32        r14d
> > > > > > > +#define r14_64 r14
> > > > > > > +#define r14d_64        r14
> > > > > > > +#define r14w_64        r14
> > > > > > > +#define r14b_64        r14
> > > > > > > +#define r15_8  r15b
> > > > > > > +#define r15d_8 r15b
> > > > > > > +#define r15w_8 r15b
> > > > > > > +#define r15b_8 r15b
> > > > > > > +#define r15_16 r15w
> > > > > > > +#define r15d_16        r15w
> > > > > > > +#define r15w_16        r15w
> > > > > > > +#define r15b_16        r15w
> > > > > > > +#define r15_32 r15d
> > > > > > > +#define r15d_32        r15d
> > > > > > > +#define r15w_32        r15d
> > > > > > > +#define r15b_32        r15d
> > > > > > > +#define r15_64 r15
> > > > > > > +#define r15d_64        r15
> > > > > > > +#define r15w_64        r15
> > > > > > > +#define r15b_64        r15
> > > > > > > +
> > > > > > > +#define VRAX   VGPR(rax)
> > > > > > > +#define VRBX   VGPR(rbx)
> > > > > > > +#define VRCX   VGPR(rcx)
> > > > > > > +#define VRDX   VGPR(rdx)
> > > > > > > +#define VRBP   VGPR(rbp)
> > > > > > > +#define VRSP   VGPR(rsp)
> > > > > > > +#define VRSI   VGPR(rsi)
> > > > > > > +#define VRDI   VGPR(rdi)
> > > > > > > +#define VR8    VGPR(r8)
> > > > > > > +#define VR9    VGPR(r9)
> > > > > > > +#define VR10   VGPR(r10)
> > > > > > > +#define VR11   VGPR(r11)
> > > > > > > +#define VR12   VGPR(r12)
> > > > > > > +#define VR13   VGPR(r13)
> > > > > > > +#define VR14   VGPR(r14)
> > > > > > > +#define VR15   VGPR(r15)
> > > > > > > +
> > > > > > > +#define kmov_8 kmovb
> > > > > > > +#define kmov_16        kmovw
> > > > > > > +#define kmov_32        kmovd
> > > > > > > +#define kmov_64        kmovq
> > > > > >
> > > > > > Only 32 and 64 are needed.
> > > > >
> > > > > Thats not entirely true for the wide-char impls.
> > > > > >
> > > > > > > +#define kortest_8      kortestb
> > > > > > > +#define kortest_16     kortestw
> > > > > > > +#define kortest_32     kortestd
> > > > > > > +#define kortest_64     kortestq
> > > > > > > +#define kor_8  korb
> > > > > > > +#define kor_16 korw
> > > > > > > +#define kor_32 kord
> > > > > > > +#define kor_64 korq
> > > > > > > +#define ktest_8        ktestb
> > > > > > > +#define ktest_16       ktestw
> > > > > > > +#define ktest_32       ktestd
> > > > > > > +#define ktest_64       ktestq
> > > > > > > +#define kand_8 kandb
> > > > > > > +#define kand_16        kandw
> > > > > > > +#define kand_32        kandd
> > > > > > > +#define kand_64        kandq
> > > > > > > +#define kxor_8 kxorb
> > > > > > > +#define kxor_16        kxorw
> > > > > > > +#define kxor_32        kxord
> > > > > > > +#define kxor_64        kxorq
> > > > > > > +
> > > > > > > +#define kmovV  VKINSN_SZ(kmov, REG_WIDTH)
> > > > > > > +#define kortestV       VKINSN_SZ(kortest, REG_WIDTH)
> > > > > > > +#define korV   VKINSN_SZ(kor, REG_WIDTH)
> > > > > > > +#define ktestV         VKINSN_SZ(ktest, REG_WIDTH)
> > > > > > > +#define kandV  VKINSN_SZ(kand, REG_WIDTH)
> > > > > > > +#define kxorV  VKINSN_SZ(kxor, REG_WIDTH)
> > > > > >
> > > > > > #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH)
> > > > >
> > > > > Will fix for V5.
> > > > > >
> > > > > > > +
> > > > > > > +#ifndef REG_WIDTH
> > > > > > > +#define REG_WIDTH VEC_SIZE
> > > > > >
> > > > > > Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH
> > > > > > can be dropped.
> > > > >
> > > > > Thats not quite true.
> > > > >
> > > > > For wide-char impls REG_WIDTH != VEC_SIZE.
> > > >
> > > > These register macros are used to operate vectors.  Do you have
> > > > an example of REG_WIDTH != VEC_SIZE?
> > >
> > > But since wide-char instructions use 32-bit comparison the resulting
> > > mask is < 64-bit i.e:
> > >
> > > vpcmpd %zmm16, %zmm17, %k0
> > > kmovd %k0, %eax
> > > will collect all the necessary bits and is prefered.
> > >
> > > Next version of Sunil's memchr-evex512 should have it.
> > >
> >
> > So it is based on CHAR_PER_VEC.  When will 8-bit and 16-bit
> > registers be used?
>
> In a sense. generally even if CHAR_PER_VEC < 32 its better to use
> 32 but in some cases where you want to use `inc{b|w}` to test for
> all 1s its useful for `VGPR_SZ(rax, CHAR_PER_VEC)` to work.

We only need

#define rax_8  al

not

#define eax_8  al
#define ax_8   al
#define al_8   al
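A rough sketch of what that reduced table could look like, together with the two-level paste that lets REG_WIDTH or CHAR_PER_VEC expand to a number before concatenation; the helper name VGPR_SZ_1 and the exact spellings are illustrative, not the committed interface:

```
/* Only the canonical 64-bit name needs per-width mappings.  */
#define rax_8	al
#define rax_16	ax
#define rax_32	eax
#define rax_64	rax

/* Two-level paste so macro arguments (REG_WIDTH, CHAR_PER_VEC, ...)
   are expanded to plain numbers before the ## concatenation.  */
#define VGPR_SZ_1(reg_64, sz)	reg_64##_##sz
#define VGPR_SZ(reg_64, sz)	VGPR_SZ_1 (reg_64, sz)
#define VGPR(reg_64)		VGPR_SZ (reg_64, REG_WIDTH)

/* e.g.  REG_WIDTH == 32:    VGPR (rax)                  -> rax_32 -> eax
         CHAR_PER_VEC == 8:  VGPR_SZ (rax, CHAR_PER_VEC) -> rax_8  -> al  */
```

The same pattern would cover the mask instructions (kmov_32/kmov_64 plus a wrapper), which is the `#define VKINSN(op) VKINSN_SZ(op, REG_WIDTH)` shape suggested earlier in the thread.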

-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 19:13               ` H.J. Lu
@ 2022-10-14 19:15                 ` Noah Goldstein
  0 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 19:15 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 2:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Oct 14, 2022 at 12:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Oct 14, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Oct 14, 2022 at 11:38 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Fri, Oct 14, 2022 at 1:35 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Fri, Oct 14, 2022 at 11:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Oct 14, 2022 at 1:02 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > >  On Fri, Oct 14, 2022 at 9:40 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > > > >
> > > > > > > > This is to make it easier to do think like:
> > > > > > > > ```
> > > > > > > > vpcmpb %VEC(0), %VEC(1), %k0
> > > > > > > > kmov{d|q} %k0, %{eax|rax}
> > > > > > > > test %{eax|rax}
> > > > > > > > ```
> > > > > > >
> > > > > > > Since all these register macros are based on VEC_SIZE which is either 32
> > > > > > > bytes or 64 bytes, only 32-bit or 64-bit integer and mask register macros are
> > > > > > > needed.  8-bit and 16-bit macros aren't needed.
> > > > > > >
> > > > > > > > It adds macro s.t any GPR can get the proper width with:
> > > > > > > >     `V{upper_case_GPR_name}`
> > > > > > > >
> > > > > > > > and any mask insn can get the proper width with:
> > > > > > > >     `{mask_insn_without_postfix}V`
> > > > > > >
> > > > > > > All macros should be in upper cases.
> > > > > > >
> > > > > > > > This commit does not change libc.so
> > > > > > > >
> > > > > > > > Tested build on x86-64
> > > > > > > > ---
> > > > > > > >  sysdeps/x86_64/multiarch/reg-macros.h         | 337 ++++++++++++++++++
> > > > > > > >  .../multiarch/scripts/gen-reg-map-macros.py   |  97 +++++
> > > > > > > >  2 files changed, 434 insertions(+)
> > > > > > > >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > > >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py
> > > > > > > >
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > > > new file mode 100644
> > > > > > > > index 0000000000..c4d7f57b66
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > >
> > > > > > > vreg-macros.h to indicate macros based on vector size.   Please
> > > > > > > add comments to indicate that register macros are expanded based
> > > > > > > on vector size.
> > > > > > >
> > > > > > > > @@ -0,0 +1,337 @@
> > > > > > > > +/* This file was generated by: sysdeps/x86_64/multiarch/scripts/gen-reg-map-macros.py.
> > > > > > > > +
> > > > > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > > +   This file is part of the GNU C Library.
> > > > > > > > +
> > > > > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > > > > +   License as published by the Free Software Foundation; either
> > > > > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > > > > +
> > > > > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > > +   Lesser General Public License for more details.
> > > > > > > > +
> > > > > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > > > > +   License along with the GNU C Library; if not, see
> > > > > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > > > > +
> > > > > > > > +#ifndef _REG_MAP_MACROS_H
> > > > > > > > +#define _REG_MAP_MACROS_H      1
> > > > > > > > +
> > > > > > > > +#define rax_8  al
> > > > > > > > +#define eax_8  al
> > > > > > > > +#define ax_8   al
> > > > > > > > +#define al_8   al
> > > > > > > > +#define rax_16 ax
> > > > > > > > +#define eax_16 ax
> > > > > > > > +#define ax_16  ax
> > > > > > > > +#define al_16  ax
> > > > > > > > +#define rax_32 eax
> > > > > > > > +#define eax_32 eax
> > > > > > > > +#define ax_32  eax
> > > > > > > > +#define al_32  eax
> > > > > > > > +#define rax_64 rax
> > > > > > > > +#define eax_64 rax
> > > > > > > > +#define ax_64  rax
> > > > > > > > +#define al_64  rax
> > > > > > >
> > > > > > > Only rax_32 and rax_64 are needed.
> > > > > > >
> > > > > > > > +#define rbx_8  bl
> > > > > > > > +#define ebx_8  bl
> > > > > > > > +#define bx_8   bl
> > > > > > > > +#define bl_8   bl
> > > > > > > > +#define rbx_16 bx
> > > > > > > > +#define ebx_16 bx
> > > > > > > > +#define bx_16  bx
> > > > > > > > +#define bl_16  bx
> > > > > > > > +#define rbx_32 ebx
> > > > > > > > +#define ebx_32 ebx
> > > > > > > > +#define bx_32  ebx
> > > > > > > > +#define bl_32  ebx
> > > > > > > > +#define rbx_64 rbx
> > > > > > > > +#define ebx_64 rbx
> > > > > > > > +#define bx_64  rbx
> > > > > > > > +#define bl_64  rbx
> > > > > > > > +#define rcx_8  cl
> > > > > > > > +#define ecx_8  cl
> > > > > > > > +#define cx_8   cl
> > > > > > > > +#define cl_8   cl
> > > > > > > > +#define rcx_16 cx
> > > > > > > > +#define ecx_16 cx
> > > > > > > > +#define cx_16  cx
> > > > > > > > +#define cl_16  cx
> > > > > > > > +#define rcx_32 ecx
> > > > > > > > +#define ecx_32 ecx
> > > > > > > > +#define cx_32  ecx
> > > > > > > > +#define cl_32  ecx
> > > > > > > > +#define rcx_64 rcx
> > > > > > > > +#define ecx_64 rcx
> > > > > > > > +#define cx_64  rcx
> > > > > > > > +#define cl_64  rcx
> > > > > > > > +#define rdx_8  dl
> > > > > > > > +#define edx_8  dl
> > > > > > > > +#define dx_8   dl
> > > > > > > > +#define dl_8   dl
> > > > > > > > +#define rdx_16 dx
> > > > > > > > +#define edx_16 dx
> > > > > > > > +#define dx_16  dx
> > > > > > > > +#define dl_16  dx
> > > > > > > > +#define rdx_32 edx
> > > > > > > > +#define edx_32 edx
> > > > > > > > +#define dx_32  edx
> > > > > > > > +#define dl_32  edx
> > > > > > > > +#define rdx_64 rdx
> > > > > > > > +#define edx_64 rdx
> > > > > > > > +#define dx_64  rdx
> > > > > > > > +#define dl_64  rdx
> > > > > > > > +#define rbp_8  bpl
> > > > > > > > +#define ebp_8  bpl
> > > > > > > > +#define bp_8   bpl
> > > > > > > > +#define bpl_8  bpl
> > > > > > > > +#define rbp_16 bp
> > > > > > > > +#define ebp_16 bp
> > > > > > > > +#define bp_16  bp
> > > > > > > > +#define bpl_16 bp
> > > > > > > > +#define rbp_32 ebp
> > > > > > > > +#define ebp_32 ebp
> > > > > > > > +#define bp_32  ebp
> > > > > > > > +#define bpl_32 ebp
> > > > > > > > +#define rbp_64 rbp
> > > > > > > > +#define ebp_64 rbp
> > > > > > > > +#define bp_64  rbp
> > > > > > > > +#define bpl_64 rbp
> > > > > > > > +#define rsp_8  spl
> > > > > > > > +#define esp_8  spl
> > > > > > > > +#define sp_8   spl
> > > > > > > > +#define spl_8  spl
> > > > > > > > +#define rsp_16 sp
> > > > > > > > +#define esp_16 sp
> > > > > > > > +#define sp_16  sp
> > > > > > > > +#define spl_16 sp
> > > > > > > > +#define rsp_32 esp
> > > > > > > > +#define esp_32 esp
> > > > > > > > +#define sp_32  esp
> > > > > > > > +#define spl_32 esp
> > > > > > > > +#define rsp_64 rsp
> > > > > > > > +#define esp_64 rsp
> > > > > > > > +#define sp_64  rsp
> > > > > > > > +#define spl_64 rsp
> > > > > > > > +#define rsi_8  sil
> > > > > > > > +#define esi_8  sil
> > > > > > > > +#define si_8   sil
> > > > > > > > +#define sil_8  sil
> > > > > > > > +#define rsi_16 si
> > > > > > > > +#define esi_16 si
> > > > > > > > +#define si_16  si
> > > > > > > > +#define sil_16 si
> > > > > > > > +#define rsi_32 esi
> > > > > > > > +#define esi_32 esi
> > > > > > > > +#define si_32  esi
> > > > > > > > +#define sil_32 esi
> > > > > > > > +#define rsi_64 rsi
> > > > > > > > +#define esi_64 rsi
> > > > > > > > +#define si_64  rsi
> > > > > > > > +#define sil_64 rsi
> > > > > > > > +#define rdi_8  dil
> > > > > > > > +#define edi_8  dil
> > > > > > > > +#define di_8   dil
> > > > > > > > +#define dil_8  dil
> > > > > > > > +#define rdi_16 di
> > > > > > > > +#define edi_16 di
> > > > > > > > +#define di_16  di
> > > > > > > > +#define dil_16 di
> > > > > > > > +#define rdi_32 edi
> > > > > > > > +#define edi_32 edi
> > > > > > > > +#define di_32  edi
> > > > > > > > +#define dil_32 edi
> > > > > > > > +#define rdi_64 rdi
> > > > > > > > +#define edi_64 rdi
> > > > > > > > +#define di_64  rdi
> > > > > > > > +#define dil_64 rdi
> > > > > > > > +#define r8_8   r8b
> > > > > > > > +#define r8d_8  r8b
> > > > > > > > +#define r8w_8  r8b
> > > > > > > > +#define r8b_8  r8b
> > > > > > > > +#define r8_16  r8w
> > > > > > > > +#define r8d_16 r8w
> > > > > > > > +#define r8w_16 r8w
> > > > > > > > +#define r8b_16 r8w
> > > > > > > > +#define r8_32  r8d
> > > > > > > > +#define r8d_32 r8d
> > > > > > > > +#define r8w_32 r8d
> > > > > > > > +#define r8b_32 r8d
> > > > > > > > +#define r8_64  r8
> > > > > > > > +#define r8d_64 r8
> > > > > > > > +#define r8w_64 r8
> > > > > > > > +#define r8b_64 r8
> > > > > > > > +#define r9_8   r9b
> > > > > > > > +#define r9d_8  r9b
> > > > > > > > +#define r9w_8  r9b
> > > > > > > > +#define r9b_8  r9b
> > > > > > > > +#define r9_16  r9w
> > > > > > > > +#define r9d_16 r9w
> > > > > > > > +#define r9w_16 r9w
> > > > > > > > +#define r9b_16 r9w
> > > > > > > > +#define r9_32  r9d
> > > > > > > > +#define r9d_32 r9d
> > > > > > > > +#define r9w_32 r9d
> > > > > > > > +#define r9b_32 r9d
> > > > > > > > +#define r9_64  r9
> > > > > > > > +#define r9d_64 r9
> > > > > > > > +#define r9w_64 r9
> > > > > > > > +#define r9b_64 r9
> > > > > > > > +#define r10_8  r10b
> > > > > > > > +#define r10d_8 r10b
> > > > > > > > +#define r10w_8 r10b
> > > > > > > > +#define r10b_8 r10b
> > > > > > > > +#define r10_16 r10w
> > > > > > > > +#define r10d_16        r10w
> > > > > > > > +#define r10w_16        r10w
> > > > > > > > +#define r10b_16        r10w
> > > > > > > > +#define r10_32 r10d
> > > > > > > > +#define r10d_32        r10d
> > > > > > > > +#define r10w_32        r10d
> > > > > > > > +#define r10b_32        r10d
> > > > > > > > +#define r10_64 r10
> > > > > > > > +#define r10d_64        r10
> > > > > > > > +#define r10w_64        r10
> > > > > > > > +#define r10b_64        r10
> > > > > > > > +#define r11_8  r11b
> > > > > > > > +#define r11d_8 r11b
> > > > > > > > +#define r11w_8 r11b
> > > > > > > > +#define r11b_8 r11b
> > > > > > > > +#define r11_16 r11w
> > > > > > > > +#define r11d_16        r11w
> > > > > > > > +#define r11w_16        r11w
> > > > > > > > +#define r11b_16        r11w
> > > > > > > > +#define r11_32 r11d
> > > > > > > > +#define r11d_32        r11d
> > > > > > > > +#define r11w_32        r11d
> > > > > > > > +#define r11b_32        r11d
> > > > > > > > +#define r11_64 r11
> > > > > > > > +#define r11d_64        r11
> > > > > > > > +#define r11w_64        r11
> > > > > > > > +#define r11b_64        r11
> > > > > > > > +#define r12_8  r12b
> > > > > > > > +#define r12d_8 r12b
> > > > > > > > +#define r12w_8 r12b
> > > > > > > > +#define r12b_8 r12b
> > > > > > > > +#define r12_16 r12w
> > > > > > > > +#define r12d_16        r12w
> > > > > > > > +#define r12w_16        r12w
> > > > > > > > +#define r12b_16        r12w
> > > > > > > > +#define r12_32 r12d
> > > > > > > > +#define r12d_32        r12d
> > > > > > > > +#define r12w_32        r12d
> > > > > > > > +#define r12b_32        r12d
> > > > > > > > +#define r12_64 r12
> > > > > > > > +#define r12d_64        r12
> > > > > > > > +#define r12w_64        r12
> > > > > > > > +#define r12b_64        r12
> > > > > > > > +#define r13_8  r13b
> > > > > > > > +#define r13d_8 r13b
> > > > > > > > +#define r13w_8 r13b
> > > > > > > > +#define r13b_8 r13b
> > > > > > > > +#define r13_16 r13w
> > > > > > > > +#define r13d_16        r13w
> > > > > > > > +#define r13w_16        r13w
> > > > > > > > +#define r13b_16        r13w
> > > > > > > > +#define r13_32 r13d
> > > > > > > > +#define r13d_32        r13d
> > > > > > > > +#define r13w_32        r13d
> > > > > > > > +#define r13b_32        r13d
> > > > > > > > +#define r13_64 r13
> > > > > > > > +#define r13d_64        r13
> > > > > > > > +#define r13w_64        r13
> > > > > > > > +#define r13b_64        r13
> > > > > > > > +#define r14_8  r14b
> > > > > > > > +#define r14d_8 r14b
> > > > > > > > +#define r14w_8 r14b
> > > > > > > > +#define r14b_8 r14b
> > > > > > > > +#define r14_16 r14w
> > > > > > > > +#define r14d_16        r14w
> > > > > > > > +#define r14w_16        r14w
> > > > > > > > +#define r14b_16        r14w
> > > > > > > > +#define r14_32 r14d
> > > > > > > > +#define r14d_32        r14d
> > > > > > > > +#define r14w_32        r14d
> > > > > > > > +#define r14b_32        r14d
> > > > > > > > +#define r14_64 r14
> > > > > > > > +#define r14d_64        r14
> > > > > > > > +#define r14w_64        r14
> > > > > > > > +#define r14b_64        r14
> > > > > > > > +#define r15_8  r15b
> > > > > > > > +#define r15d_8 r15b
> > > > > > > > +#define r15w_8 r15b
> > > > > > > > +#define r15b_8 r15b
> > > > > > > > +#define r15_16 r15w
> > > > > > > > +#define r15d_16        r15w
> > > > > > > > +#define r15w_16        r15w
> > > > > > > > +#define r15b_16        r15w
> > > > > > > > +#define r15_32 r15d
> > > > > > > > +#define r15d_32        r15d
> > > > > > > > +#define r15w_32        r15d
> > > > > > > > +#define r15b_32        r15d
> > > > > > > > +#define r15_64 r15
> > > > > > > > +#define r15d_64        r15
> > > > > > > > +#define r15w_64        r15
> > > > > > > > +#define r15b_64        r15
> > > > > > > > +
> > > > > > > > +#define VRAX   VGPR(rax)
> > > > > > > > +#define VRBX   VGPR(rbx)
> > > > > > > > +#define VRCX   VGPR(rcx)
> > > > > > > > +#define VRDX   VGPR(rdx)
> > > > > > > > +#define VRBP   VGPR(rbp)
> > > > > > > > +#define VRSP   VGPR(rsp)
> > > > > > > > +#define VRSI   VGPR(rsi)
> > > > > > > > +#define VRDI   VGPR(rdi)
> > > > > > > > +#define VR8    VGPR(r8)
> > > > > > > > +#define VR9    VGPR(r9)
> > > > > > > > +#define VR10   VGPR(r10)
> > > > > > > > +#define VR11   VGPR(r11)
> > > > > > > > +#define VR12   VGPR(r12)
> > > > > > > > +#define VR13   VGPR(r13)
> > > > > > > > +#define VR14   VGPR(r14)
> > > > > > > > +#define VR15   VGPR(r15)
> > > > > > > > +
> > > > > > > > +#define kmov_8 kmovb
> > > > > > > > +#define kmov_16        kmovw
> > > > > > > > +#define kmov_32        kmovd
> > > > > > > > +#define kmov_64        kmovq
> > > > > > >
> > > > > > > Only 32 and 64 are needed.
> > > > > >
> > > > > > Thats not entirely true for the wide-char impls.
> > > > > > >
> > > > > > > > +#define kortest_8      kortestb
> > > > > > > > +#define kortest_16     kortestw
> > > > > > > > +#define kortest_32     kortestd
> > > > > > > > +#define kortest_64     kortestq
> > > > > > > > +#define kor_8  korb
> > > > > > > > +#define kor_16 korw
> > > > > > > > +#define kor_32 kord
> > > > > > > > +#define kor_64 korq
> > > > > > > > +#define ktest_8        ktestb
> > > > > > > > +#define ktest_16       ktestw
> > > > > > > > +#define ktest_32       ktestd
> > > > > > > > +#define ktest_64       ktestq
> > > > > > > > +#define kand_8 kandb
> > > > > > > > +#define kand_16        kandw
> > > > > > > > +#define kand_32        kandd
> > > > > > > > +#define kand_64        kandq
> > > > > > > > +#define kxor_8 kxorb
> > > > > > > > +#define kxor_16        kxorw
> > > > > > > > +#define kxor_32        kxord
> > > > > > > > +#define kxor_64        kxorq
> > > > > > > > +
> > > > > > > > +#define kmovV  VKINSN_SZ(kmov, REG_WIDTH)
> > > > > > > > +#define kortestV       VKINSN_SZ(kortest, REG_WIDTH)
> > > > > > > > +#define korV   VKINSN_SZ(kor, REG_WIDTH)
> > > > > > > > +#define ktestV         VKINSN_SZ(ktest, REG_WIDTH)
> > > > > > > > +#define kandV  VKINSN_SZ(kand, REG_WIDTH)
> > > > > > > > +#define kxorV  VKINSN_SZ(kxor, REG_WIDTH)
> > > > > > >
> > > > > > > #define VKINSN(op) VKINSN_SZ(op, REG_WIDTH)
> > > > > >
> > > > > > Will fix for V5.
> > > > > > >
> > > > > > > > +
> > > > > > > > +#ifndef REG_WIDTH
> > > > > > > > +#define REG_WIDTH VEC_SIZE
> > > > > > >
> > > > > > > Since REG_WIDTH must be the same as VEC_SIZE, REG_WIDTH
> > > > > > > can be dropped.
> > > > > >
> > > > > > That's not quite true.
> > > > > >
> > > > > > For wide-char impls REG_WIDTH != VEC_SIZE.
> > > > >
> > > > > These register macros are used to operate on vectors.  Do you have
> > > > > an example of REG_WIDTH != VEC_SIZE?
> > > >
> > > > But since wide-char instructions use a 32-bit comparison, the resulting
> > > > mask is < 64-bit, i.e.:
> > > >
> > > > vpcmpd %zmm16, %zmm17, %k0
> > > > kmovd %k0, %eax
> > > > will collect all the necessary bits and is preferred.
> > > >
> > > > Next version of Sunil's memchr-evex512 should have it.
> > > >
> > >
> > > So it is based on CHAR_PER_VEC.  When will 8-bit and 16-bit
> > > registers be used?
> >
> > In a sense.  Generally, even if CHAR_PER_VEC < 32 it's better to use
> > 32, but in some cases where you want to use `inc{b|w}` to test for
> > all 1s it's useful for `VGPR_SZ(rax, CHAR_PER_VEC)` to work.
>
> We only need
>
> #define rax_8  al
>
> not
>
> #define eax_8  al
> #define ax_8   al
> #define al_8   al
>

That's fair. I guess I was thinking this would be the easiest, as you
can replace any GPR use with the macro, but would you prefer for these
only to apply to the 64-bit variants?
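
Concretely, something like (illustrative only; assumes CHAR_PER_VEC == 16
so `VGPR_SZ(rax, CHAR_PER_VEC)` expands to %ax, and the label is made up):

kmovd %k0, %eax
inc %VGPR_SZ(rax, CHAR_PER_VEC)
jz L(all_chars_match)

The 16-bit inc wraps %ax to zero exactly when all 16 mask bits are set,
which is the all-1s test I mean.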
> --
> H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v5 1/3] x86: Update evex256/512 vec macros
  2022-10-14 16:40 [PATCH v1 1/3] x86: Update evex256/512 vec macros Noah Goldstein
                   ` (5 preceding siblings ...)
  2022-10-14 18:41 ` [PATCH v4 1/3] x86: Update evex256/512 vec macros Noah Goldstein
@ 2022-10-14 21:14 ` Noah Goldstein
  2022-10-14 21:15   ` [PATCH v5 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
  2022-10-14 21:15   ` [PATCH v5 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
  2022-10-14 22:39 ` [PATCH v6 1/7] x86: Update and move evex256/512 vec macros Noah Goldstein
                   ` (3 subsequent siblings)
  10 siblings, 2 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 21:14 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

1) Make SECTION(p) only define if there is not a previous definition.
2) Add `VMM_lo` definition for the proper reg-width but kept in the
   ymm0-15/zmm0-15 range (sketched below).
3) Rename the `VEC(N)` macro family to `VMM(N)` and convert the
   memmove, memset, and memrchr-evex implementations to the new names.
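
A minimal sketch of how the new names expand with `evex256-vecs.h`
included (illustrative only, not taken from the diff below):
```
	/* VMM(1) expands to ymm17 (the EVEX-only ymm16-31 range).  */
	VMOVU	(%rsi), %VMM(1)
	/* VMM_lo(1) expands to ymm1, the same width but kept in the
	   ymm0-15 range.  */
	VMOVU	%VMM_lo(1), (%rdi)
```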

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/avx-vecs.h           |   4 +-
 sysdeps/x86_64/multiarch/evex-vecs-common.h   |   6 +-
 sysdeps/x86_64/multiarch/evex256-vecs.h       |   7 +-
 sysdeps/x86_64/multiarch/evex512-vecs.h       |   7 +-
 .../memmove-avx-unaligned-erms-rtm.S          |  15 +-
 .../multiarch/memmove-avx-unaligned-erms.S    |   9 +-
 .../multiarch/memmove-avx512-unaligned-erms.S |  30 +-
 .../multiarch/memmove-evex-unaligned-erms.S   |  30 +-
 .../multiarch/memmove-sse2-unaligned-erms.S   |  11 +-
 .../multiarch/memmove-vec-unaligned-erms.S    | 262 +++++++++---------
 sysdeps/x86_64/multiarch/memrchr-evex.S       |  40 +--
 .../memset-avx2-unaligned-erms-rtm.S          |   8 +-
 .../multiarch/memset-avx2-unaligned-erms.S    |  14 +-
 .../multiarch/memset-avx512-unaligned-erms.S  |  20 +-
 .../multiarch/memset-evex-unaligned-erms.S    |  20 +-
 .../multiarch/memset-sse2-unaligned-erms.S    |  10 +-
 .../multiarch/memset-vec-unaligned-erms.S     |  70 +++--
 sysdeps/x86_64/multiarch/sse2-vecs.h          |   4 +-
 sysdeps/x86_64/multiarch/vec-macros.h         | 112 ++++----
 19 files changed, 271 insertions(+), 408 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h
index 89680f5db8..e0c865eaef 100644
--- a/sysdeps/x86_64/multiarch/avx-vecs.h
+++ b/sysdeps/x86_64/multiarch/avx-vecs.h
@@ -41,7 +41,7 @@
 #define VMOVNT				vmovntdq
 
 /* Often need to access xmm portion.  */
-#define VEC_xmm				VEC_any_xmm
-#define VEC					VEC_any_ymm
+#define VMM_128				VMM_any_xmm
+#define VMM					VMM_any_ymm
 
 #endif
diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h
index 99806ebcd7..5468de8d42 100644
--- a/sysdeps/x86_64/multiarch/evex-vecs-common.h
+++ b/sysdeps/x86_64/multiarch/evex-vecs-common.h
@@ -32,8 +32,8 @@
 #define VMOVA				vmovdqa64
 #define VMOVNT				vmovntdq
 
-#define VEC_xmm				VEC_hi_xmm
-#define VEC_ymm				VEC_hi_ymm
-#define VEC_zmm				VEC_hi_zmm
+#define VMM_128				VMM_hi_xmm
+#define VMM_256				VMM_hi_ymm
+#define VMM_512				VMM_hi_zmm
 
 #endif
diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
index 222ba46dc7..497020fbbc 100644
--- a/sysdeps/x86_64/multiarch/evex256-vecs.h
+++ b/sysdeps/x86_64/multiarch/evex256-vecs.h
@@ -28,8 +28,11 @@
 #include "evex-vecs-common.h"
 
 #define USE_WITH_EVEX256	1
-#define SECTION(p)			p##.evex
 
-#define VEC					VEC_ymm
+#ifndef SECTION
+# define SECTION(p)			p##.evex
+#endif
 
+#define VMM					VMM_256
+#define VMM_lo				VMM_any_ymm
 #endif
diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
index d1784d5368..2ddc7cb41e 100644
--- a/sysdeps/x86_64/multiarch/evex512-vecs.h
+++ b/sysdeps/x86_64/multiarch/evex512-vecs.h
@@ -28,8 +28,11 @@
 #include "evex-vecs-common.h"
 
 #define USE_WITH_EVEX512	1
-#define SECTION(p)			p##.evex512
 
-#define VEC					VEC_zmm
+#ifndef SECTION
+# define SECTION(p)			p##.evex512
+#endif
 
+#define VMM					VMM_512
+#define VMM_lo				VMM_any_zmm
 #endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 67a55f0c85..54b29bcf66 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -1,16 +1,9 @@
-#if IS_IN (libc)
-# define VEC_SIZE	32
-# define VEC(i)		ymm##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
-# define MOV_SIZE	4
-# define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+#include <isa-level.h>
 
-# define VZEROUPPER_RETURN jmp	 L(return)
+#if ISA_SHOULD_BUILD (3)
+
+# include "avx-rtm-vecs.h"
 
-# define SECTION(p)		p##.avx.rtm
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm
 
 # include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index a14b155667..6960571779 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -2,14 +2,7 @@
 
 #if ISA_SHOULD_BUILD (3)
 
-# define VEC_SIZE	32
-# define VEC(i)		ymm##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
-# define MOV_SIZE	4
-
-# define SECTION(p)		p##.avx
+# include "avx-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 8d1568a7ba..1794c7de6d 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -2,35 +2,7 @@
 
 #if ISA_SHOULD_BUILD (4)
 
-# define VEC_SIZE	64
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define VEC0		zmm16
-# define VEC1		zmm17
-# define VEC2		zmm18
-# define VEC3		zmm19
-# define VEC4		zmm20
-# define VEC5		zmm21
-# define VEC6		zmm22
-# define VEC7		zmm23
-# define VEC8		zmm24
-# define VEC9		zmm25
-# define VEC10		zmm26
-# define VEC11		zmm27
-# define VEC12		zmm28
-# define VEC13		zmm29
-# define VEC14		zmm30
-# define VEC15		zmm31
-# define VEC(i)		VEC##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE	6
-
-# define SECTION(p)		p##.evex512
+# include "evex512-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
index 2373017358..200e7042a0 100644
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -2,35 +2,7 @@
 
 #if ISA_SHOULD_BUILD (4)
 
-# define VEC_SIZE	32
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define VEC0		ymm16
-# define VEC1		ymm17
-# define VEC2		ymm18
-# define VEC3		ymm19
-# define VEC4		ymm20
-# define VEC5		ymm21
-# define VEC6		ymm22
-# define VEC7		ymm23
-# define VEC8		ymm24
-# define VEC9		ymm25
-# define VEC10		ymm26
-# define VEC11		ymm27
-# define VEC12		ymm28
-# define VEC13		ymm29
-# define VEC14		ymm30
-# define VEC15		ymm31
-# define VEC(i)		VEC##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE	6
-
-# define SECTION(p)		p##.evex
+# include "evex256-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
index 422a079902..a2fe816270 100644
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -22,18 +22,9 @@
    so we need this to build for ISA V2 builds. */
 #if ISA_SHOULD_BUILD (2)
 
-# include <sysdep.h>
+# include "sse2-vecs.h"
 
-# define VEC_SIZE	16
-# define VEC(i)		xmm##i
 # define PREFETCHNT	prefetchnta
-# define VMOVNT		movntdq
-/* Use movups and movaps for smaller code sizes.  */
-# define VMOVU		movups
-# define VMOVA		movaps
-# define MOV_SIZE	3
-
-# define SECTION(p)		p
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 04747133b7..5b758cae5e 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -60,14 +60,6 @@
 # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
 #endif
 
-#ifndef XMM0
-# define XMM0				xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0				ymm0
-#endif
-
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER vzeroupper
@@ -225,13 +217,13 @@ L(start):
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 	/* Load regardless.  */
-	VMOVU	(%rsi), %VEC(0)
+	VMOVU	(%rsi), %VMM(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VMM(1)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi,%rdx)
 #if !(defined USE_MULTIARCH && IS_IN (libc))
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
@@ -270,15 +262,15 @@ L(start_erms):
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 	/* Load regardless.  */
-	VMOVU	(%rsi), %VEC(0)
+	VMOVU	(%rsi), %VMM(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
 	 */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
-L(return):
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rdx)
+L(return_vzeroupper):
 # if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 # else
@@ -359,10 +351,10 @@ L(between_16_31):
 	.p2align 4,, 10
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
-	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi, %rdx), %YMM1
-	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi, %rdx)
+	VMOVU	(%rsi), %VMM_256(0)
+	VMOVU	-32(%rsi, %rdx), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -32(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
@@ -380,12 +372,12 @@ L(last_4x_vec):
 	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
 
 	/* VEC(0) and VEC(1) have already been loaded.  */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VMM(3), -(VEC_SIZE * 2)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4
@@ -400,24 +392,24 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
 	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
 	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(7)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VMM(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4,, 4
@@ -466,14 +458,14 @@ L(more_8x_vec_forward):
 	 */
 
 	/* First vec was already loaded into VEC(0).  */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(5)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
 	/* Save begining of dst.  */
 	movq	%rdi, %rcx
 	/* Align dst to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(7)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(8)
 
 	/* Subtract dst from src. Add back after dst aligned.  */
 	subq	%rcx, %rsi
@@ -488,25 +480,25 @@ L(more_8x_vec_forward):
 	.p2align 4,, 11
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
-	VMOVU	(%rsi), %VEC(1)
-	VMOVU	VEC_SIZE(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
+	VMOVU	(%rsi), %VMM(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
 	subq	$-(VEC_SIZE * 4), %rsi
-	VMOVA	%VEC(1), (%rdi)
-	VMOVA	%VEC(2), VEC_SIZE(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(1), (%rdi)
+	VMOVA	%VMM(2), VEC_SIZE(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(4), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
-	VMOVU	%VEC(7), VEC_SIZE(%rdx)
-	VMOVU	%VEC(8), (%rdx)
+	VMOVU	%VMM(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VMM(7), VEC_SIZE(%rdx)
+	VMOVU	%VMM(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(0), (%rcx)
+	VMOVU	%VMM(0), (%rcx)
 	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
 	 */
 L(nop_backward):
@@ -523,12 +515,12 @@ L(more_8x_vec_backward):
 	   addresses.  */
 
 	/* First vec was also loaded into VEC(0).  */
-	VMOVU	VEC_SIZE(%rsi), %VEC(5)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	VMOVU	VEC_SIZE(%rsi), %VMM(5)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(6)
 	/* Begining of region for 4x backward copy stored in rcx.  */
 	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(8)
 	/* Subtract dst from src. Add back after dst aligned.  */
 	subq	%rdi, %rsi
 	/* Align dst.  */
@@ -540,25 +532,25 @@ L(more_8x_vec_backward):
 	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
-	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VMM(3)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VMM(4)
 	addq	$(VEC_SIZE * -4), %rsi
-	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
-	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
-	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
+	VMOVA	%VMM(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VMM(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA	%VMM(4), (VEC_SIZE * 0)(%rcx)
 	addq	$(VEC_SIZE * -4), %rcx
 	cmpq	%rcx, %rdi
 	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(5), VEC_SIZE(%rdi)
+	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
+	VMOVU	%VMM(8), -VEC_SIZE(%rdx, %rdi)
 	VZEROUPPER_RETURN
 
 #if defined USE_MULTIARCH && IS_IN (libc)
@@ -568,7 +560,7 @@ L(loop_4x_vec_backward):
 # if ALIGN_MOVSB
 L(skip_short_movsb_check):
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  endif
 #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 #   error Unsupported MOVSB_ALIGN_TO
@@ -597,9 +589,9 @@ L(skip_short_movsb_check):
 
 	rep	movsb
 
-	VMOVU	%VEC(0), (%r8)
+	VMOVU	%VMM(0), (%r8)
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	%VEC(1), VEC_SIZE(%r8)
+	VMOVU	%VMM(1), VEC_SIZE(%r8)
 #  endif
 	VZEROUPPER_RETURN
 # endif
@@ -640,7 +632,7 @@ L(movsb):
 # endif
 # if ALIGN_MOVSB
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  endif
 #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 #   error Unsupported MOVSB_ALIGN_TO
@@ -664,9 +656,9 @@ L(movsb_align_dst):
 	rep	movsb
 
 	/* Store VECs loaded for aligning.  */
-	VMOVU	%VEC(0), (%r8)
+	VMOVU	%VMM(0), (%r8)
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	%VEC(1), VEC_SIZE(%r8)
+	VMOVU	%VMM(1), VEC_SIZE(%r8)
 #  endif
 	VZEROUPPER_RETURN
 # else	/* !ALIGN_MOVSB.  */
@@ -701,18 +693,18 @@ L(large_memcpy_2x):
 
 	/* First vec was also loaded into VEC(0).  */
 # if VEC_SIZE < 64
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  if VEC_SIZE < 32
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 #  endif
 # endif
-	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VMM(0), (%rdi)
 # if VEC_SIZE < 64
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
 #  if VEC_SIZE < 32
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
 #  endif
 # endif
 
@@ -761,12 +753,12 @@ L(loop_large_memcpy_2x_inner):
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
 	/* Load vectors from rsi.  */
-	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 	subq	$-LARGE_LOAD_SIZE, %rsi
 	/* Non-temporal store vectors to rdi.  */
-	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 	subq	$-LARGE_LOAD_SIZE, %rdi
 	decl	%ecx
 	jnz	L(loop_large_memcpy_2x_inner)
@@ -785,31 +777,31 @@ L(loop_large_memcpy_2x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VMM(0)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 	subq	$-(VEC_SIZE * 4), %rsi
 	addl	$-(VEC_SIZE * 4), %edx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(0), (%rdi)
+	VMOVA	%VMM(1), VEC_SIZE(%rdi)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpl	$(VEC_SIZE * 4), %edx
 	ja	L(loop_large_memcpy_2x_tail)
 
 L(large_memcpy_2x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
-
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
-	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)
+
+	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4
@@ -831,16 +823,16 @@ L(loop_large_memcpy_4x_inner):
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
 	/* Load vectors from rsi.  */
-	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
 	subq	$-LARGE_LOAD_SIZE, %rsi
 	/* Non-temporal store vectors to rdi.  */
-	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
-	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
-	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
 	subq	$-LARGE_LOAD_SIZE, %rdi
 	decl	%ecx
 	jnz	L(loop_large_memcpy_4x_inner)
@@ -858,31 +850,31 @@ L(loop_large_memcpy_4x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VMM(0)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 	subq	$-(VEC_SIZE * 4), %rsi
 	addl	$-(VEC_SIZE * 4), %edx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(0), (%rdi)
+	VMOVA	%VMM(1), VEC_SIZE(%rdi)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpl	$(VEC_SIZE * 4), %edx
 	ja	L(loop_large_memcpy_4x_tail)
 
 L(large_memcpy_4x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
-
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
-	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)
+
+	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index ea3a0a0a60..d5c535a8f0 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -31,7 +31,7 @@
 # endif
 
 # define PAGE_SIZE			4096
-# define VECMATCH			VEC(0)
+# define VMMMATCH			VMM(0)
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN(MEMRCHR, 6)
@@ -47,7 +47,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 	   correct page cross check and 2) it correctly sets up end ptr to be
 	   subtract by lzcnt aligned.  */
 	leaq	-1(%rdi, %rdx), %rax
-	vpbroadcastb %esi, %VECMATCH
+	vpbroadcastb %esi, %VMMMATCH
 
 	/* Check if we can load 1x VEC without cross a page.  */
 	testl	$(PAGE_SIZE - VEC_SIZE), %eax
@@ -55,7 +55,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 
 	/* Don't use rax for pointer here because EVEX has better encoding with
 	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
@@ -96,7 +96,7 @@ L(more_1x_vec):
 	movq	%rax, %rdx
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	subq	%rdi, %rdx
@@ -115,7 +115,7 @@ L(last_2x_vec):
 
 	/* Don't use rax for pointer here because EVEX has better encoding with
 	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
 	lzcntq	%rcx, %rcx
@@ -131,7 +131,7 @@ L(last_2x_vec):
 L(page_cross):
 	movq	%rax, %rsi
 	andq	$-VEC_SIZE, %rsi
-	vpcmpb	$0, (%rsi), %VECMATCH, %k0
+	vpcmpb	$0, (%rsi), %VMMMATCH, %k0
 	kmovd	%k0, %r8d
 	/* Shift out negative alignment (because we are starting from endptr and
 	   working backwards).  */
@@ -165,13 +165,13 @@ L(more_2x_vec):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x0_dec)
 
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1)
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	subq	$(VEC_SIZE * 4), %rdx
@@ -185,7 +185,7 @@ L(last_vec):
 
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	lzcntl	%ecx, %ecx
 	subq	$(VEC_SIZE * 3 + 1), %rax
@@ -220,7 +220,7 @@ L(more_4x_vec):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x2)
 
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	testl	%ecx, %ecx
@@ -243,17 +243,17 @@ L(more_4x_vec):
 L(loop_4x_vec):
 	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
 	   on).  */
-	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
 
 	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
-	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
-	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
-	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+	vpxorq	(VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
+	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
 
 	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
 	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
-	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
-	vptestnmb %VEC(3), %VEC(3), %k2
+	vpminub	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	vptestnmb %VMM(3), %VMM(3), %k2
 
 	/* Any 1s and we found CHAR.  */
 	kortestd %k2, %k4
@@ -270,7 +270,7 @@ L(loop_4x_vec):
 L(last_4x_vec):
 
 	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	cmpl	$(VEC_SIZE * 2), %edx
@@ -280,14 +280,14 @@ L(last_4x_vec):
 	jnz	L(ret_vec_x0_dec)
 
 
-	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1)
 
 	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	cmpl	$(VEC_SIZE * 3), %edx
@@ -309,7 +309,7 @@ L(loop_end):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x0_end)
 
-	vptestnmb %VEC(2), %VEC(2), %k0
+	vptestnmb %VMM(2), %VMM(2), %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1_end)
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
index 8ac3e479bb..3bd3b34150 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -1,10 +1,6 @@
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+#include "avx-rtm-vecs.h"
 
-#define VZEROUPPER_RETURN jmp	 L(return)
-
-#define SECTION(p) p##.avx.rtm
 #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 
-#include "memset-avx2-unaligned-erms.S"
+# include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index a9054a9122..b3f2bbd61b 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -4,14 +4,9 @@
 
 # define USE_WITH_AVX2	1
 
-# define VEC_SIZE	32
-# define MOV_SIZE	4
-# define RET_SIZE	4
-
-# define VEC(i)		ymm##i
-
-# define VMOVU     vmovdqu
-# define VMOVA     vmovdqa
+# ifndef VEC_SIZE
+#  include "avx-vecs.h"
+# endif
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
@@ -26,9 +21,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
 # define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
-# ifndef SECTION
-#  define SECTION(p)		p##.avx
-# endif
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 47623b8ee8..7f7a8eb33c 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -4,26 +4,14 @@
 
 # define USE_WITH_AVX512	1
 
-# define VEC_SIZE	64
-# define MOV_SIZE	6
-# define RET_SIZE	1
-
-# define XMM0		xmm16
-# define YMM0		ymm16
-# define VEC0		zmm16
-# define VEC(i)		VEC##i
-
-# define VMOVU     vmovdqu64
-# define VMOVA     vmovdqa64
-
-# define VZEROUPPER
+# include "evex512-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastb d, %VEC0; \
+  vpbroadcastb d, %VMM(0); \
   movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastd d, %VEC0; \
+  vpbroadcastd d, %VMM(0); \
   movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -32,8 +20,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p##.evex512
-
 #ifndef MEMSET_SYMBOL
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index ac4b2d2d50..50fd9722f2 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -4,26 +4,14 @@
 
 # define USE_WITH_EVEX	1
 
-# define VEC_SIZE	32
-# define MOV_SIZE	6
-# define RET_SIZE	1
-
-# define XMM0		xmm16
-# define YMM0		ymm16
-# define VEC0		ymm16
-# define VEC(i)		VEC##i
-
-# define VMOVU     vmovdqu64
-# define VMOVA     vmovdqa64
-
-# define VZEROUPPER
+# include "evex256-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastb d, %VEC0; \
+  vpbroadcastb d, %VMM(0); \
   movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastd d, %VEC0; \
+  vpbroadcastd d, %VMM(0); \
   movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -32,8 +20,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p##.evex
-
 #ifndef MEMSET_SYMBOL
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 44f9b8888b..11c845f6c2 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -26,13 +26,7 @@
 # include <sysdep.h>
 # define USE_WITH_SSE2	1
 
-# define VEC_SIZE	16
-# define MOV_SIZE	3
-# define RET_SIZE	1
-
-# define VEC(i)		xmm##i
-# define VMOVU     movups
-# define VMOVA     movaps
+# include "sse2-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
@@ -52,8 +46,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p
-
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_sse2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 905d0fa464..03de0ab907 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,14 +34,6 @@
 # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
 #endif
 
-#ifndef XMM0
-# define XMM0				xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0				ymm0
-#endif
-
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
@@ -150,8 +142,8 @@ L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VMM(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VMM(0), (%rdi)
 	VZEROUPPER_RETURN
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
@@ -175,19 +167,19 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
-	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -2)(%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi)
 #endif
 	VZEROUPPER_RETURN
 
@@ -221,7 +213,7 @@ L(less_vec_from_wmemset):
 	bzhil	%edx, %ecx, %ecx
 	kmovd	%ecx, %k1
 # endif
-	vmovdqu8 %VEC(0), (%rax){%k1}
+	vmovdqu8 %VMM(0), (%rax){%k1}
 	VZEROUPPER_RETURN
 
 # if defined USE_MULTIARCH && IS_IN (libc)
@@ -249,8 +241,8 @@ L(stosb_more_2x_vec):
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
 	/* Store next 2x vec regardless.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * 1)(%rdi)
 
 
 	/* Two different methods of setting up pointers / compare. The two
@@ -278,8 +270,8 @@ L(more_2x_vec):
 #endif
 
 	/* Store next 2x vec regardless.  */
-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rax)
+	VMOVU	%VMM(0), (VEC_SIZE * 3)(%rax)
 
 
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
@@ -304,20 +296,20 @@ L(more_2x_vec):
 	andq	$(VEC_SIZE * -2), %LOOP_REG
 	.p2align 4
 L(loop):
-	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
 	subq	$-(VEC_SIZE * 4), %LOOP_REG
 	cmpq	%END_REG, %LOOP_REG
 	jb	L(loop)
 	.p2align 4,, MOV_SIZE
 L(last_4x_vec):
-	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
-L(return):
+	VMOVU	%VMM(0), LOOP_4X_OFFSET(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return_vzeroupper):
 #if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
@@ -355,7 +347,7 @@ L(cross_page):
 	jge	L(between_16_31)
 #endif
 #ifndef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, %SET_REG64
+	MOVQ	%VMM_128(0), %SET_REG64
 #endif
 	cmpl	$8, %edx
 	jge	L(between_8_15)
@@ -374,8 +366,8 @@ L(between_0_0):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%LESS_VEC_REG)
-	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
+	VMOVU	%VMM_256(0), (%LESS_VEC_REG)
+	VMOVU	%VMM_256(0), -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
@@ -383,8 +375,8 @@ L(between_32_63):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%LESS_VEC_REG)
-	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	VMOVU	%VMM_128(0), (%LESS_VEC_REG)
+	VMOVU	%VMM_128(0), -16(%LESS_VEC_REG, %rdx)
 	ret
 #endif
 
@@ -394,8 +386,8 @@ L(between_16_31):
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, (%rdi)
-	MOVQ	%XMM0, -8(%rdi, %rdx)
+	MOVQ	%VMM_128(0), (%rdi)
+	MOVQ	%VMM_128(0), -8(%rdi, %rdx)
 #else
 	movq	%SET_REG64, (%LESS_VEC_REG)
 	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
@@ -408,8 +400,8 @@ L(between_8_15):
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVD	%XMM0, (%rdi)
-	MOVD	%XMM0, -4(%rdi, %rdx)
+	MOVD	%VMM_128(0), (%rdi)
+	MOVD	%VMM_128(0), -4(%rdi, %rdx)
 #else
 	movl	%SET_REG32, (%LESS_VEC_REG)
 	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h
index 2b77a59d56..3574fab0aa 100644
--- a/sysdeps/x86_64/multiarch/sse2-vecs.h
+++ b/sysdeps/x86_64/multiarch/sse2-vecs.h
@@ -40,8 +40,8 @@
 #define VMOVA				movaps
 #define VMOVNT				movntdq
 
-#define VEC_xmm				VEC_any_xmm
-#define VEC					VEC_any_xmm
+#define VMM_128				VMM_any_xmm
+#define VMM					VMM_any_xmm
 
 
 #endif
diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h
index 9f3ffecede..fc5aa57edc 100644
--- a/sysdeps/x86_64/multiarch/vec-macros.h
+++ b/sysdeps/x86_64/multiarch/vec-macros.h
@@ -25,66 +25,66 @@
 #endif
 
 /* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
-   VEC(N) values.  */
-#define VEC_hi_xmm0				xmm16
-#define VEC_hi_xmm1				xmm17
-#define VEC_hi_xmm2				xmm18
-#define VEC_hi_xmm3				xmm19
-#define VEC_hi_xmm4				xmm20
-#define VEC_hi_xmm5				xmm21
-#define VEC_hi_xmm6				xmm22
-#define VEC_hi_xmm7				xmm23
-#define VEC_hi_xmm8				xmm24
-#define VEC_hi_xmm9				xmm25
-#define VEC_hi_xmm10			xmm26
-#define VEC_hi_xmm11			xmm27
-#define VEC_hi_xmm12			xmm28
-#define VEC_hi_xmm13			xmm29
-#define VEC_hi_xmm14			xmm30
-#define VEC_hi_xmm15			xmm31
+   VMM(N) values.  */
+#define VMM_hi_xmm0				xmm16
+#define VMM_hi_xmm1				xmm17
+#define VMM_hi_xmm2				xmm18
+#define VMM_hi_xmm3				xmm19
+#define VMM_hi_xmm4				xmm20
+#define VMM_hi_xmm5				xmm21
+#define VMM_hi_xmm6				xmm22
+#define VMM_hi_xmm7				xmm23
+#define VMM_hi_xmm8				xmm24
+#define VMM_hi_xmm9				xmm25
+#define VMM_hi_xmm10			xmm26
+#define VMM_hi_xmm11			xmm27
+#define VMM_hi_xmm12			xmm28
+#define VMM_hi_xmm13			xmm29
+#define VMM_hi_xmm14			xmm30
+#define VMM_hi_xmm15			xmm31
 
-#define VEC_hi_ymm0				ymm16
-#define VEC_hi_ymm1				ymm17
-#define VEC_hi_ymm2				ymm18
-#define VEC_hi_ymm3				ymm19
-#define VEC_hi_ymm4				ymm20
-#define VEC_hi_ymm5				ymm21
-#define VEC_hi_ymm6				ymm22
-#define VEC_hi_ymm7				ymm23
-#define VEC_hi_ymm8				ymm24
-#define VEC_hi_ymm9				ymm25
-#define VEC_hi_ymm10			ymm26
-#define VEC_hi_ymm11			ymm27
-#define VEC_hi_ymm12			ymm28
-#define VEC_hi_ymm13			ymm29
-#define VEC_hi_ymm14			ymm30
-#define VEC_hi_ymm15			ymm31
+#define VMM_hi_ymm0				ymm16
+#define VMM_hi_ymm1				ymm17
+#define VMM_hi_ymm2				ymm18
+#define VMM_hi_ymm3				ymm19
+#define VMM_hi_ymm4				ymm20
+#define VMM_hi_ymm5				ymm21
+#define VMM_hi_ymm6				ymm22
+#define VMM_hi_ymm7				ymm23
+#define VMM_hi_ymm8				ymm24
+#define VMM_hi_ymm9				ymm25
+#define VMM_hi_ymm10			ymm26
+#define VMM_hi_ymm11			ymm27
+#define VMM_hi_ymm12			ymm28
+#define VMM_hi_ymm13			ymm29
+#define VMM_hi_ymm14			ymm30
+#define VMM_hi_ymm15			ymm31
 
-#define VEC_hi_zmm0				zmm16
-#define VEC_hi_zmm1				zmm17
-#define VEC_hi_zmm2				zmm18
-#define VEC_hi_zmm3				zmm19
-#define VEC_hi_zmm4				zmm20
-#define VEC_hi_zmm5				zmm21
-#define VEC_hi_zmm6				zmm22
-#define VEC_hi_zmm7				zmm23
-#define VEC_hi_zmm8				zmm24
-#define VEC_hi_zmm9				zmm25
-#define VEC_hi_zmm10			zmm26
-#define VEC_hi_zmm11			zmm27
-#define VEC_hi_zmm12			zmm28
-#define VEC_hi_zmm13			zmm29
-#define VEC_hi_zmm14			zmm30
-#define VEC_hi_zmm15			zmm31
+#define VMM_hi_zmm0				zmm16
+#define VMM_hi_zmm1				zmm17
+#define VMM_hi_zmm2				zmm18
+#define VMM_hi_zmm3				zmm19
+#define VMM_hi_zmm4				zmm20
+#define VMM_hi_zmm5				zmm21
+#define VMM_hi_zmm6				zmm22
+#define VMM_hi_zmm7				zmm23
+#define VMM_hi_zmm8				zmm24
+#define VMM_hi_zmm9				zmm25
+#define VMM_hi_zmm10			zmm26
+#define VMM_hi_zmm11			zmm27
+#define VMM_hi_zmm12			zmm28
+#define VMM_hi_zmm13			zmm29
+#define VMM_hi_zmm14			zmm30
+#define VMM_hi_zmm15			zmm31
 
-#define PRIMITIVE_VEC(vec, num)		vec##num
+#define PRIMITIVE_VMM(vec, num)		vec##num
 
-#define VEC_any_xmm(i)			PRIMITIVE_VEC(xmm, i)
-#define VEC_any_ymm(i)			PRIMITIVE_VEC(ymm, i)
-#define VEC_any_zmm(i)			PRIMITIVE_VEC(zmm, i)
+#define VMM_any_xmm(i)			PRIMITIVE_VMM(xmm, i)
+#define VMM_any_ymm(i)			PRIMITIVE_VMM(ymm, i)
+#define VMM_any_zmm(i)			PRIMITIVE_VMM(zmm, i)
 
-#define VEC_hi_xmm(i)			PRIMITIVE_VEC(VEC_hi_xmm, i)
-#define VEC_hi_ymm(i)			PRIMITIVE_VEC(VEC_hi_ymm, i)
-#define VEC_hi_zmm(i)			PRIMITIVE_VEC(VEC_hi_zmm, i)
+#define VMM_hi_xmm(i)			PRIMITIVE_VMM(VMM_hi_xmm, i)
+#define VMM_hi_ymm(i)			PRIMITIVE_VMM(VMM_hi_ymm, i)
+#define VMM_hi_zmm(i)			PRIMITIVE_VMM(VMM_hi_zmm, i)
 
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v5 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 21:14 ` [PATCH v5 1/3] x86: Update evex256/512 vec macros Noah Goldstein
@ 2022-10-14 21:15   ` Noah Goldstein
  2022-10-14 21:28     ` H.J. Lu
  2022-10-14 21:15   ` [PATCH v5 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
  1 sibling, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 21:15 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

This is to make it easier to do think like:
```
vpcmpb %VEC(0), %VEC(1), %k0
kmov{d|q} %k0, %{eax|rax}
test %{eax|rax}
```

It adds macro s.t any GPR can get the proper width with:
    `V{upper_case_GPR_name}`

and any mask insn can get the proper width with:
    `{mask_insn_without_postfix}V`

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/reg-macros.h         | 166 ++++++++++++++++++
 .../multiarch/scripts/gen-reg-macros.py       | 123 +++++++++++++
 2 files changed, 289 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
 create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py

diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
new file mode 100644
index 0000000000..16168b6fda
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/reg-macros.h
@@ -0,0 +1,166 @@
+/* This file was generated by: gen-reg-macros.py.
+
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _REG_MACROS_H
+#define _REG_MACROS_H	1
+
+#define rax_8	al
+#define rax_16	ax
+#define rax_32	eax
+#define rax_64	rax
+#define rbx_8	bl
+#define rbx_16	bx
+#define rbx_32	ebx
+#define rbx_64	rbx
+#define rcx_8	cl
+#define rcx_16	cx
+#define rcx_32	ecx
+#define rcx_64	rcx
+#define rdx_8	dl
+#define rdx_16	dx
+#define rdx_32	edx
+#define rdx_64	rdx
+#define rbp_8	bpl
+#define rbp_16	bp
+#define rbp_32	ebp
+#define rbp_64	rbp
+#define rsp_8	spl
+#define rsp_16	sp
+#define rsp_32	esp
+#define rsp_64	rsp
+#define rsi_8	sil
+#define rsi_16	si
+#define rsi_32	esi
+#define rsi_64	rsi
+#define rdi_8	dil
+#define rdi_16	di
+#define rdi_32	edi
+#define rdi_64	rdi
+#define r8_8	r8b
+#define r8_16	r8w
+#define r8_32	r8d
+#define r8_64	r8
+#define r9_8	r9b
+#define r9_16	r9w
+#define r9_32	r9d
+#define r9_64	r9
+#define r10_8	r10b
+#define r10_16	r10w
+#define r10_32	r10d
+#define r10_64	r10
+#define r11_8	r11b
+#define r11_16	r11w
+#define r11_32	r11d
+#define r11_64	r11
+#define r12_8	r12b
+#define r12_16	r12w
+#define r12_32	r12d
+#define r12_64	r12
+#define r13_8	r13b
+#define r13_16	r13w
+#define r13_32	r13d
+#define r13_64	r13
+#define r14_8	r14b
+#define r14_16	r14w
+#define r14_32	r14d
+#define r14_64	r14
+#define r15_8	r15b
+#define r15_16	r15w
+#define r15_32	r15d
+#define r15_64	r15
+
+#define kmov_8	kmovb
+#define kmov_16	kmovw
+#define kmov_32	kmovd
+#define kmov_64	kmovq
+#define kortest_8	kortestb
+#define kortest_16	kortestw
+#define kortest_32	kortestd
+#define kortest_64	kortestq
+#define kor_8	korb
+#define kor_16	korw
+#define kor_32	kord
+#define kor_64	korq
+#define ktest_8	ktestb
+#define ktest_16	ktestw
+#define ktest_32	ktestd
+#define ktest_64	ktestq
+#define kand_8	kandb
+#define kand_16	kandw
+#define kand_32	kandd
+#define kand_64	kandq
+#define kxor_8	kxorb
+#define kxor_16	kxorw
+#define kxor_32	kxord
+#define kxor_64	kxorq
+#define knot_8	knotb
+#define knot_16	knotw
+#define knot_32	knotd
+#define knot_64	knotq
+#define kxnor_8	kxnorb
+#define kxnor_16	kxnorw
+#define kxnor_32	kxnord
+#define kxnor_64	kxnorq
+#define kunpack_8	kunpackbw
+#define kunpack_16	kunpackwd
+#define kunpack_32	kunpackdq
+
+/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
+#define VRAX	VGPR(rax)
+#define VRBX	VGPR(rbx)
+#define VRCX	VGPR(rcx)
+#define VRDX	VGPR(rdx)
+#define VRBP	VGPR(rbp)
+#define VRSP	VGPR(rsp)
+#define VRSI	VGPR(rsi)
+#define VRDI	VGPR(rdi)
+#define VR8	VGPR(r8)
+#define VR9	VGPR(r9)
+#define VR10	VGPR(r10)
+#define VR11	VGPR(r11)
+#define VR12	VGPR(r12)
+#define VR13	VGPR(r13)
+#define VR14	VGPR(r14)
+#define VR15	VGPR(r15)
+
+/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
+#define KMOV 	VKINSN(kmov)
+#define KORTEST 	VKINSN(kortest)
+#define KOR 	VKINSN(kor)
+#define KTEST 	VKINSN(ktest)
+#define KAND 	VKINSN(kand)
+#define KXOR 	VKINSN(kxor)
+#define KNOT 	VKINSN(knot)
+#define KXNOR 	VKINSN(kxnor)
+#define KUNPACK 	VKINSN(kunpack)
+
+#ifndef REG_WIDTH
+# define REG_WIDTH VEC_SIZE
+#endif
+
+#define VPASTER(x, y)	x##_##y
+#define VEVALUATOR(x, y)	VPASTER(x, y)
+
+#define VGPR_SZ(reg_name, reg_size)	VEVALUATOR(reg_name, reg_size)
+#define VKINSN_SZ(insn, reg_size)	VEVALUATOR(insn, reg_size)
+
+#define VGPR(reg_name)	VGPR_SZ(reg_name, REG_WIDTH)
+#define VKINSN(mask_insn)	VKINSN_SZ(mask_insn, REG_WIDTH)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
new file mode 100644
index 0000000000..c7296a8104
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python3
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+"""Generate macros for getting GPR name of a certain size
+
+Inputs: None
+Output: Prints header fill to stdout
+
+API:
+    VGPR(reg_name)
+        - Get register name VEC_SIZE component of `reg_name`
+    VGPR_SZ(reg_name, reg_size)
+        - Get register name `reg_size` component of `reg_name`
+"""
+
+import sys
+import os
+from datetime import datetime
+
+registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
+             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
+             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
+             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
+             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
+             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
+             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
+             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
+
+mask_insns = [
+    "kmov",
+    "kortest",
+    "kor",
+    "ktest",
+    "kand",
+    "kxor",
+    "knot",
+    "kxnor",
+]
+mask_insns_ext = ["b", "w", "d", "q"]
+
+cr = """
+   Copyright (C) {} Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+"""
+
+print("/* This file was generated by: {}.".format(os.path.basename(
+    sys.argv[0])))
+print(cr.format(datetime.today().year))
+
+print("#ifndef _REG_MACROS_H")
+print("#define _REG_MACROS_H\t1")
+print("")
+for reg in registers:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
+
+print("")
+for mask_insn in mask_insns:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
+                                           mask_insns_ext[i]))
+for i in range(0, 3):
+    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
+                                                   mask_insns_ext[i + 1]))
+mask_insns.append("kunpack")
+
+print("")
+print(
+    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
+for reg in registers:
+    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
+
+print("")
+
+print(
+    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
+)
+for mask_insn in mask_insns:
+    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
+print("")
+
+print("#ifndef REG_WIDTH")
+print("# define REG_WIDTH VEC_SIZE")
+print("#endif")
+print("")
+print("#define VPASTER(x, y)\tx##_##y")
+print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
+print("")
+print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
+print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
+print("")
+print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
+print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
+
+print("\n#endif")
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v5 3/3] x86: Update strlen-evex-base to use new reg/vec macros.
  2022-10-14 21:14 ` [PATCH v5 1/3] x86: Update evex256/512 vec macros Noah Goldstein
  2022-10-14 21:15   ` [PATCH v5 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
@ 2022-10-14 21:15   ` Noah Goldstein
  1 sibling, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 21:15 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

To avoid duplicating the VMM / GPR / mask insn macros in all incoming
evex512 files, use the macros defined in 'reg-macros.h' and
'{vec}-macros.h'.
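
For example (a rough sketch, not lifted verbatim from the diff below,
assuming VEC_SIZE == 64):
```
	/* Previously spelled kmovq %k0, %rax via a per-vector-size
	   KMOV/RAX definition; now the width follows REG_WIDTH.  */
	KMOV	%k0, %VRAX
	/* Likewise the hard-coded XMM0 (xmm16) becomes VMM_128(0).  */
	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
```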

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 116 +++++++-------------
 sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
 2 files changed, 44 insertions(+), 76 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 418e9f8411..c832b15a48 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -36,42 +36,10 @@
 #  define CHAR_SIZE	1
 # endif
 
-# define XMM0		xmm16
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# if VEC_SIZE == 64
-#  define KMOV		kmovq
-#  define KORTEST	kortestq
-#  define RAX		rax
-#  define RCX		rcx
-#  define RDX		rdx
-#  define SHR		shrq
-#  define TEXTSUFFIX	evex512
-#  define VMM0		zmm16
-#  define VMM1		zmm17
-#  define VMM2		zmm18
-#  define VMM3		zmm19
-#  define VMM4		zmm20
-#  define VMOVA		vmovdqa64
-# elif VEC_SIZE == 32
-/* Currently Unused.  */
-#  define KMOV		kmovd
-#  define KORTEST	kortestd
-#  define RAX		eax
-#  define RCX		ecx
-#  define RDX		edx
-#  define SHR		shrl
-#  define TEXTSUFFIX	evex256
-#  define VMM0		ymm16
-#  define VMM1		ymm17
-#  define VMM2		ymm18
-#  define VMM3		ymm19
-#  define VMM4		ymm20
-#  define VMOVA		vmovdqa32
-# endif
-
-	.section .text.TEXTSUFFIX, "ax", @progbits
+	.section SECTION(.text),"ax",@progbits
 /* Aligning entry point to 64 byte, provides better performance for
    one vector length string.  */
 ENTRY_P2ALIGN (STRLEN, 6)
@@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
 # endif
 
 	movl	%edi, %eax
-	vpxorq	%XMM0, %XMM0, %XMM0
+	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(page_cross)
 
 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM0, %k0
-	KMOV	%k0, %RAX
-	test	%RAX, %RAX
+	VPCMP	$0, (%rdi), %VMM(0), %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
@@ -120,7 +88,7 @@ L(align_more):
 	movq	%rax, %rdx
 	subq	%rdi, %rdx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RDX
+	shr	$2, %VRDX
 #  endif
 	/* At this point rdx contains [w]chars already compared.  */
 	subq	%rsi, %rdx
@@ -131,9 +99,9 @@ L(align_more):
 # endif
 
 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
 # ifdef USE_AS_STRNLEN
@@ -141,9 +109,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, VEC_SIZE(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
 # ifdef USE_AS_STRNLEN
@@ -151,9 +119,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 
 # ifdef USE_AS_STRNLEN
@@ -161,9 +129,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x4)
 
 # ifdef USE_AS_STRNLEN
@@ -179,7 +147,7 @@ L(align_more):
 # ifdef USE_AS_STRNLEN
 	subq	%rax, %rcx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RCX
+	shr	$2, %VRCX
 #  endif
 	/* rcx contains number of [w]char will be recompared due to
 	   alignment fixes.  rdx must be incremented by rcx to offset
@@ -199,42 +167,42 @@ L(loop_entry):
 # endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
 
-	VPTESTN	%VMM2, %VMM2, %k0
-	VPTESTN	%VMM4, %VMM4, %k1
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k1
 
 	subq	$-(VEC_SIZE * 4), %rax
 	KORTEST	%k0, %k1
 	jz	L(loop)
 
-	VPTESTN	%VMM1, %VMM1, %k2
-	KMOV	%k2, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VMM(1), %VMM(1), %k2
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
-	KMOV	%k0, %RCX
+	KMOV	%k0, %VRCX
 	/* At this point, if k0 is non zero, null char must be in the
 	   second vector.  */
-	test	%RCX, %RCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
-	VPTESTN	%VMM3, %VMM3, %k3
-	KMOV	%k3, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VMM(3), %VMM(3), %k3
+	KMOV	%k3, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 	/* At this point null [w]char must be in the fourth vector so no
 	   need to check.  */
-	KMOV	%k1, %RCX
+	KMOV	%k1, %VRCX
 
 	/* Fourth, third, second vector terminating are pretty much
 	   same, implemented this way to avoid branching and reuse code
 	   from pre loop exit condition.  */
 L(ret_vec_x4):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 3), %rax
@@ -250,7 +218,7 @@ L(ret_vec_x4):
 	ret
 
 L(ret_vec_x3):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 2), %rax
@@ -268,7 +236,7 @@ L(ret_vec_x3):
 L(ret_vec_x2):
 	subq	$-VEC_SIZE, %rax
 L(ret_vec_x1):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
@@ -289,13 +257,13 @@ L(page_cross):
 	/* ecx contains number of w[char] to be skipped as a result
 	   of address alignment.  */
 	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
-	KMOV	%k0, %RAX
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRAX
 	/* Ignore number of character for alignment adjustment.  */
-	SHR	%cl, %RAX
+	shr	%cl, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
index 116f8981c8..dfd0a7821b 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -2,6 +2,6 @@
 # define STRLEN		__strlen_evex512
 #endif
 
-#define VEC_SIZE	64
-
+#include "evex512-vecs.h"
+#include "reg-macros.h"
 #include "strlen-evex-base.S"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread
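
A sketch of what the new macro forms in this patch assemble to, assuming the reg-macros.h and evex256/evex512 vec headers from the earlier patches in the series (where VMM(N) maps into the %ymm16/%zmm16-and-up range) and the byte-string build where VPCMP is vpcmpb; both vector sizes shown, illustrative only:

```
/* evex512 build: VEC_SIZE = 64, REG_WIDTH = 64  */
	VPCMP	$0, (%rdi), %VMM(0), %k0	/* vpcmpb $0, (%rdi), %zmm16, %k0 */
	KMOV	%k0, %VRAX			/* kmovq  %k0, %rax */
	test	%VRAX, %VRAX			/* test   %rax, %rax */

/* evex256 build: VEC_SIZE = 32, REG_WIDTH = 32  */
	VPCMP	$0, (%rdi), %VMM(0), %k0	/* vpcmpb $0, (%rdi), %ymm16, %k0 */
	KMOV	%k0, %VRAX			/* kmovd  %k0, %eax */
	test	%VRAX, %VRAX			/* test   %eax, %eax */
```

One strlen-evex-base.S body can therefore serve both the evex256 and evex512 builds without per-size #ifdef blocks.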

* Re: [PATCH v5 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 21:15   ` [PATCH v5 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
@ 2022-10-14 21:28     ` H.J. Lu
  2022-10-14 22:01       ` Noah Goldstein
  0 siblings, 1 reply; 72+ messages in thread
From: H.J. Lu @ 2022-10-14 21:28 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 2:15 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This is to make it easier to do think like:
> ```
> vpcmpb %VEC(0), %VEC(1), %k0
> kmov{d|q} %k0, %{eax|rax}
> test %{eax|rax}
> ```
>
> It adds macro s.t any GPR can get the proper width with:
>     `V{upper_case_GPR_name}`
>
> and any mask insn can get the proper width with:
>     `{mask_insn_without_postfix}V`
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/reg-macros.h         | 166 ++++++++++++++++++
>  .../multiarch/scripts/gen-reg-macros.py       | 123 +++++++++++++
>  2 files changed, 289 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
>  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
>
> diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> new file mode 100644
> index 0000000000..16168b6fda
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> @@ -0,0 +1,166 @@
> +/* This file was generated by: gen-reg-macros.py.
> +
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _REG_MACROS_H
> +#define _REG_MACROS_H  1
> +
> +#define rax_8  al
> +#define rax_16 ax
> +#define rax_32 eax
> +#define rax_64 rax
> +#define rbx_8  bl
> +#define rbx_16 bx
> +#define rbx_32 ebx
> +#define rbx_64 rbx
> +#define rcx_8  cl
> +#define rcx_16 cx
> +#define rcx_32 ecx
> +#define rcx_64 rcx
> +#define rdx_8  dl
> +#define rdx_16 dx
> +#define rdx_32 edx
> +#define rdx_64 rdx
> +#define rbp_8  bpl
> +#define rbp_16 bp
> +#define rbp_32 ebp
> +#define rbp_64 rbp
> +#define rsp_8  spl
> +#define rsp_16 sp
> +#define rsp_32 esp
> +#define rsp_64 rsp
> +#define rsi_8  sil
> +#define rsi_16 si
> +#define rsi_32 esi
> +#define rsi_64 rsi
> +#define rdi_8  dil
> +#define rdi_16 di
> +#define rdi_32 edi
> +#define rdi_64 rdi
> +#define r8_8   r8b
> +#define r8_16  r8w
> +#define r8_32  r8d
> +#define r8_64  r8
> +#define r9_8   r9b
> +#define r9_16  r9w
> +#define r9_32  r9d
> +#define r9_64  r9
> +#define r10_8  r10b
> +#define r10_16 r10w
> +#define r10_32 r10d
> +#define r10_64 r10
> +#define r11_8  r11b
> +#define r11_16 r11w
> +#define r11_32 r11d
> +#define r11_64 r11
> +#define r12_8  r12b
> +#define r12_16 r12w
> +#define r12_32 r12d
> +#define r12_64 r12
> +#define r13_8  r13b
> +#define r13_16 r13w
> +#define r13_32 r13d
> +#define r13_64 r13
> +#define r14_8  r14b
> +#define r14_16 r14w
> +#define r14_32 r14d
> +#define r14_64 r14
> +#define r15_8  r15b
> +#define r15_16 r15w
> +#define r15_32 r15d
> +#define r15_64 r15
> +
> +#define kmov_8 kmovb
> +#define kmov_16        kmovw
> +#define kmov_32        kmovd
> +#define kmov_64        kmovq
> +#define kortest_8      kortestb
> +#define kortest_16     kortestw
> +#define kortest_32     kortestd
> +#define kortest_64     kortestq
> +#define kor_8  korb
> +#define kor_16 korw
> +#define kor_32 kord
> +#define kor_64 korq
> +#define ktest_8        ktestb
> +#define ktest_16       ktestw
> +#define ktest_32       ktestd
> +#define ktest_64       ktestq
> +#define kand_8 kandb
> +#define kand_16        kandw
> +#define kand_32        kandd
> +#define kand_64        kandq
> +#define kxor_8 kxorb
> +#define kxor_16        kxorw
> +#define kxor_32        kxord
> +#define kxor_64        kxorq
> +#define knot_8 knotb
> +#define knot_16        knotw
> +#define knot_32        knotd
> +#define knot_64        knotq
> +#define kxnor_8        kxnorb
> +#define kxnor_16       kxnorw
> +#define kxnor_32       kxnord
> +#define kxnor_64       kxnorq
> +#define kunpack_8      kunpackbw
> +#define kunpack_16     kunpackwd
> +#define kunpack_32     kunpackdq
> +
> +/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
> +#define VRAX   VGPR(rax)
> +#define VRBX   VGPR(rbx)
> +#define VRCX   VGPR(rcx)
> +#define VRDX   VGPR(rdx)
> +#define VRBP   VGPR(rbp)
> +#define VRSP   VGPR(rsp)
> +#define VRSI   VGPR(rsi)
> +#define VRDI   VGPR(rdi)
> +#define VR8    VGPR(r8)
> +#define VR9    VGPR(r9)
> +#define VR10   VGPR(r10)
> +#define VR11   VGPR(r11)
> +#define VR12   VGPR(r12)
> +#define VR13   VGPR(r13)
> +#define VR14   VGPR(r14)
> +#define VR15   VGPR(r15)
> +
> +/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
> +#define KMOV   VKINSN(kmov)
> +#define KORTEST        VKINSN(kortest)
> +#define KOR    VKINSN(kor)
> +#define KTEST  VKINSN(ktest)
> +#define KAND   VKINSN(kand)
> +#define KXOR   VKINSN(kxor)
> +#define KNOT   VKINSN(knot)
> +#define KXNOR  VKINSN(kxnor)
> +#define KUNPACK        VKINSN(kunpack)
> +
> +#ifndef REG_WIDTH
> +# define REG_WIDTH VEC_SIZE
> +#endif

Which files will define REG_WIDTH?  What values will it be for
YMM and ZMM vectors?

> +#define VPASTER(x, y)  x##_##y
> +#define VEVALUATOR(x, y)       VPASTER(x, y)
> +
> +#define VGPR_SZ(reg_name, reg_size)    VEVALUATOR(reg_name, reg_size)
> +#define VKINSN_SZ(insn, reg_size)      VEVALUATOR(insn, reg_size)
> +
> +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> +#define VKINSN(mask_insn)      VKINSN_SZ(mask_insn, REG_WIDTH)
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> new file mode 100644
> index 0000000000..c7296a8104
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> @@ -0,0 +1,123 @@
> +#!/usr/bin/python3
> +# Copyright (C) 2022 Free Software Foundation, Inc.
> +# This file is part of the GNU C Library.
> +#
> +# The GNU C Library is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU Lesser General Public
> +# License as published by the Free Software Foundation; either
> +# version 2.1 of the License, or (at your option) any later version.
> +#
> +# The GNU C Library is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +# Lesser General Public License for more details.
> +#
> +# You should have received a copy of the GNU Lesser General Public
> +# License along with the GNU C Library; if not, see
> +# <https://www.gnu.org/licenses/>.
> +"""Generate macros for getting GPR name of a certain size
> +
> +Inputs: None
> +Output: Prints header fill to stdout
> +
> +API:
> +    VGPR(reg_name)
> +        - Get register name VEC_SIZE component of `reg_name`
> +    VGPR_SZ(reg_name, reg_size)
> +        - Get register name `reg_size` component of `reg_name`
> +"""
> +
> +import sys
> +import os
> +from datetime import datetime
> +
> +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> +
> +mask_insns = [
> +    "kmov",
> +    "kortest",
> +    "kor",
> +    "ktest",
> +    "kand",
> +    "kxor",
> +    "knot",
> +    "kxnor",
> +]
> +mask_insns_ext = ["b", "w", "d", "q"]
> +
> +cr = """
> +   Copyright (C) {} Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +"""
> +
> +print("/* This file was generated by: {}.".format(os.path.basename(
> +    sys.argv[0])))
> +print(cr.format(datetime.today().year))
> +
> +print("#ifndef _REG_MACROS_H")
> +print("#define _REG_MACROS_H\t1")
> +print("")
> +for reg in registers:
> +    for i in range(0, 4):
> +        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
> +
> +print("")
> +for mask_insn in mask_insns:
> +    for i in range(0, 4):
> +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> +                                           mask_insns_ext[i]))
> +for i in range(0, 3):
> +    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
> +                                                   mask_insns_ext[i + 1]))
> +mask_insns.append("kunpack")
> +
> +print("")
> +print(
> +    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
> +for reg in registers:
> +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> +
> +print("")
> +
> +print(
> +    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
> +)
> +for mask_insn in mask_insns:
> +    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
> +print("")
> +
> +print("#ifndef REG_WIDTH")
> +print("# define REG_WIDTH VEC_SIZE")
> +print("#endif")
> +print("")
> +print("#define VPASTER(x, y)\tx##_##y")
> +print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
> +print("")
> +print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
> +print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
> +print("")
> +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> +print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
> +
> +print("\n#endif")
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v5 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 21:28     ` H.J. Lu
@ 2022-10-14 22:01       ` Noah Goldstein
  2022-10-14 22:05         ` H.J. Lu
  0 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 22:01 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 4:28 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Oct 14, 2022 at 2:15 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > This is to make it easier to do think like:
> > ```
> > vpcmpb %VEC(0), %VEC(1), %k0
> > kmov{d|q} %k0, %{eax|rax}
> > test %{eax|rax}
> > ```
> >
> > It adds macro s.t any GPR can get the proper width with:
> >     `V{upper_case_GPR_name}`
> >
> > and any mask insn can get the proper width with:
> >     `{mask_insn_without_postfix}V`
> >
> > This commit does not change libc.so
> >
> > Tested build on x86-64
> > ---
> >  sysdeps/x86_64/multiarch/reg-macros.h         | 166 ++++++++++++++++++
> >  .../multiarch/scripts/gen-reg-macros.py       | 123 +++++++++++++
> >  2 files changed, 289 insertions(+)
> >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> >
> > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > new file mode 100644
> > index 0000000000..16168b6fda
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> > @@ -0,0 +1,166 @@
> > +/* This file was generated by: gen-reg-macros.py.
> > +
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _REG_MACROS_H
> > +#define _REG_MACROS_H  1
> > +
> > +#define rax_8  al
> > +#define rax_16 ax
> > +#define rax_32 eax
> > +#define rax_64 rax
> > +#define rbx_8  bl
> > +#define rbx_16 bx
> > +#define rbx_32 ebx
> > +#define rbx_64 rbx
> > +#define rcx_8  cl
> > +#define rcx_16 cx
> > +#define rcx_32 ecx
> > +#define rcx_64 rcx
> > +#define rdx_8  dl
> > +#define rdx_16 dx
> > +#define rdx_32 edx
> > +#define rdx_64 rdx
> > +#define rbp_8  bpl
> > +#define rbp_16 bp
> > +#define rbp_32 ebp
> > +#define rbp_64 rbp
> > +#define rsp_8  spl
> > +#define rsp_16 sp
> > +#define rsp_32 esp
> > +#define rsp_64 rsp
> > +#define rsi_8  sil
> > +#define rsi_16 si
> > +#define rsi_32 esi
> > +#define rsi_64 rsi
> > +#define rdi_8  dil
> > +#define rdi_16 di
> > +#define rdi_32 edi
> > +#define rdi_64 rdi
> > +#define r8_8   r8b
> > +#define r8_16  r8w
> > +#define r8_32  r8d
> > +#define r8_64  r8
> > +#define r9_8   r9b
> > +#define r9_16  r9w
> > +#define r9_32  r9d
> > +#define r9_64  r9
> > +#define r10_8  r10b
> > +#define r10_16 r10w
> > +#define r10_32 r10d
> > +#define r10_64 r10
> > +#define r11_8  r11b
> > +#define r11_16 r11w
> > +#define r11_32 r11d
> > +#define r11_64 r11
> > +#define r12_8  r12b
> > +#define r12_16 r12w
> > +#define r12_32 r12d
> > +#define r12_64 r12
> > +#define r13_8  r13b
> > +#define r13_16 r13w
> > +#define r13_32 r13d
> > +#define r13_64 r13
> > +#define r14_8  r14b
> > +#define r14_16 r14w
> > +#define r14_32 r14d
> > +#define r14_64 r14
> > +#define r15_8  r15b
> > +#define r15_16 r15w
> > +#define r15_32 r15d
> > +#define r15_64 r15
> > +
> > +#define kmov_8 kmovb
> > +#define kmov_16        kmovw
> > +#define kmov_32        kmovd
> > +#define kmov_64        kmovq
> > +#define kortest_8      kortestb
> > +#define kortest_16     kortestw
> > +#define kortest_32     kortestd
> > +#define kortest_64     kortestq
> > +#define kor_8  korb
> > +#define kor_16 korw
> > +#define kor_32 kord
> > +#define kor_64 korq
> > +#define ktest_8        ktestb
> > +#define ktest_16       ktestw
> > +#define ktest_32       ktestd
> > +#define ktest_64       ktestq
> > +#define kand_8 kandb
> > +#define kand_16        kandw
> > +#define kand_32        kandd
> > +#define kand_64        kandq
> > +#define kxor_8 kxorb
> > +#define kxor_16        kxorw
> > +#define kxor_32        kxord
> > +#define kxor_64        kxorq
> > +#define knot_8 knotb
> > +#define knot_16        knotw
> > +#define knot_32        knotd
> > +#define knot_64        knotq
> > +#define kxnor_8        kxnorb
> > +#define kxnor_16       kxnorw
> > +#define kxnor_32       kxnord
> > +#define kxnor_64       kxnorq
> > +#define kunpack_8      kunpackbw
> > +#define kunpack_16     kunpackwd
> > +#define kunpack_32     kunpackdq
> > +
> > +/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
> > +#define VRAX   VGPR(rax)
> > +#define VRBX   VGPR(rbx)
> > +#define VRCX   VGPR(rcx)
> > +#define VRDX   VGPR(rdx)
> > +#define VRBP   VGPR(rbp)
> > +#define VRSP   VGPR(rsp)
> > +#define VRSI   VGPR(rsi)
> > +#define VRDI   VGPR(rdi)
> > +#define VR8    VGPR(r8)
> > +#define VR9    VGPR(r9)
> > +#define VR10   VGPR(r10)
> > +#define VR11   VGPR(r11)
> > +#define VR12   VGPR(r12)
> > +#define VR13   VGPR(r13)
> > +#define VR14   VGPR(r14)
> > +#define VR15   VGPR(r15)
> > +
> > +/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
> > +#define KMOV   VKINSN(kmov)
> > +#define KORTEST        VKINSN(kortest)
> > +#define KOR    VKINSN(kor)
> > +#define KTEST  VKINSN(ktest)
> > +#define KAND   VKINSN(kand)
> > +#define KXOR   VKINSN(kxor)
> > +#define KNOT   VKINSN(knot)
> > +#define KXNOR  VKINSN(kxnor)
> > +#define KUNPACK        VKINSN(kunpack)
> > +
> > +#ifndef REG_WIDTH
> > +# define REG_WIDTH VEC_SIZE
> > +#endif
>
> Which files will define REG_WIDTH?  What values will it be for
> YMM and ZMM vectors?

For non-wide-char evex or avx2/sse2 impls REG_WIDTH = VEC_SIZE,
so for YMM REG_WIDTH = 32 and for ZMM REG_WIDTH = 64.

For wchar impls REG_WIDTH will often be 32 irrespective of YMM/ZMM
(see the sketch after this message).
>
> > +#define VPASTER(x, y)  x##_##y
> > +#define VEVALUATOR(x, y)       VPASTER(x, y)
> > +
> > +#define VGPR_SZ(reg_name, reg_size)    VEVALUATOR(reg_name, reg_size)
> > +#define VKINSN_SZ(insn, reg_size)      VEVALUATOR(insn, reg_size)
> > +
> > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> > +#define VKINSN(mask_insn)      VKINSN_SZ(mask_insn, REG_WIDTH)
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > new file mode 100644
> > index 0000000000..c7296a8104
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > @@ -0,0 +1,123 @@
> > +#!/usr/bin/python3
> > +# Copyright (C) 2022 Free Software Foundation, Inc.
> > +# This file is part of the GNU C Library.
> > +#
> > +# The GNU C Library is free software; you can redistribute it and/or
> > +# modify it under the terms of the GNU Lesser General Public
> > +# License as published by the Free Software Foundation; either
> > +# version 2.1 of the License, or (at your option) any later version.
> > +#
> > +# The GNU C Library is distributed in the hope that it will be useful,
> > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +# Lesser General Public License for more details.
> > +#
> > +# You should have received a copy of the GNU Lesser General Public
> > +# License along with the GNU C Library; if not, see
> > +# <https://www.gnu.org/licenses/>.
> > +"""Generate macros for getting GPR name of a certain size
> > +
> > +Inputs: None
> > +Output: Prints header fill to stdout
> > +
> > +API:
> > +    VGPR(reg_name)
> > +        - Get register name VEC_SIZE component of `reg_name`
> > +    VGPR_SZ(reg_name, reg_size)
> > +        - Get register name `reg_size` component of `reg_name`
> > +"""
> > +
> > +import sys
> > +import os
> > +from datetime import datetime
> > +
> > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> > +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> > +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> > +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> > +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> > +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> > +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> > +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> > +
> > +mask_insns = [
> > +    "kmov",
> > +    "kortest",
> > +    "kor",
> > +    "ktest",
> > +    "kand",
> > +    "kxor",
> > +    "knot",
> > +    "kxnor",
> > +]
> > +mask_insns_ext = ["b", "w", "d", "q"]
> > +
> > +cr = """
> > +   Copyright (C) {} Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +"""
> > +
> > +print("/* This file was generated by: {}.".format(os.path.basename(
> > +    sys.argv[0])))
> > +print(cr.format(datetime.today().year))
> > +
> > +print("#ifndef _REG_MACROS_H")
> > +print("#define _REG_MACROS_H\t1")
> > +print("")
> > +for reg in registers:
> > +    for i in range(0, 4):
> > +        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
> > +
> > +print("")
> > +for mask_insn in mask_insns:
> > +    for i in range(0, 4):
> > +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> > +                                           mask_insns_ext[i]))
> > +for i in range(0, 3):
> > +    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
> > +                                                   mask_insns_ext[i + 1]))
> > +mask_insns.append("kunpack")
> > +
> > +print("")
> > +print(
> > +    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
> > +for reg in registers:
> > +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> > +
> > +print("")
> > +
> > +print(
> > +    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
> > +)
> > +for mask_insn in mask_insns:
> > +    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
> > +print("")
> > +
> > +print("#ifndef REG_WIDTH")
> > +print("# define REG_WIDTH VEC_SIZE")
> > +print("#endif")
> > +print("")
> > +print("#define VPASTER(x, y)\tx##_##y")
> > +print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
> > +print("")
> > +print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
> > +print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
> > +print("")
> > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> > +print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
> > +
> > +print("\n#endif")
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread
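
A sketch of the REG_WIDTH selection described above; the include names come from this series, but the wide-char variant and its value are hypothetical, not part of the patch set:

```
/* Hypothetical wcslen-evex512 style setup: only VEC_SIZE / 4 wchar
   lanes land in the mask, so a 32-bit GPR is wide enough even with
   zmm vectors.  REG_WIDTH must be defined before reg-macros.h.  */
#define REG_WIDTH	32
#include "evex512-vecs.h"	/* VEC_SIZE = 64 */
#include "reg-macros.h"		/* KMOV -> kmovd, VRCX -> ecx, VRAX -> eax */

/* For the plain strlen-evex512 build, REG_WIDTH is simply left
   undefined and defaults to VEC_SIZE (64), giving kmovq/rcx/rax.  */
```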

* Re: [PATCH v5 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 22:01       ` Noah Goldstein
@ 2022-10-14 22:05         ` H.J. Lu
  2022-10-14 22:27           ` Noah Goldstein
  0 siblings, 1 reply; 72+ messages in thread
From: H.J. Lu @ 2022-10-14 22:05 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

 On Fri, Oct 14, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Oct 14, 2022 at 4:28 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Oct 14, 2022 at 2:15 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > This is to make it easier to do think like:
> > > ```
> > > vpcmpb %VEC(0), %VEC(1), %k0
> > > kmov{d|q} %k0, %{eax|rax}
> > > test %{eax|rax}
> > > ```
> > >
> > > It adds macro s.t any GPR can get the proper width with:
> > >     `V{upper_case_GPR_name}`
> > >
> > > and any mask insn can get the proper width with:
> > >     `{mask_insn_without_postfix}V`
> > >
> > > This commit does not change libc.so
> > >
> > > Tested build on x86-64
> > > ---
> > >  sysdeps/x86_64/multiarch/reg-macros.h         | 166 ++++++++++++++++++
> > >  .../multiarch/scripts/gen-reg-macros.py       | 123 +++++++++++++
> > >  2 files changed, 289 insertions(+)
> > >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> > >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > > new file mode 100644
> > > index 0000000000..16168b6fda
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> > > @@ -0,0 +1,166 @@
> > > +/* This file was generated by: gen-reg-macros.py.
> > > +
> > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#ifndef _REG_MACROS_H
> > > +#define _REG_MACROS_H  1
> > > +
> > > +#define rax_8  al
> > > +#define rax_16 ax
> > > +#define rax_32 eax
> > > +#define rax_64 rax
> > > +#define rbx_8  bl
> > > +#define rbx_16 bx
> > > +#define rbx_32 ebx
> > > +#define rbx_64 rbx
> > > +#define rcx_8  cl
> > > +#define rcx_16 cx
> > > +#define rcx_32 ecx
> > > +#define rcx_64 rcx
> > > +#define rdx_8  dl
> > > +#define rdx_16 dx
> > > +#define rdx_32 edx
> > > +#define rdx_64 rdx
> > > +#define rbp_8  bpl
> > > +#define rbp_16 bp
> > > +#define rbp_32 ebp
> > > +#define rbp_64 rbp
> > > +#define rsp_8  spl
> > > +#define rsp_16 sp
> > > +#define rsp_32 esp
> > > +#define rsp_64 rsp
> > > +#define rsi_8  sil
> > > +#define rsi_16 si
> > > +#define rsi_32 esi
> > > +#define rsi_64 rsi
> > > +#define rdi_8  dil
> > > +#define rdi_16 di
> > > +#define rdi_32 edi
> > > +#define rdi_64 rdi
> > > +#define r8_8   r8b
> > > +#define r8_16  r8w
> > > +#define r8_32  r8d
> > > +#define r8_64  r8
> > > +#define r9_8   r9b
> > > +#define r9_16  r9w
> > > +#define r9_32  r9d
> > > +#define r9_64  r9
> > > +#define r10_8  r10b
> > > +#define r10_16 r10w
> > > +#define r10_32 r10d
> > > +#define r10_64 r10
> > > +#define r11_8  r11b
> > > +#define r11_16 r11w
> > > +#define r11_32 r11d
> > > +#define r11_64 r11
> > > +#define r12_8  r12b
> > > +#define r12_16 r12w
> > > +#define r12_32 r12d
> > > +#define r12_64 r12
> > > +#define r13_8  r13b
> > > +#define r13_16 r13w
> > > +#define r13_32 r13d
> > > +#define r13_64 r13
> > > +#define r14_8  r14b
> > > +#define r14_16 r14w
> > > +#define r14_32 r14d
> > > +#define r14_64 r14
> > > +#define r15_8  r15b
> > > +#define r15_16 r15w
> > > +#define r15_32 r15d
> > > +#define r15_64 r15
> > > +
> > > +#define kmov_8 kmovb
> > > +#define kmov_16        kmovw
> > > +#define kmov_32        kmovd
> > > +#define kmov_64        kmovq
> > > +#define kortest_8      kortestb
> > > +#define kortest_16     kortestw
> > > +#define kortest_32     kortestd
> > > +#define kortest_64     kortestq
> > > +#define kor_8  korb
> > > +#define kor_16 korw
> > > +#define kor_32 kord
> > > +#define kor_64 korq
> > > +#define ktest_8        ktestb
> > > +#define ktest_16       ktestw
> > > +#define ktest_32       ktestd
> > > +#define ktest_64       ktestq
> > > +#define kand_8 kandb
> > > +#define kand_16        kandw
> > > +#define kand_32        kandd
> > > +#define kand_64        kandq
> > > +#define kxor_8 kxorb
> > > +#define kxor_16        kxorw
> > > +#define kxor_32        kxord
> > > +#define kxor_64        kxorq
> > > +#define knot_8 knotb
> > > +#define knot_16        knotw
> > > +#define knot_32        knotd
> > > +#define knot_64        knotq
> > > +#define kxnor_8        kxnorb
> > > +#define kxnor_16       kxnorw
> > > +#define kxnor_32       kxnord
> > > +#define kxnor_64       kxnorq
> > > +#define kunpack_8      kunpackbw
> > > +#define kunpack_16     kunpackwd
> > > +#define kunpack_32     kunpackdq
> > > +
> > > +/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
> > > +#define VRAX   VGPR(rax)
> > > +#define VRBX   VGPR(rbx)
> > > +#define VRCX   VGPR(rcx)
> > > +#define VRDX   VGPR(rdx)
> > > +#define VRBP   VGPR(rbp)
> > > +#define VRSP   VGPR(rsp)
> > > +#define VRSI   VGPR(rsi)
> > > +#define VRDI   VGPR(rdi)
> > > +#define VR8    VGPR(r8)
> > > +#define VR9    VGPR(r9)
> > > +#define VR10   VGPR(r10)
> > > +#define VR11   VGPR(r11)
> > > +#define VR12   VGPR(r12)
> > > +#define VR13   VGPR(r13)
> > > +#define VR14   VGPR(r14)
> > > +#define VR15   VGPR(r15)
> > > +
> > > +/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
> > > +#define KMOV   VKINSN(kmov)
> > > +#define KORTEST        VKINSN(kortest)
> > > +#define KOR    VKINSN(kor)
> > > +#define KTEST  VKINSN(ktest)
> > > +#define KAND   VKINSN(kand)
> > > +#define KXOR   VKINSN(kxor)
> > > +#define KNOT   VKINSN(knot)
> > > +#define KXNOR  VKINSN(kxnor)
> > > +#define KUNPACK        VKINSN(kunpack)
> > > +
> > > +#ifndef REG_WIDTH
> > > +# define REG_WIDTH VEC_SIZE
> > > +#endif
> >
> > Which files will define REG_WIDTH?  What values will it be for
> > YMM and ZMM vectors?
>
> for non-wide char evex or avx2/sse2 REG_WIDTH = VEC_SIZE
> so for YMM REG_WIDTH = 32, for ZMM REG_WIDTH = 64.
>
> For wchar impls REG_WIDTH will often be 32 irrelivant of YMM/ZMM.

Then we should have

#ifdef USE_WIDE_CHAR
# define REG_WIDTH 32
#else
# define REG_WIDTH VEC_SIZE
#endif

> >
> > > +#define VPASTER(x, y)  x##_##y
> > > +#define VEVALUATOR(x, y)       VPASTER(x, y)
> > > +
> > > +#define VGPR_SZ(reg_name, reg_size)    VEVALUATOR(reg_name, reg_size)
> > > +#define VKINSN_SZ(insn, reg_size)      VEVALUATOR(insn, reg_size)
> > > +
> > > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> > > +#define VKINSN(mask_insn)      VKINSN_SZ(mask_insn, REG_WIDTH)
> > > +
> > > +#endif
> > > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > new file mode 100644
> > > index 0000000000..c7296a8104
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > @@ -0,0 +1,123 @@
> > > +#!/usr/bin/python3
> > > +# Copyright (C) 2022 Free Software Foundation, Inc.
> > > +# This file is part of the GNU C Library.
> > > +#
> > > +# The GNU C Library is free software; you can redistribute it and/or
> > > +# modify it under the terms of the GNU Lesser General Public
> > > +# License as published by the Free Software Foundation; either
> > > +# version 2.1 of the License, or (at your option) any later version.
> > > +#
> > > +# The GNU C Library is distributed in the hope that it will be useful,
> > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +# Lesser General Public License for more details.
> > > +#
> > > +# You should have received a copy of the GNU Lesser General Public
> > > +# License along with the GNU C Library; if not, see
> > > +# <https://www.gnu.org/licenses/>.
> > > +"""Generate macros for getting GPR name of a certain size
> > > +
> > > +Inputs: None
> > > +Output: Prints header fill to stdout
> > > +
> > > +API:
> > > +    VGPR(reg_name)
> > > +        - Get register name VEC_SIZE component of `reg_name`
> > > +    VGPR_SZ(reg_name, reg_size)
> > > +        - Get register name `reg_size` component of `reg_name`
> > > +"""
> > > +
> > > +import sys
> > > +import os
> > > +from datetime import datetime
> > > +
> > > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> > > +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> > > +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> > > +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> > > +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> > > +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> > > +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> > > +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> > > +
> > > +mask_insns = [
> > > +    "kmov",
> > > +    "kortest",
> > > +    "kor",
> > > +    "ktest",
> > > +    "kand",
> > > +    "kxor",
> > > +    "knot",
> > > +    "kxnor",
> > > +]
> > > +mask_insns_ext = ["b", "w", "d", "q"]
> > > +
> > > +cr = """
> > > +   Copyright (C) {} Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +"""
> > > +
> > > +print("/* This file was generated by: {}.".format(os.path.basename(
> > > +    sys.argv[0])))
> > > +print(cr.format(datetime.today().year))
> > > +
> > > +print("#ifndef _REG_MACROS_H")
> > > +print("#define _REG_MACROS_H\t1")
> > > +print("")
> > > +for reg in registers:
> > > +    for i in range(0, 4):
> > > +        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
> > > +
> > > +print("")
> > > +for mask_insn in mask_insns:
> > > +    for i in range(0, 4):
> > > +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> > > +                                           mask_insns_ext[i]))
> > > +for i in range(0, 3):
> > > +    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
> > > +                                                   mask_insns_ext[i + 1]))
> > > +mask_insns.append("kunpack")
> > > +
> > > +print("")
> > > +print(
> > > +    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
> > > +for reg in registers:
> > > +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> > > +
> > > +print("")
> > > +
> > > +print(
> > > +    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
> > > +)
> > > +for mask_insn in mask_insns:
> > > +    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
> > > +print("")
> > > +
> > > +print("#ifndef REG_WIDTH")
> > > +print("# define REG_WIDTH VEC_SIZE")
> > > +print("#endif")
> > > +print("")
> > > +print("#define VPASTER(x, y)\tx##_##y")
> > > +print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
> > > +print("")
> > > +print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
> > > +print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
> > > +print("")
> > > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> > > +print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
> > > +
> > > +print("\n#endif")
> > > --
> > > 2.34.1
> > >
> >
> >
> > --
> > H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v5 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 22:05         ` H.J. Lu
@ 2022-10-14 22:27           ` Noah Goldstein
  2022-10-14 22:41             ` H.J. Lu
  0 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 22:27 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 5:06 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
>  On Fri, Oct 14, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Oct 14, 2022 at 4:28 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Oct 14, 2022 at 2:15 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > This is to make it easier to do think like:
> > > > ```
> > > > vpcmpb %VEC(0), %VEC(1), %k0
> > > > kmov{d|q} %k0, %{eax|rax}
> > > > test %{eax|rax}
> > > > ```
> > > >
> > > > It adds macro s.t any GPR can get the proper width with:
> > > >     `V{upper_case_GPR_name}`
> > > >
> > > > and any mask insn can get the proper width with:
> > > >     `{mask_insn_without_postfix}V`
> > > >
> > > > This commit does not change libc.so
> > > >
> > > > Tested build on x86-64
> > > > ---
> > > >  sysdeps/x86_64/multiarch/reg-macros.h         | 166 ++++++++++++++++++
> > > >  .../multiarch/scripts/gen-reg-macros.py       | 123 +++++++++++++
> > > >  2 files changed, 289 insertions(+)
> > > >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> > > >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > new file mode 100644
> > > > index 0000000000..16168b6fda
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > @@ -0,0 +1,166 @@
> > > > +/* This file was generated by: gen-reg-macros.py.
> > > > +
> > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#ifndef _REG_MACROS_H
> > > > +#define _REG_MACROS_H  1
> > > > +
> > > > +#define rax_8  al
> > > > +#define rax_16 ax
> > > > +#define rax_32 eax
> > > > +#define rax_64 rax
> > > > +#define rbx_8  bl
> > > > +#define rbx_16 bx
> > > > +#define rbx_32 ebx
> > > > +#define rbx_64 rbx
> > > > +#define rcx_8  cl
> > > > +#define rcx_16 cx
> > > > +#define rcx_32 ecx
> > > > +#define rcx_64 rcx
> > > > +#define rdx_8  dl
> > > > +#define rdx_16 dx
> > > > +#define rdx_32 edx
> > > > +#define rdx_64 rdx
> > > > +#define rbp_8  bpl
> > > > +#define rbp_16 bp
> > > > +#define rbp_32 ebp
> > > > +#define rbp_64 rbp
> > > > +#define rsp_8  spl
> > > > +#define rsp_16 sp
> > > > +#define rsp_32 esp
> > > > +#define rsp_64 rsp
> > > > +#define rsi_8  sil
> > > > +#define rsi_16 si
> > > > +#define rsi_32 esi
> > > > +#define rsi_64 rsi
> > > > +#define rdi_8  dil
> > > > +#define rdi_16 di
> > > > +#define rdi_32 edi
> > > > +#define rdi_64 rdi
> > > > +#define r8_8   r8b
> > > > +#define r8_16  r8w
> > > > +#define r8_32  r8d
> > > > +#define r8_64  r8
> > > > +#define r9_8   r9b
> > > > +#define r9_16  r9w
> > > > +#define r9_32  r9d
> > > > +#define r9_64  r9
> > > > +#define r10_8  r10b
> > > > +#define r10_16 r10w
> > > > +#define r10_32 r10d
> > > > +#define r10_64 r10
> > > > +#define r11_8  r11b
> > > > +#define r11_16 r11w
> > > > +#define r11_32 r11d
> > > > +#define r11_64 r11
> > > > +#define r12_8  r12b
> > > > +#define r12_16 r12w
> > > > +#define r12_32 r12d
> > > > +#define r12_64 r12
> > > > +#define r13_8  r13b
> > > > +#define r13_16 r13w
> > > > +#define r13_32 r13d
> > > > +#define r13_64 r13
> > > > +#define r14_8  r14b
> > > > +#define r14_16 r14w
> > > > +#define r14_32 r14d
> > > > +#define r14_64 r14
> > > > +#define r15_8  r15b
> > > > +#define r15_16 r15w
> > > > +#define r15_32 r15d
> > > > +#define r15_64 r15
> > > > +
> > > > +#define kmov_8 kmovb
> > > > +#define kmov_16        kmovw
> > > > +#define kmov_32        kmovd
> > > > +#define kmov_64        kmovq
> > > > +#define kortest_8      kortestb
> > > > +#define kortest_16     kortestw
> > > > +#define kortest_32     kortestd
> > > > +#define kortest_64     kortestq
> > > > +#define kor_8  korb
> > > > +#define kor_16 korw
> > > > +#define kor_32 kord
> > > > +#define kor_64 korq
> > > > +#define ktest_8        ktestb
> > > > +#define ktest_16       ktestw
> > > > +#define ktest_32       ktestd
> > > > +#define ktest_64       ktestq
> > > > +#define kand_8 kandb
> > > > +#define kand_16        kandw
> > > > +#define kand_32        kandd
> > > > +#define kand_64        kandq
> > > > +#define kxor_8 kxorb
> > > > +#define kxor_16        kxorw
> > > > +#define kxor_32        kxord
> > > > +#define kxor_64        kxorq
> > > > +#define knot_8 knotb
> > > > +#define knot_16        knotw
> > > > +#define knot_32        knotd
> > > > +#define knot_64        knotq
> > > > +#define kxnor_8        kxnorb
> > > > +#define kxnor_16       kxnorw
> > > > +#define kxnor_32       kxnord
> > > > +#define kxnor_64       kxnorq
> > > > +#define kunpack_8      kunpackbw
> > > > +#define kunpack_16     kunpackwd
> > > > +#define kunpack_32     kunpackdq
> > > > +
> > > > +/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
> > > > +#define VRAX   VGPR(rax)
> > > > +#define VRBX   VGPR(rbx)
> > > > +#define VRCX   VGPR(rcx)
> > > > +#define VRDX   VGPR(rdx)
> > > > +#define VRBP   VGPR(rbp)
> > > > +#define VRSP   VGPR(rsp)
> > > > +#define VRSI   VGPR(rsi)
> > > > +#define VRDI   VGPR(rdi)
> > > > +#define VR8    VGPR(r8)
> > > > +#define VR9    VGPR(r9)
> > > > +#define VR10   VGPR(r10)
> > > > +#define VR11   VGPR(r11)
> > > > +#define VR12   VGPR(r12)
> > > > +#define VR13   VGPR(r13)
> > > > +#define VR14   VGPR(r14)
> > > > +#define VR15   VGPR(r15)
> > > > +
> > > > +/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
> > > > +#define KMOV   VKINSN(kmov)
> > > > +#define KORTEST        VKINSN(kortest)
> > > > +#define KOR    VKINSN(kor)
> > > > +#define KTEST  VKINSN(ktest)
> > > > +#define KAND   VKINSN(kand)
> > > > +#define KXOR   VKINSN(kxor)
> > > > +#define KNOT   VKINSN(knot)
> > > > +#define KXNOR  VKINSN(kxnor)
> > > > +#define KUNPACK        VKINSN(kunpack)
> > > > +
> > > > +#ifndef REG_WIDTH
> > > > +# define REG_WIDTH VEC_SIZE
> > > > +#endif
> > >
> > > Which files will define REG_WIDTH?  What values will it be for
> > > YMM and ZMM vectors?
> >
> > for non-wide char evex or avx2/sse2 REG_WIDTH = VEC_SIZE
> > so for YMM REG_WIDTH = 32, for ZMM REG_WIDTH = 64.
> >
> > For wchar impls REG_WIDTH will often be 32 irrelivant of YMM/ZMM.
>
> Then we should have
>
> #ifdef USE_WIDE_CHAR
> # define REG_WIDTH 32
> #else
> # define REG_WIDTH VEC_SIZE
> #endif
>

It may not be universal.  Some wide-char impls may want
REG_WIDTH == 8/16 if they rely heavily on `inc` to do the zero test, or
for one reason or another may use the full VEC_SIZE (as wcslen-evex512
currently does).

Also, I don't really see what it saves to give up the granularity.
Either way, to specify a separate reg width the wchar impl will
need to define something else.  It seems reasonable for that
something else to just be REG_WIDTH directly as opposed to
USE_WIDE_CHAR (see the sketch after this message).

What do you think?
> > >
> > > > +#define VPASTER(x, y)  x##_##y
> > > > +#define VEVALUATOR(x, y)       VPASTER(x, y)
> > > > +
> > > > +#define VGPR_SZ(reg_name, reg_size)    VEVALUATOR(reg_name, reg_size)
> > > > +#define VKINSN_SZ(insn, reg_size)      VEVALUATOR(insn, reg_size)
> > > > +
> > > > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> > > > +#define VKINSN(mask_insn)      VKINSN_SZ(mask_insn, REG_WIDTH)
> > > > +
> > > > +#endif
> > > > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > new file mode 100644
> > > > index 0000000000..c7296a8104
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > @@ -0,0 +1,123 @@
> > > > +#!/usr/bin/python3
> > > > +# Copyright (C) 2022 Free Software Foundation, Inc.
> > > > +# This file is part of the GNU C Library.
> > > > +#
> > > > +# The GNU C Library is free software; you can redistribute it and/or
> > > > +# modify it under the terms of the GNU Lesser General Public
> > > > +# License as published by the Free Software Foundation; either
> > > > +# version 2.1 of the License, or (at your option) any later version.
> > > > +#
> > > > +# The GNU C Library is distributed in the hope that it will be useful,
> > > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +# Lesser General Public License for more details.
> > > > +#
> > > > +# You should have received a copy of the GNU Lesser General Public
> > > > +# License along with the GNU C Library; if not, see
> > > > +# <https://www.gnu.org/licenses/>.
> > > > +"""Generate macros for getting GPR name of a certain size
> > > > +
> > > > +Inputs: None
> > > > +Output: Prints header fill to stdout
> > > > +
> > > > +API:
> > > > +    VGPR(reg_name)
> > > > +        - Get register name VEC_SIZE component of `reg_name`
> > > > +    VGPR_SZ(reg_name, reg_size)
> > > > +        - Get register name `reg_size` component of `reg_name`
> > > > +"""
> > > > +
> > > > +import sys
> > > > +import os
> > > > +from datetime import datetime
> > > > +
> > > > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> > > > +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> > > > +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> > > > +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> > > > +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> > > > +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> > > > +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> > > > +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> > > > +
> > > > +mask_insns = [
> > > > +    "kmov",
> > > > +    "kortest",
> > > > +    "kor",
> > > > +    "ktest",
> > > > +    "kand",
> > > > +    "kxor",
> > > > +    "knot",
> > > > +    "kxnor",
> > > > +]
> > > > +mask_insns_ext = ["b", "w", "d", "q"]
> > > > +
> > > > +cr = """
> > > > +   Copyright (C) {} Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +"""
> > > > +
> > > > +print("/* This file was generated by: {}.".format(os.path.basename(
> > > > +    sys.argv[0])))
> > > > +print(cr.format(datetime.today().year))
> > > > +
> > > > +print("#ifndef _REG_MACROS_H")
> > > > +print("#define _REG_MACROS_H\t1")
> > > > +print("")
> > > > +for reg in registers:
> > > > +    for i in range(0, 4):
> > > > +        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
> > > > +
> > > > +print("")
> > > > +for mask_insn in mask_insns:
> > > > +    for i in range(0, 4):
> > > > +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> > > > +                                           mask_insns_ext[i]))
> > > > +for i in range(0, 3):
> > > > +    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
> > > > +                                                   mask_insns_ext[i + 1]))
> > > > +mask_insns.append("kunpack")
> > > > +
> > > > +print("")
> > > > +print(
> > > > +    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
> > > > +for reg in registers:
> > > > +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> > > > +
> > > > +print("")
> > > > +
> > > > +print(
> > > > +    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
> > > > +)
> > > > +for mask_insn in mask_insns:
> > > > +    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
> > > > +print("")
> > > > +
> > > > +print("#ifndef REG_WIDTH")
> > > > +print("# define REG_WIDTH VEC_SIZE")
> > > > +print("#endif")
> > > > +print("")
> > > > +print("#define VPASTER(x, y)\tx##_##y")
> > > > +print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
> > > > +print("")
> > > > +print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
> > > > +print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
> > > > +print("")
> > > > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> > > > +print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
> > > > +
> > > > +print("\n#endif")
> > > > --
> > > > 2.34.1
> > > >
> > >
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread
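
A sketch of the finer granularity argued for above; the REG_WIDTH value and the inc-based test are illustrative, not taken from the patch set:

```
/* Hypothetical wide-char setup: a zmm vector holds 16 wchar lanes, so
   only the low 16 mask bits matter and `inc' on a 16-bit GPR can serve
   as the "all lanes matched" test.  */
#define REG_WIDTH	16
#include "evex512-vecs.h"
#include "reg-macros.h"

	/* kmov{b,w} still take a 32-bit GPR operand, so request that
	   width explicitly; VRCX below is the 16-bit view (%cx).  */
	kmovw	%k0, %VGPR_SZ(rcx, 32)	/* kmovw %k0, %ecx */
	inc	%VRCX			/* incw %cx: wraps 0xffff -> 0, so ZF
					   is set exactly when all 16 lanes
					   compared true */
	jz	L(loop)

/* A USE_WIDE_CHAR switch inside reg-macros.h could only choose between
   32 and VEC_SIZE; defining REG_WIDTH directly also covers these
   8/16-bit cases.  */
```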

* [PATCH v6 1/7] x86: Update and move evex256/512 vec macros
  2022-10-14 16:40 [PATCH v1 1/3] x86: Update evex256/512 vec macros Noah Goldstein
                   ` (6 preceding siblings ...)
  2022-10-14 21:14 ` [PATCH v5 1/3] x86: Update evex256/512 vec macros Noah Goldstein
@ 2022-10-14 22:39 ` Noah Goldstein
  2022-10-14 22:39   ` [PATCH v6 2/7] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
                     ` (5 more replies)
  2022-10-15  0:06 ` [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls Noah Goldstein
                   ` (2 subsequent siblings)
  10 siblings, 6 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 22:39 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

1) Copy the macros so that backporting will be easier.
2) Make section only define if there is not a previous definition
3) Add `VEC_lo` definition for proper reg-width but in the
   ymm/zmm0-15 range.

This commit does not change libc.so

Tested build on x86-64
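
A rough sketch of how the new definitions compose (illustration only,
not part of the patch), taking "x86-evex256-vecs.h" as the included
config:
```
VMM(1)     -> VMM_256(1)     -> VMM_hi_ymm(1) -> ymm17   (EVEX-only registers)
VMM_lo(1)  -> VMM_any_ymm(1) -> ymm1                     (ymm0-15, VEX-encodable)
VMM_128(1) -> VMM_hi_xmm(1)  -> xmm17
```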
---
 sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h   | 35 ++++++++
 sysdeps/x86_64/multiarch/x86-avx-vecs.h       | 47 ++++++++++
 .../x86_64/multiarch/x86-evex-vecs-common.h   | 39 ++++++++
 sysdeps/x86_64/multiarch/x86-evex256-vecs.h   | 38 ++++++++
 sysdeps/x86_64/multiarch/x86-evex512-vecs.h   | 38 ++++++++
 sysdeps/x86_64/multiarch/x86-sse2-vecs.h      | 47 ++++++++++
 sysdeps/x86_64/multiarch/x86-vec-macros.h     | 90 +++++++++++++++++++
 7 files changed, 334 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-evex256-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-evex512-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-sse2-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-vec-macros.h

diff --git a/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
new file mode 100644
index 0000000000..0b326c8a70
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
@@ -0,0 +1,35 @@
+/* Common config for AVX-RTM VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX_RTM_VECS_H
+#define _X86_AVX_RTM_VECS_H			1
+
+#define COND_VZEROUPPER			COND_VZEROUPPER_XTEST
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN		jmp L(return_vzeroupper)
+
+#define USE_WITH_RTM			1
+#include "x86-avx-vecs.h"
+
+#undef SECTION
+#define SECTION(p)				p##.avx.rtm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
new file mode 100644
index 0000000000..dca1089060
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for AVX VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX_VECS_H
+#define _X86_AVX_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			32
+#include "x86-vec-macros.h"
+
+#define USE_WITH_AVX		1
+#define SECTION(p)			p##.avx
+
+/* 4-byte mov instructions with AVX2.  */
+#define MOV_SIZE			4
+/* 1 (ret) + 3 (vzeroupper).  */
+#define RET_SIZE			4
+#define VZEROUPPER			vzeroupper
+
+#define VMOVU				vmovdqu
+#define VMOVA				vmovdqa
+#define VMOVNT				vmovntdq
+
+/* Often need to access xmm portion.  */
+#define VMM_128				VMM_any_xmm
+#define VMM					VMM_any_ymm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
new file mode 100644
index 0000000000..f331e9d8ec
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
@@ -0,0 +1,39 @@
+/* Common config for EVEX256 and EVEX512 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_EVEX_VECS_COMMON_H
+#define _X86_EVEX_VECS_COMMON_H			1
+
+#include "x86-vec-macros.h"
+
+/* 6-byte mov instructions with EVEX.  */
+#define MOV_SIZE			6
+/* No vzeroupper needed.  */
+#define RET_SIZE			1
+#define VZEROUPPER
+
+#define VMOVU				vmovdqu64
+#define VMOVA				vmovdqa64
+#define VMOVNT				vmovntdq
+
+#define VMM_128				VMM_hi_xmm
+#define VMM_256				VMM_hi_ymm
+#define VMM_512				VMM_hi_zmm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-evex256-vecs.h b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
new file mode 100644
index 0000000000..8337b95504
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
@@ -0,0 +1,38 @@
+/* Common config for EVEX256 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _EVEX256_VECS_H
+#define _EVEX256_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			32
+#include "x86-evex-vecs-common.h"
+
+#define USE_WITH_EVEX256	1
+
+#ifndef SECTION
+# define SECTION(p)			p##.evex
+#endif
+
+#define VMM					VMM_256
+#define VMM_lo				VMM_any_ymm
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-evex512-vecs.h b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
new file mode 100644
index 0000000000..7dc5c23ad0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
@@ -0,0 +1,38 @@
+/* Common config for EVEX512 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _EVEX512_VECS_H
+#define _EVEX512_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			64
+#include "x86-evex-vecs-common.h"
+
+#define USE_WITH_EVEX512	1
+
+#ifndef SECTION
+# define SECTION(p)			p##.evex512
+#endif
+
+#define VMM					VMM_512
+#define VMM_lo				VMM_any_zmm
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-sse2-vecs.h b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
new file mode 100644
index 0000000000..b8bbd5dc29
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for SSE2 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_SSE2_VECS_H
+#define _X86_SSE2_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			16
+#include "x86-vec-macros.h"
+
+#define USE_WITH_SSE2		1
+#define SECTION(p)			p
+
+/* 3-byte mov instructions with SSE2.  */
+#define MOV_SIZE			3
+/* No vzeroupper needed.  */
+#define RET_SIZE			1
+#define VZEROUPPER
+
+#define VMOVU				movups
+#define VMOVA				movaps
+#define VMOVNT				movntdq
+
+#define VMM_128				VMM_any_xmm
+#define VMM					VMM_any_xmm
+
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-vec-macros.h b/sysdeps/x86_64/multiarch/x86-vec-macros.h
new file mode 100644
index 0000000000..7d6bb31d55
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-vec-macros.h
@@ -0,0 +1,90 @@
+/* Macro helpers for VEC_{type}({vec_num})
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_VEC_MACROS_H
+#define _X86_VEC_MACROS_H			1
+
+#ifndef VEC_SIZE
+# error "Never include this file directly. Always include a vector config."
+#endif
+
+/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
+   VMM(N) values.  */
+#define VMM_hi_xmm0				xmm16
+#define VMM_hi_xmm1				xmm17
+#define VMM_hi_xmm2				xmm18
+#define VMM_hi_xmm3				xmm19
+#define VMM_hi_xmm4				xmm20
+#define VMM_hi_xmm5				xmm21
+#define VMM_hi_xmm6				xmm22
+#define VMM_hi_xmm7				xmm23
+#define VMM_hi_xmm8				xmm24
+#define VMM_hi_xmm9				xmm25
+#define VMM_hi_xmm10			xmm26
+#define VMM_hi_xmm11			xmm27
+#define VMM_hi_xmm12			xmm28
+#define VMM_hi_xmm13			xmm29
+#define VMM_hi_xmm14			xmm30
+#define VMM_hi_xmm15			xmm31
+
+#define VMM_hi_ymm0				ymm16
+#define VMM_hi_ymm1				ymm17
+#define VMM_hi_ymm2				ymm18
+#define VMM_hi_ymm3				ymm19
+#define VMM_hi_ymm4				ymm20
+#define VMM_hi_ymm5				ymm21
+#define VMM_hi_ymm6				ymm22
+#define VMM_hi_ymm7				ymm23
+#define VMM_hi_ymm8				ymm24
+#define VMM_hi_ymm9				ymm25
+#define VMM_hi_ymm10			ymm26
+#define VMM_hi_ymm11			ymm27
+#define VMM_hi_ymm12			ymm28
+#define VMM_hi_ymm13			ymm29
+#define VMM_hi_ymm14			ymm30
+#define VMM_hi_ymm15			ymm31
+
+#define VMM_hi_zmm0				zmm16
+#define VMM_hi_zmm1				zmm17
+#define VMM_hi_zmm2				zmm18
+#define VMM_hi_zmm3				zmm19
+#define VMM_hi_zmm4				zmm20
+#define VMM_hi_zmm5				zmm21
+#define VMM_hi_zmm6				zmm22
+#define VMM_hi_zmm7				zmm23
+#define VMM_hi_zmm8				zmm24
+#define VMM_hi_zmm9				zmm25
+#define VMM_hi_zmm10			zmm26
+#define VMM_hi_zmm11			zmm27
+#define VMM_hi_zmm12			zmm28
+#define VMM_hi_zmm13			zmm29
+#define VMM_hi_zmm14			zmm30
+#define VMM_hi_zmm15			zmm31
+
+#define PRIMITIVE_VMM(vec, num)		vec##num
+
+#define VMM_any_xmm(i)			PRIMITIVE_VMM(xmm, i)
+#define VMM_any_ymm(i)			PRIMITIVE_VMM(ymm, i)
+#define VMM_any_zmm(i)			PRIMITIVE_VMM(zmm, i)
+
+#define VMM_hi_xmm(i)			PRIMITIVE_VMM(VMM_hi_xmm, i)
+#define VMM_hi_ymm(i)			PRIMITIVE_VMM(VMM_hi_ymm, i)
+#define VMM_hi_zmm(i)			PRIMITIVE_VMM(VMM_hi_zmm, i)
+
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v6 2/7] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 22:39 ` [PATCH v6 1/7] x86: Update and move evex256/512 vec macros Noah Goldstein
@ 2022-10-14 22:39   ` Noah Goldstein
  2022-10-14 22:39   ` [PATCH v6 3/7] x86: Update memrchr to use new VEC macros Noah Goldstein
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 22:39 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

This is to make it easier to write code like:
```
vpcmpb %VEC(0), %VEC(1), %k0
kmov{d|q} %k0, %{eax|rax}
test %{eax|rax}
```

It adds macros so that any GPR can be used at the proper width with:
    `V{upper_case_GPR_name}`

and any mask insn can be used at the proper width with:
    `{upper_case_mask_insn_without_postfix}`

This commit does not change libc.so

Tested build on x86-64
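
A minimal sketch of the intended use (illustration only; it assumes one
of the vec config headers from the previous patch is also included):
```
	vpcmpeqb %VMM(0), %VMM(1), %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
```
With REG_WIDTH == 32 (the default when VEC_SIZE is 32) KMOV/VRAX expand
to kmovd/eax, and with REG_WIDTH == 64 they expand to kmovq/rax.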
---
 sysdeps/x86_64/multiarch/reg-macros.h         | 166 ++++++++++++++++++
 .../multiarch/scripts/gen-reg-macros.py       | 123 +++++++++++++
 2 files changed, 289 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
 create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py

diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
new file mode 100644
index 0000000000..16168b6fda
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/reg-macros.h
@@ -0,0 +1,166 @@
+/* This file was generated by: gen-reg-macros.py.
+
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _REG_MACROS_H
+#define _REG_MACROS_H	1
+
+#define rax_8	al
+#define rax_16	ax
+#define rax_32	eax
+#define rax_64	rax
+#define rbx_8	bl
+#define rbx_16	bx
+#define rbx_32	ebx
+#define rbx_64	rbx
+#define rcx_8	cl
+#define rcx_16	cx
+#define rcx_32	ecx
+#define rcx_64	rcx
+#define rdx_8	dl
+#define rdx_16	dx
+#define rdx_32	edx
+#define rdx_64	rdx
+#define rbp_8	bpl
+#define rbp_16	bp
+#define rbp_32	ebp
+#define rbp_64	rbp
+#define rsp_8	spl
+#define rsp_16	sp
+#define rsp_32	esp
+#define rsp_64	rsp
+#define rsi_8	sil
+#define rsi_16	si
+#define rsi_32	esi
+#define rsi_64	rsi
+#define rdi_8	dil
+#define rdi_16	di
+#define rdi_32	edi
+#define rdi_64	rdi
+#define r8_8	r8b
+#define r8_16	r8w
+#define r8_32	r8d
+#define r8_64	r8
+#define r9_8	r9b
+#define r9_16	r9w
+#define r9_32	r9d
+#define r9_64	r9
+#define r10_8	r10b
+#define r10_16	r10w
+#define r10_32	r10d
+#define r10_64	r10
+#define r11_8	r11b
+#define r11_16	r11w
+#define r11_32	r11d
+#define r11_64	r11
+#define r12_8	r12b
+#define r12_16	r12w
+#define r12_32	r12d
+#define r12_64	r12
+#define r13_8	r13b
+#define r13_16	r13w
+#define r13_32	r13d
+#define r13_64	r13
+#define r14_8	r14b
+#define r14_16	r14w
+#define r14_32	r14d
+#define r14_64	r14
+#define r15_8	r15b
+#define r15_16	r15w
+#define r15_32	r15d
+#define r15_64	r15
+
+#define kmov_8	kmovb
+#define kmov_16	kmovw
+#define kmov_32	kmovd
+#define kmov_64	kmovq
+#define kortest_8	kortestb
+#define kortest_16	kortestw
+#define kortest_32	kortestd
+#define kortest_64	kortestq
+#define kor_8	korb
+#define kor_16	korw
+#define kor_32	kord
+#define kor_64	korq
+#define ktest_8	ktestb
+#define ktest_16	ktestw
+#define ktest_32	ktestd
+#define ktest_64	ktestq
+#define kand_8	kandb
+#define kand_16	kandw
+#define kand_32	kandd
+#define kand_64	kandq
+#define kxor_8	kxorb
+#define kxor_16	kxorw
+#define kxor_32	kxord
+#define kxor_64	kxorq
+#define knot_8	knotb
+#define knot_16	knotw
+#define knot_32	knotd
+#define knot_64	knotq
+#define kxnor_8	kxnorb
+#define kxnor_16	kxnorw
+#define kxnor_32	kxnord
+#define kxnor_64	kxnorq
+#define kunpack_8	kunpackbw
+#define kunpack_16	kunpackwd
+#define kunpack_32	kunpackdq
+
+/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
+#define VRAX	VGPR(rax)
+#define VRBX	VGPR(rbx)
+#define VRCX	VGPR(rcx)
+#define VRDX	VGPR(rdx)
+#define VRBP	VGPR(rbp)
+#define VRSP	VGPR(rsp)
+#define VRSI	VGPR(rsi)
+#define VRDI	VGPR(rdi)
+#define VR8	VGPR(r8)
+#define VR9	VGPR(r9)
+#define VR10	VGPR(r10)
+#define VR11	VGPR(r11)
+#define VR12	VGPR(r12)
+#define VR13	VGPR(r13)
+#define VR14	VGPR(r14)
+#define VR15	VGPR(r15)
+
+/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
+#define KMOV 	VKINSN(kmov)
+#define KORTEST 	VKINSN(kortest)
+#define KOR 	VKINSN(kor)
+#define KTEST 	VKINSN(ktest)
+#define KAND 	VKINSN(kand)
+#define KXOR 	VKINSN(kxor)
+#define KNOT 	VKINSN(knot)
+#define KXNOR 	VKINSN(kxnor)
+#define KUNPACK 	VKINSN(kunpack)
+
+#ifndef REG_WIDTH
+# define REG_WIDTH VEC_SIZE
+#endif
+
+#define VPASTER(x, y)	x##_##y
+#define VEVALUATOR(x, y)	VPASTER(x, y)
+
+#define VGPR_SZ(reg_name, reg_size)	VEVALUATOR(reg_name, reg_size)
+#define VKINSN_SZ(insn, reg_size)	VEVALUATOR(insn, reg_size)
+
+#define VGPR(reg_name)	VGPR_SZ(reg_name, REG_WIDTH)
+#define VKINSN(mask_insn)	VKINSN_SZ(mask_insn, REG_WIDTH)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
new file mode 100644
index 0000000000..c7296a8104
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python3
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+"""Generate macros for getting GPR name of a certain size
+
+Inputs: None
+Output: Prints header file to stdout
+
+API:
+    VGPR(reg_name)
+        - Get register name VEC_SIZE component of `reg_name`
+    VGPR_SZ(reg_name, reg_size)
+        - Get register name `reg_size` component of `reg_name`
+"""
+
+import sys
+import os
+from datetime import datetime
+
+registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
+             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
+             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
+             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
+             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
+             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
+             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
+             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
+
+mask_insns = [
+    "kmov",
+    "kortest",
+    "kor",
+    "ktest",
+    "kand",
+    "kxor",
+    "knot",
+    "kxnor",
+]
+mask_insns_ext = ["b", "w", "d", "q"]
+
+cr = """
+   Copyright (C) {} Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+"""
+
+print("/* This file was generated by: {}.".format(os.path.basename(
+    sys.argv[0])))
+print(cr.format(datetime.today().year))
+
+print("#ifndef _REG_MACROS_H")
+print("#define _REG_MACROS_H\t1")
+print("")
+for reg in registers:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
+
+print("")
+for mask_insn in mask_insns:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
+                                           mask_insns_ext[i]))
+for i in range(0, 3):
+    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
+                                                   mask_insns_ext[i + 1]))
+mask_insns.append("kunpack")
+
+print("")
+print(
+    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
+for reg in registers:
+    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
+
+print("")
+
+print(
+    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
+)
+for mask_insn in mask_insns:
+    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
+print("")
+
+print("#ifndef REG_WIDTH")
+print("# define REG_WIDTH VEC_SIZE")
+print("#endif")
+print("")
+print("#define VPASTER(x, y)\tx##_##y")
+print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
+print("")
+print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
+print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
+print("")
+print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
+print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
+
+print("\n#endif")
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v6 3/7] x86: Update memrchr to use new VEC macros
  2022-10-14 22:39 ` [PATCH v6 1/7] x86: Update and move evex256/512 vec macros Noah Goldstein
  2022-10-14 22:39   ` [PATCH v6 2/7] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
@ 2022-10-14 22:39   ` Noah Goldstein
  2022-10-14 22:39   ` [PATCH v6 4/7] x86: Remove now unused vec header macros Noah Goldstein
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 22:39 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Replace %VEC(n) -> %VMM(n)

This commit does not change libc.so

Tested build on x86-64
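
For reference, a small expansion sketch (derived from
"x86-evex256-vecs.h" added earlier in the series; not new code in this
patch) showing that the renamed macro still reaches the EVEX-only
register range:
```
	vpbroadcastb %esi, %VMMMATCH
	/* VMMMATCH == VMM(0) -> VMM_hi_ymm(0) -> ymm16, i.e. this assembles as:  */
	vpbroadcastb %esi, %ymm16
```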
---
 sysdeps/x86_64/multiarch/memrchr-evex.S | 42 ++++++++++++-------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index ea3a0a0a60..550b328c5a 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -21,7 +21,7 @@
 #if ISA_SHOULD_BUILD (4)
 
 # include <sysdep.h>
-# include "evex256-vecs.h"
+# include "x86-evex256-vecs.h"
 # if VEC_SIZE != 32
 #  error "VEC_SIZE != 32 unimplemented"
 # endif
@@ -31,7 +31,7 @@
 # endif
 
 # define PAGE_SIZE			4096
-# define VECMATCH			VEC(0)
+# define VMMMATCH			VMM(0)
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN(MEMRCHR, 6)
@@ -47,7 +47,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 	   correct page cross check and 2) it correctly sets up end ptr to be
 	   subtract by lzcnt aligned.  */
 	leaq	-1(%rdi, %rdx), %rax
-	vpbroadcastb %esi, %VECMATCH
+	vpbroadcastb %esi, %VMMMATCH
 
 	/* Check if we can load 1x VEC without cross a page.  */
 	testl	$(PAGE_SIZE - VEC_SIZE), %eax
@@ -55,7 +55,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 
 	/* Don't use rax for pointer here because EVEX has better encoding with
 	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
@@ -96,7 +96,7 @@ L(more_1x_vec):
 	movq	%rax, %rdx
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	subq	%rdi, %rdx
@@ -115,7 +115,7 @@ L(last_2x_vec):
 
 	/* Don't use rax for pointer here because EVEX has better encoding with
 	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
 	lzcntq	%rcx, %rcx
@@ -131,7 +131,7 @@ L(last_2x_vec):
 L(page_cross):
 	movq	%rax, %rsi
 	andq	$-VEC_SIZE, %rsi
-	vpcmpb	$0, (%rsi), %VECMATCH, %k0
+	vpcmpb	$0, (%rsi), %VMMMATCH, %k0
 	kmovd	%k0, %r8d
 	/* Shift out negative alignment (because we are starting from endptr and
 	   working backwards).  */
@@ -165,13 +165,13 @@ L(more_2x_vec):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x0_dec)
 
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1)
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	subq	$(VEC_SIZE * 4), %rdx
@@ -185,7 +185,7 @@ L(last_vec):
 
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	lzcntl	%ecx, %ecx
 	subq	$(VEC_SIZE * 3 + 1), %rax
@@ -220,7 +220,7 @@ L(more_4x_vec):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x2)
 
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	testl	%ecx, %ecx
@@ -243,17 +243,17 @@ L(more_4x_vec):
 L(loop_4x_vec):
 	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
 	   on).  */
-	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
 
 	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
-	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
-	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
-	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+	vpxorq	(VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
+	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
 
 	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
 	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
-	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
-	vptestnmb %VEC(3), %VEC(3), %k2
+	vpminub	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	vptestnmb %VMM(3), %VMM(3), %k2
 
 	/* Any 1s and we found CHAR.  */
 	kortestd %k2, %k4
@@ -270,7 +270,7 @@ L(loop_4x_vec):
 L(last_4x_vec):
 
 	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	cmpl	$(VEC_SIZE * 2), %edx
@@ -280,14 +280,14 @@ L(last_4x_vec):
 	jnz	L(ret_vec_x0_dec)
 
 
-	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1)
 
 	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	cmpl	$(VEC_SIZE * 3), %edx
@@ -309,7 +309,7 @@ L(loop_end):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x0_end)
 
-	vptestnmb %VEC(2), %VEC(2), %k0
+	vptestnmb %VMM(2), %VMM(2), %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1_end)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v6 4/7] x86: Remove now unused vec header macros.
  2022-10-14 22:39 ` [PATCH v6 1/7] x86: Update and move evex256/512 vec macros Noah Goldstein
  2022-10-14 22:39   ` [PATCH v6 2/7] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
  2022-10-14 22:39   ` [PATCH v6 3/7] x86: Update memrchr to use new VEC macros Noah Goldstein
@ 2022-10-14 22:39   ` Noah Goldstein
  2022-10-14 22:39   ` [PATCH v6 5/7] x86: Update memmove to use new VEC macros Noah Goldstein
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 22:39 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/avx-rtm-vecs.h     | 35 --------
 sysdeps/x86_64/multiarch/avx-vecs.h         | 47 -----------
 sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 ---------
 sysdeps/x86_64/multiarch/evex256-vecs.h     | 35 --------
 sysdeps/x86_64/multiarch/evex512-vecs.h     | 35 --------
 sysdeps/x86_64/multiarch/sse2-vecs.h        | 47 -----------
 sysdeps/x86_64/multiarch/vec-macros.h       | 90 ---------------------
 7 files changed, 328 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h
 delete mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/vec-macros.h

diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
deleted file mode 100644
index 6ca9f5e6ba..0000000000
--- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Common config for AVX-RTM VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _AVX_RTM_VECS_H
-#define _AVX_RTM_VECS_H			1
-
-#define COND_VZEROUPPER			COND_VZEROUPPER_XTEST
-#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
-	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN		jmp L(return_vzeroupper)
-
-#define USE_WITH_RTM			1
-#include "avx-vecs.h"
-
-#undef SECTION
-#define SECTION(p)				p##.avx.rtm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h
deleted file mode 100644
index 89680f5db8..0000000000
--- a/sysdeps/x86_64/multiarch/avx-vecs.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Common config for AVX VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _AVX_VECS_H
-#define _AVX_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			32
-#include "vec-macros.h"
-
-#define USE_WITH_AVX		1
-#define SECTION(p)			p##.avx
-
-/* 4-byte mov instructions with AVX2.  */
-#define MOV_SIZE			4
-/* 1 (ret) + 3 (vzeroupper).  */
-#define RET_SIZE			4
-#define VZEROUPPER			vzeroupper
-
-#define VMOVU				vmovdqu
-#define VMOVA				vmovdqa
-#define VMOVNT				vmovntdq
-
-/* Often need to access xmm portion.  */
-#define VEC_xmm				VEC_any_xmm
-#define VEC					VEC_any_ymm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h
deleted file mode 100644
index 99806ebcd7..0000000000
--- a/sysdeps/x86_64/multiarch/evex-vecs-common.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Common config for EVEX256 and EVEX512 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _EVEX_VECS_COMMON_H
-#define _EVEX_VECS_COMMON_H			1
-
-#include "vec-macros.h"
-
-/* 6-byte mov instructions with EVEX.  */
-#define MOV_SIZE			6
-/* No vzeroupper needed.  */
-#define RET_SIZE			1
-#define VZEROUPPER
-
-#define VMOVU				vmovdqu64
-#define VMOVA				vmovdqa64
-#define VMOVNT				vmovntdq
-
-#define VEC_xmm				VEC_hi_xmm
-#define VEC_ymm				VEC_hi_ymm
-#define VEC_zmm				VEC_hi_zmm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
deleted file mode 100644
index 222ba46dc7..0000000000
--- a/sysdeps/x86_64/multiarch/evex256-vecs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Common config for EVEX256 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _EVEX256_VECS_H
-#define _EVEX256_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			32
-#include "evex-vecs-common.h"
-
-#define USE_WITH_EVEX256	1
-#define SECTION(p)			p##.evex
-
-#define VEC					VEC_ymm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
deleted file mode 100644
index d1784d5368..0000000000
--- a/sysdeps/x86_64/multiarch/evex512-vecs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Common config for EVEX512 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _EVEX512_VECS_H
-#define _EVEX512_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			64
-#include "evex-vecs-common.h"
-
-#define USE_WITH_EVEX512	1
-#define SECTION(p)			p##.evex512
-
-#define VEC					VEC_zmm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h
deleted file mode 100644
index 2b77a59d56..0000000000
--- a/sysdeps/x86_64/multiarch/sse2-vecs.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Common config for SSE2 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _SSE2_VECS_H
-#define _SSE2_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			16
-#include "vec-macros.h"
-
-#define USE_WITH_SSE2		1
-#define SECTION(p)			p
-
-/* 3-byte mov instructions with SSE2.  */
-#define MOV_SIZE			3
-/* No vzeroupper needed.  */
-#define RET_SIZE			1
-#define VZEROUPPER
-
-#define VMOVU				movups
-#define VMOVA				movaps
-#define VMOVNT				movntdq
-
-#define VEC_xmm				VEC_any_xmm
-#define VEC					VEC_any_xmm
-
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h
deleted file mode 100644
index 9f3ffecede..0000000000
--- a/sysdeps/x86_64/multiarch/vec-macros.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Macro helpers for VEC_{type}({vec_num})
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _VEC_MACROS_H
-#define _VEC_MACROS_H			1
-
-#ifndef VEC_SIZE
-# error "Never include this file directly. Always include a vector config."
-#endif
-
-/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
-   VEC(N) values.  */
-#define VEC_hi_xmm0				xmm16
-#define VEC_hi_xmm1				xmm17
-#define VEC_hi_xmm2				xmm18
-#define VEC_hi_xmm3				xmm19
-#define VEC_hi_xmm4				xmm20
-#define VEC_hi_xmm5				xmm21
-#define VEC_hi_xmm6				xmm22
-#define VEC_hi_xmm7				xmm23
-#define VEC_hi_xmm8				xmm24
-#define VEC_hi_xmm9				xmm25
-#define VEC_hi_xmm10			xmm26
-#define VEC_hi_xmm11			xmm27
-#define VEC_hi_xmm12			xmm28
-#define VEC_hi_xmm13			xmm29
-#define VEC_hi_xmm14			xmm30
-#define VEC_hi_xmm15			xmm31
-
-#define VEC_hi_ymm0				ymm16
-#define VEC_hi_ymm1				ymm17
-#define VEC_hi_ymm2				ymm18
-#define VEC_hi_ymm3				ymm19
-#define VEC_hi_ymm4				ymm20
-#define VEC_hi_ymm5				ymm21
-#define VEC_hi_ymm6				ymm22
-#define VEC_hi_ymm7				ymm23
-#define VEC_hi_ymm8				ymm24
-#define VEC_hi_ymm9				ymm25
-#define VEC_hi_ymm10			ymm26
-#define VEC_hi_ymm11			ymm27
-#define VEC_hi_ymm12			ymm28
-#define VEC_hi_ymm13			ymm29
-#define VEC_hi_ymm14			ymm30
-#define VEC_hi_ymm15			ymm31
-
-#define VEC_hi_zmm0				zmm16
-#define VEC_hi_zmm1				zmm17
-#define VEC_hi_zmm2				zmm18
-#define VEC_hi_zmm3				zmm19
-#define VEC_hi_zmm4				zmm20
-#define VEC_hi_zmm5				zmm21
-#define VEC_hi_zmm6				zmm22
-#define VEC_hi_zmm7				zmm23
-#define VEC_hi_zmm8				zmm24
-#define VEC_hi_zmm9				zmm25
-#define VEC_hi_zmm10			zmm26
-#define VEC_hi_zmm11			zmm27
-#define VEC_hi_zmm12			zmm28
-#define VEC_hi_zmm13			zmm29
-#define VEC_hi_zmm14			zmm30
-#define VEC_hi_zmm15			zmm31
-
-#define PRIMITIVE_VEC(vec, num)		vec##num
-
-#define VEC_any_xmm(i)			PRIMITIVE_VEC(xmm, i)
-#define VEC_any_ymm(i)			PRIMITIVE_VEC(ymm, i)
-#define VEC_any_zmm(i)			PRIMITIVE_VEC(zmm, i)
-
-#define VEC_hi_xmm(i)			PRIMITIVE_VEC(VEC_hi_xmm, i)
-#define VEC_hi_ymm(i)			PRIMITIVE_VEC(VEC_hi_ymm, i)
-#define VEC_hi_zmm(i)			PRIMITIVE_VEC(VEC_hi_zmm, i)
-
-#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v6 5/7] x86: Update memmove to use new VEC macros
  2022-10-14 22:39 ` [PATCH v6 1/7] x86: Update and move evex256/512 vec macros Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-10-14 22:39   ` [PATCH v6 4/7] x86: Remove now unused vec header macros Noah Goldstein
@ 2022-10-14 22:39   ` Noah Goldstein
  2022-10-14 22:39   ` [PATCH v6 6/7] x86: Update memset " Noah Goldstein
  2022-10-14 22:39   ` [PATCH v6 7/7] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
  5 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 22:39 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Replace %VEC(n) -> %VMM(n)

This commit does not change libc.so

Tested build on x86-64
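
A short expansion note (derived from the vec headers added earlier in
the series; nothing below is new code in this patch):
```
/* x86-evex512-vecs.h:  %VMM(0) -> %zmm16,  %VMM_256(0) -> %ymm16
   x86-evex256-vecs.h:  %VMM(0) -> %ymm16,  %VMM_256(0) -> %ymm16
   This is what lets L(between_32_63) use %VMM_256(n) in place of the
   previously hard-coded %YMM0/%YMM1.  */
```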
---
 .../memmove-avx-unaligned-erms-rtm.S          |  15 +-
 .../multiarch/memmove-avx-unaligned-erms.S    |   9 +-
 .../multiarch/memmove-avx512-unaligned-erms.S |  30 +-
 .../multiarch/memmove-evex-unaligned-erms.S   |  30 +-
 .../multiarch/memmove-sse2-unaligned-erms.S   |  11 +-
 .../multiarch/memmove-vec-unaligned-erms.S    | 262 +++++++++---------
 6 files changed, 135 insertions(+), 222 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 67a55f0c85..c2a95dc247 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -1,16 +1,9 @@
-#if IS_IN (libc)
-# define VEC_SIZE	32
-# define VEC(i)		ymm##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
-# define MOV_SIZE	4
-# define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+#include <isa-level.h>
 
-# define VZEROUPPER_RETURN jmp	 L(return)
+#if ISA_SHOULD_BUILD (3)
+
+# include "x86-avx-rtm-vecs.h"
 
-# define SECTION(p)		p##.avx.rtm
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm
 
 # include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index a14b155667..4e4b4635f9 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -2,14 +2,7 @@
 
 #if ISA_SHOULD_BUILD (3)
 
-# define VEC_SIZE	32
-# define VEC(i)		ymm##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
-# define MOV_SIZE	4
-
-# define SECTION(p)		p##.avx
+# include "x86-avx-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 8d1568a7ba..cca97e38f8 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -2,35 +2,7 @@
 
 #if ISA_SHOULD_BUILD (4)
 
-# define VEC_SIZE	64
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define VEC0		zmm16
-# define VEC1		zmm17
-# define VEC2		zmm18
-# define VEC3		zmm19
-# define VEC4		zmm20
-# define VEC5		zmm21
-# define VEC6		zmm22
-# define VEC7		zmm23
-# define VEC8		zmm24
-# define VEC9		zmm25
-# define VEC10		zmm26
-# define VEC11		zmm27
-# define VEC12		zmm28
-# define VEC13		zmm29
-# define VEC14		zmm30
-# define VEC15		zmm31
-# define VEC(i)		VEC##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE	6
-
-# define SECTION(p)		p##.evex512
+# include "x86-evex512-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
index 2373017358..1f7b5715f7 100644
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -2,35 +2,7 @@
 
 #if ISA_SHOULD_BUILD (4)
 
-# define VEC_SIZE	32
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define VEC0		ymm16
-# define VEC1		ymm17
-# define VEC2		ymm18
-# define VEC3		ymm19
-# define VEC4		ymm20
-# define VEC5		ymm21
-# define VEC6		ymm22
-# define VEC7		ymm23
-# define VEC8		ymm24
-# define VEC9		ymm25
-# define VEC10		ymm26
-# define VEC11		ymm27
-# define VEC12		ymm28
-# define VEC13		ymm29
-# define VEC14		ymm30
-# define VEC15		ymm31
-# define VEC(i)		VEC##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE	6
-
-# define SECTION(p)		p##.evex
+# include "x86-evex256-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
index 422a079902..8431bcd000 100644
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -22,18 +22,9 @@
    so we need this to build for ISA V2 builds. */
 #if ISA_SHOULD_BUILD (2)
 
-# include <sysdep.h>
+# include "x86-sse2-vecs.h"
 
-# define VEC_SIZE	16
-# define VEC(i)		xmm##i
 # define PREFETCHNT	prefetchnta
-# define VMOVNT		movntdq
-/* Use movups and movaps for smaller code sizes.  */
-# define VMOVU		movups
-# define VMOVA		movaps
-# define MOV_SIZE	3
-
-# define SECTION(p)		p
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 04747133b7..5b758cae5e 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -60,14 +60,6 @@
 # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
 #endif
 
-#ifndef XMM0
-# define XMM0				xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0				ymm0
-#endif
-
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER vzeroupper
@@ -225,13 +217,13 @@ L(start):
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 	/* Load regardless.  */
-	VMOVU	(%rsi), %VEC(0)
+	VMOVU	(%rsi), %VMM(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VMM(1)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi,%rdx)
 #if !(defined USE_MULTIARCH && IS_IN (libc))
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
@@ -270,15 +262,15 @@ L(start_erms):
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 	/* Load regardless.  */
-	VMOVU	(%rsi), %VEC(0)
+	VMOVU	(%rsi), %VMM(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
 	 */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
-L(return):
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rdx)
+L(return_vzeroupper):
 # if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 # else
@@ -359,10 +351,10 @@ L(between_16_31):
 	.p2align 4,, 10
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
-	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi, %rdx), %YMM1
-	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi, %rdx)
+	VMOVU	(%rsi), %VMM_256(0)
+	VMOVU	-32(%rsi, %rdx), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -32(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
@@ -380,12 +372,12 @@ L(last_4x_vec):
 	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
 
 	/* VEC(0) and VEC(1) have already been loaded.  */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VMM(3), -(VEC_SIZE * 2)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4
@@ -400,24 +392,24 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
 	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
 	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(7)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VMM(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4,, 4
@@ -466,14 +458,14 @@ L(more_8x_vec_forward):
 	 */
 
 	/* First vec was already loaded into VEC(0).  */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(5)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
 	/* Save begining of dst.  */
 	movq	%rdi, %rcx
 	/* Align dst to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(7)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(8)
 
 	/* Subtract dst from src. Add back after dst aligned.  */
 	subq	%rcx, %rsi
@@ -488,25 +480,25 @@ L(more_8x_vec_forward):
 	.p2align 4,, 11
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
-	VMOVU	(%rsi), %VEC(1)
-	VMOVU	VEC_SIZE(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
+	VMOVU	(%rsi), %VMM(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
 	subq	$-(VEC_SIZE * 4), %rsi
-	VMOVA	%VEC(1), (%rdi)
-	VMOVA	%VEC(2), VEC_SIZE(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(1), (%rdi)
+	VMOVA	%VMM(2), VEC_SIZE(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(4), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
-	VMOVU	%VEC(7), VEC_SIZE(%rdx)
-	VMOVU	%VEC(8), (%rdx)
+	VMOVU	%VMM(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VMM(7), VEC_SIZE(%rdx)
+	VMOVU	%VMM(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(0), (%rcx)
+	VMOVU	%VMM(0), (%rcx)
 	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
 	 */
 L(nop_backward):
@@ -523,12 +515,12 @@ L(more_8x_vec_backward):
 	   addresses.  */
 
 	/* First vec was also loaded into VEC(0).  */
-	VMOVU	VEC_SIZE(%rsi), %VEC(5)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	VMOVU	VEC_SIZE(%rsi), %VMM(5)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(6)
 	/* Begining of region for 4x backward copy stored in rcx.  */
 	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(8)
 	/* Subtract dst from src. Add back after dst aligned.  */
 	subq	%rdi, %rsi
 	/* Align dst.  */
@@ -540,25 +532,25 @@ L(more_8x_vec_backward):
 	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
-	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VMM(3)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VMM(4)
 	addq	$(VEC_SIZE * -4), %rsi
-	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
-	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
-	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
+	VMOVA	%VMM(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VMM(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA	%VMM(4), (VEC_SIZE * 0)(%rcx)
 	addq	$(VEC_SIZE * -4), %rcx
 	cmpq	%rcx, %rdi
 	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(5), VEC_SIZE(%rdi)
+	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
+	VMOVU	%VMM(8), -VEC_SIZE(%rdx, %rdi)
 	VZEROUPPER_RETURN
 
 #if defined USE_MULTIARCH && IS_IN (libc)
@@ -568,7 +560,7 @@ L(loop_4x_vec_backward):
 # if ALIGN_MOVSB
 L(skip_short_movsb_check):
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  endif
 #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 #   error Unsupported MOVSB_ALIGN_TO
@@ -597,9 +589,9 @@ L(skip_short_movsb_check):
 
 	rep	movsb
 
-	VMOVU	%VEC(0), (%r8)
+	VMOVU	%VMM(0), (%r8)
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	%VEC(1), VEC_SIZE(%r8)
+	VMOVU	%VMM(1), VEC_SIZE(%r8)
 #  endif
 	VZEROUPPER_RETURN
 # endif
@@ -640,7 +632,7 @@ L(movsb):
 # endif
 # if ALIGN_MOVSB
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  endif
 #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 #   error Unsupported MOVSB_ALIGN_TO
@@ -664,9 +656,9 @@ L(movsb_align_dst):
 	rep	movsb
 
 	/* Store VECs loaded for aligning.  */
-	VMOVU	%VEC(0), (%r8)
+	VMOVU	%VMM(0), (%r8)
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	%VEC(1), VEC_SIZE(%r8)
+	VMOVU	%VMM(1), VEC_SIZE(%r8)
 #  endif
 	VZEROUPPER_RETURN
 # else	/* !ALIGN_MOVSB.  */
@@ -701,18 +693,18 @@ L(large_memcpy_2x):
 
 	/* First vec was also loaded into VEC(0).  */
 # if VEC_SIZE < 64
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  if VEC_SIZE < 32
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 #  endif
 # endif
-	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VMM(0), (%rdi)
 # if VEC_SIZE < 64
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
 #  if VEC_SIZE < 32
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
 #  endif
 # endif
 
@@ -761,12 +753,12 @@ L(loop_large_memcpy_2x_inner):
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
 	/* Load vectors from rsi.  */
-	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 	subq	$-LARGE_LOAD_SIZE, %rsi
 	/* Non-temporal store vectors to rdi.  */
-	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 	subq	$-LARGE_LOAD_SIZE, %rdi
 	decl	%ecx
 	jnz	L(loop_large_memcpy_2x_inner)
@@ -785,31 +777,31 @@ L(loop_large_memcpy_2x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VMM(0)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 	subq	$-(VEC_SIZE * 4), %rsi
 	addl	$-(VEC_SIZE * 4), %edx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(0), (%rdi)
+	VMOVA	%VMM(1), VEC_SIZE(%rdi)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpl	$(VEC_SIZE * 4), %edx
 	ja	L(loop_large_memcpy_2x_tail)
 
 L(large_memcpy_2x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
-
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
-	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)
+
+	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4
@@ -831,16 +823,16 @@ L(loop_large_memcpy_4x_inner):
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
 	/* Load vectors from rsi.  */
-	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
 	subq	$-LARGE_LOAD_SIZE, %rsi
 	/* Non-temporal store vectors to rdi.  */
-	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
-	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
-	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
 	subq	$-LARGE_LOAD_SIZE, %rdi
 	decl	%ecx
 	jnz	L(loop_large_memcpy_4x_inner)
@@ -858,31 +850,31 @@ L(loop_large_memcpy_4x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VMM(0)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 	subq	$-(VEC_SIZE * 4), %rsi
 	addl	$-(VEC_SIZE * 4), %edx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(0), (%rdi)
+	VMOVA	%VMM(1), VEC_SIZE(%rdi)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpl	$(VEC_SIZE * 4), %edx
 	ja	L(loop_large_memcpy_4x_tail)
 
 L(large_memcpy_4x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
-
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
-	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)
+
+	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v6 6/7] x86: Update memset to use new VEC macros
  2022-10-14 22:39 ` [PATCH v6 1/7] x86: Update and move evex256/512 vec macros Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-10-14 22:39   ` [PATCH v6 5/7] x86: Update memmove to use new VEC macros Noah Goldstein
@ 2022-10-14 22:39   ` Noah Goldstein
  2022-10-14 22:39   ` [PATCH v6 7/7] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
  5 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 22:39 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Replace %VEC(n) -> %VMM(n)

This commit does not change libc.so

Tested build on x86-64
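
For illustration only (not something this patch adds): assuming the
register assignments from the x86-*-vecs.h headers introduced earlier
in this series (zmm16-and-up for the EVEX variants, ymm0/xmm0 for
AVX2/SSE2), the same source line is expected to expand roughly as:
```
	VMOVU	%VMM(0), (%rdi)
	/* x86-evex512-vecs.h (VEC_SIZE == 64): vmovdqu64 %zmm16, (%rdi)  */
	/* x86-avx-vecs.h     (VEC_SIZE == 32): vmovdqu   %ymm0, (%rdi)  */
	/* x86-sse2-vecs.h    (VEC_SIZE == 16): movups    %xmm0, (%rdi)  */
```
The VMM_256(n) / VMM_128(n) forms used in the small-size paths are
assumed to be the 256/128-bit aliases of the same register (e.g.
%ymm16 / %xmm16 in the evex512 build), replacing the old per-file
YMM0 / XMM0 defines.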
---
 .../memset-avx2-unaligned-erms-rtm.S          |  8 +--
 .../multiarch/memset-avx2-unaligned-erms.S    | 14 +---
 .../multiarch/memset-avx512-unaligned-erms.S  | 20 +-----
 .../multiarch/memset-evex-unaligned-erms.S    | 20 +-----
 .../multiarch/memset-sse2-unaligned-erms.S    | 10 +--
 .../multiarch/memset-vec-unaligned-erms.S     | 70 ++++++++-----------
 6 files changed, 43 insertions(+), 99 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
index 8ac3e479bb..bc8605faf3 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -1,10 +1,6 @@
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+#include "x86-avx-rtm-vecs.h"
 
-#define VZEROUPPER_RETURN jmp	 L(return)
-
-#define SECTION(p) p##.avx.rtm
 #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 
-#include "memset-avx2-unaligned-erms.S"
+# include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index a9054a9122..47cf5072a4 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -4,14 +4,9 @@
 
 # define USE_WITH_AVX2	1
 
-# define VEC_SIZE	32
-# define MOV_SIZE	4
-# define RET_SIZE	4
-
-# define VEC(i)		ymm##i
-
-# define VMOVU     vmovdqu
-# define VMOVA     vmovdqa
+# ifndef VEC_SIZE
+#  include "x86-avx-vecs.h"
+# endif
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
@@ -26,9 +21,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
 # define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
-# ifndef SECTION
-#  define SECTION(p)		p##.avx
-# endif
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 47623b8ee8..84145b6c27 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -4,26 +4,14 @@
 
 # define USE_WITH_AVX512	1
 
-# define VEC_SIZE	64
-# define MOV_SIZE	6
-# define RET_SIZE	1
-
-# define XMM0		xmm16
-# define YMM0		ymm16
-# define VEC0		zmm16
-# define VEC(i)		VEC##i
-
-# define VMOVU     vmovdqu64
-# define VMOVA     vmovdqa64
-
-# define VZEROUPPER
+# include "x86-evex512-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastb d, %VEC0; \
+  vpbroadcastb d, %VMM(0); \
   movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastd d, %VEC0; \
+  vpbroadcastd d, %VMM(0); \
   movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -32,8 +20,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p##.evex512
-
 #ifndef MEMSET_SYMBOL
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index ac4b2d2d50..1f03b26bf8 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -4,26 +4,14 @@
 
 # define USE_WITH_EVEX	1
 
-# define VEC_SIZE	32
-# define MOV_SIZE	6
-# define RET_SIZE	1
-
-# define XMM0		xmm16
-# define YMM0		ymm16
-# define VEC0		ymm16
-# define VEC(i)		VEC##i
-
-# define VMOVU     vmovdqu64
-# define VMOVA     vmovdqa64
-
-# define VZEROUPPER
+# include "x86-evex256-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastb d, %VEC0; \
+  vpbroadcastb d, %VMM(0); \
   movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastd d, %VEC0; \
+  vpbroadcastd d, %VMM(0); \
   movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -32,8 +20,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p##.evex
-
 #ifndef MEMSET_SYMBOL
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 44f9b8888b..34b245d8ca 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -26,13 +26,7 @@
 # include <sysdep.h>
 # define USE_WITH_SSE2	1
 
-# define VEC_SIZE	16
-# define MOV_SIZE	3
-# define RET_SIZE	1
-
-# define VEC(i)		xmm##i
-# define VMOVU     movups
-# define VMOVA     movaps
+# include "x86-sse2-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
@@ -52,8 +46,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p
-
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_sse2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 905d0fa464..03de0ab907 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,14 +34,6 @@
 # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
 #endif
 
-#ifndef XMM0
-# define XMM0				xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0				ymm0
-#endif
-
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
@@ -150,8 +142,8 @@ L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VMM(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VMM(0), (%rdi)
 	VZEROUPPER_RETURN
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
@@ -175,19 +167,19 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
-	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -2)(%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi)
 #endif
 	VZEROUPPER_RETURN
 
@@ -221,7 +213,7 @@ L(less_vec_from_wmemset):
 	bzhil	%edx, %ecx, %ecx
 	kmovd	%ecx, %k1
 # endif
-	vmovdqu8 %VEC(0), (%rax){%k1}
+	vmovdqu8 %VMM(0), (%rax){%k1}
 	VZEROUPPER_RETURN
 
 # if defined USE_MULTIARCH && IS_IN (libc)
@@ -249,8 +241,8 @@ L(stosb_more_2x_vec):
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
 	/* Store next 2x vec regardless.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * 1)(%rdi)
 
 
 	/* Two different methods of setting up pointers / compare. The two
@@ -278,8 +270,8 @@ L(more_2x_vec):
 #endif
 
 	/* Store next 2x vec regardless.  */
-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rax)
+	VMOVU	%VMM(0), (VEC_SIZE * 3)(%rax)
 
 
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
@@ -304,20 +296,20 @@ L(more_2x_vec):
 	andq	$(VEC_SIZE * -2), %LOOP_REG
 	.p2align 4
 L(loop):
-	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
 	subq	$-(VEC_SIZE * 4), %LOOP_REG
 	cmpq	%END_REG, %LOOP_REG
 	jb	L(loop)
 	.p2align 4,, MOV_SIZE
 L(last_4x_vec):
-	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
-L(return):
+	VMOVU	%VMM(0), LOOP_4X_OFFSET(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return_vzeroupper):
 #if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
@@ -355,7 +347,7 @@ L(cross_page):
 	jge	L(between_16_31)
 #endif
 #ifndef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, %SET_REG64
+	MOVQ	%VMM_128(0), %SET_REG64
 #endif
 	cmpl	$8, %edx
 	jge	L(between_8_15)
@@ -374,8 +366,8 @@ L(between_0_0):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%LESS_VEC_REG)
-	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
+	VMOVU	%VMM_256(0), (%LESS_VEC_REG)
+	VMOVU	%VMM_256(0), -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
@@ -383,8 +375,8 @@ L(between_32_63):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%LESS_VEC_REG)
-	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	VMOVU	%VMM_128(0), (%LESS_VEC_REG)
+	VMOVU	%VMM_128(0), -16(%LESS_VEC_REG, %rdx)
 	ret
 #endif
 
@@ -394,8 +386,8 @@ L(between_16_31):
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, (%rdi)
-	MOVQ	%XMM0, -8(%rdi, %rdx)
+	MOVQ	%VMM_128(0), (%rdi)
+	MOVQ	%VMM_128(0), -8(%rdi, %rdx)
 #else
 	movq	%SET_REG64, (%LESS_VEC_REG)
 	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
@@ -408,8 +400,8 @@ L(between_8_15):
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVD	%XMM0, (%rdi)
-	MOVD	%XMM0, -4(%rdi, %rdx)
+	MOVD	%VMM_128(0), (%rdi)
+	MOVD	%VMM_128(0), -4(%rdi, %rdx)
 #else
 	movl	%SET_REG32, (%LESS_VEC_REG)
 	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v6 7/7] x86: Update strlen-evex-base to use new reg/vec macros.
  2022-10-14 22:39 ` [PATCH v6 1/7] x86: Update and move evex256/512 vec macros Noah Goldstein
                     ` (4 preceding siblings ...)
  2022-10-14 22:39   ` [PATCH v6 6/7] x86: Update memset " Noah Goldstein
@ 2022-10-14 22:39   ` Noah Goldstein
  5 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 22:39 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

To avoid duplicating the VMM / GPR / mask insn macros in all incoming
evex512 files, use the macros defined in 'reg-macros.h' and
'{vec}-macros.h'.

This commit does not change libc.so

Tested build on x86-64
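
A rough sketch of what this gives (assuming REG_WIDTH defaults to
VEC_SIZE, as in the reg-macros.h quoted in full later in this thread):
the same source picks the matching GPR and mask-insn widths from
whichever vec header is included:
```
	/* Built with x86-evex512-vecs.h (VEC_SIZE == 64, REG_WIDTH == 64):  */
	VPCMP	$0, (%rdi), %VMM(0), %k0
	KMOV	%k0, %VRAX	/* kmovq %k0, %rax  */
	test	%VRAX, %VRAX	/* test  %rax, %rax  */
```
Built with the evex256 header instead, the same lines would produce
kmovd %k0, %eax / test %eax, %eax.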
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 116 +++++++-------------
 sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
 2 files changed, 44 insertions(+), 76 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 418e9f8411..c832b15a48 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -36,42 +36,10 @@
 #  define CHAR_SIZE	1
 # endif
 
-# define XMM0		xmm16
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# if VEC_SIZE == 64
-#  define KMOV		kmovq
-#  define KORTEST	kortestq
-#  define RAX		rax
-#  define RCX		rcx
-#  define RDX		rdx
-#  define SHR		shrq
-#  define TEXTSUFFIX	evex512
-#  define VMM0		zmm16
-#  define VMM1		zmm17
-#  define VMM2		zmm18
-#  define VMM3		zmm19
-#  define VMM4		zmm20
-#  define VMOVA		vmovdqa64
-# elif VEC_SIZE == 32
-/* Currently Unused.  */
-#  define KMOV		kmovd
-#  define KORTEST	kortestd
-#  define RAX		eax
-#  define RCX		ecx
-#  define RDX		edx
-#  define SHR		shrl
-#  define TEXTSUFFIX	evex256
-#  define VMM0		ymm16
-#  define VMM1		ymm17
-#  define VMM2		ymm18
-#  define VMM3		ymm19
-#  define VMM4		ymm20
-#  define VMOVA		vmovdqa32
-# endif
-
-	.section .text.TEXTSUFFIX, "ax", @progbits
+	.section SECTION(.text),"ax",@progbits
 /* Aligning entry point to 64 byte, provides better performance for
    one vector length string.  */
 ENTRY_P2ALIGN (STRLEN, 6)
@@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
 # endif
 
 	movl	%edi, %eax
-	vpxorq	%XMM0, %XMM0, %XMM0
+	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(page_cross)
 
 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM0, %k0
-	KMOV	%k0, %RAX
-	test	%RAX, %RAX
+	VPCMP	$0, (%rdi), %VMM(0), %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
@@ -120,7 +88,7 @@ L(align_more):
 	movq	%rax, %rdx
 	subq	%rdi, %rdx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RDX
+	shr	$2, %VRDX
 #  endif
 	/* At this point rdx contains [w]chars already compared.  */
 	subq	%rsi, %rdx
@@ -131,9 +99,9 @@ L(align_more):
 # endif
 
 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
 # ifdef USE_AS_STRNLEN
@@ -141,9 +109,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, VEC_SIZE(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
 # ifdef USE_AS_STRNLEN
@@ -151,9 +119,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 
 # ifdef USE_AS_STRNLEN
@@ -161,9 +129,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x4)
 
 # ifdef USE_AS_STRNLEN
@@ -179,7 +147,7 @@ L(align_more):
 # ifdef USE_AS_STRNLEN
 	subq	%rax, %rcx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RCX
+	shr	$2, %VRCX
 #  endif
 	/* rcx contains number of [w]char will be recompared due to
 	   alignment fixes.  rdx must be incremented by rcx to offset
@@ -199,42 +167,42 @@ L(loop_entry):
 # endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
 
-	VPTESTN	%VMM2, %VMM2, %k0
-	VPTESTN	%VMM4, %VMM4, %k1
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k1
 
 	subq	$-(VEC_SIZE * 4), %rax
 	KORTEST	%k0, %k1
 	jz	L(loop)
 
-	VPTESTN	%VMM1, %VMM1, %k2
-	KMOV	%k2, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VMM(1), %VMM(1), %k2
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
-	KMOV	%k0, %RCX
+	KMOV	%k0, %VRCX
 	/* At this point, if k0 is non zero, null char must be in the
 	   second vector.  */
-	test	%RCX, %RCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
-	VPTESTN	%VMM3, %VMM3, %k3
-	KMOV	%k3, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VMM(3), %VMM(3), %k3
+	KMOV	%k3, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 	/* At this point null [w]char must be in the fourth vector so no
 	   need to check.  */
-	KMOV	%k1, %RCX
+	KMOV	%k1, %VRCX
 
 	/* Fourth, third, second vector terminating are pretty much
 	   same, implemented this way to avoid branching and reuse code
 	   from pre loop exit condition.  */
 L(ret_vec_x4):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 3), %rax
@@ -250,7 +218,7 @@ L(ret_vec_x4):
 	ret
 
 L(ret_vec_x3):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 2), %rax
@@ -268,7 +236,7 @@ L(ret_vec_x3):
 L(ret_vec_x2):
 	subq	$-VEC_SIZE, %rax
 L(ret_vec_x1):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
@@ -289,13 +257,13 @@ L(page_cross):
 	/* ecx contains number of w[char] to be skipped as a result
 	   of address alignment.  */
 	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
-	KMOV	%k0, %RAX
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRAX
 	/* Ignore number of character for alignment adjustment.  */
-	SHR	%cl, %RAX
+	shr	%cl, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
index 116f8981c8..10c3415c8a 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -2,6 +2,6 @@
 # define STRLEN		__strlen_evex512
 #endif
 
-#define VEC_SIZE	64
-
+#include "x86-evex512-vecs.h"
+#include "reg-macros.h"
 #include "strlen-evex-base.S"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v5 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 22:27           ` Noah Goldstein
@ 2022-10-14 22:41             ` H.J. Lu
  2022-10-14 23:15               ` Noah Goldstein
  0 siblings, 1 reply; 72+ messages in thread
From: H.J. Lu @ 2022-10-14 22:41 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 3:27 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Oct 14, 2022 at 5:06 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> >  On Fri, Oct 14, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Fri, Oct 14, 2022 at 4:28 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Fri, Oct 14, 2022 at 2:15 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > This is to make it easier to do think like:
> > > > > ```
> > > > > vpcmpb %VEC(0), %VEC(1), %k0
> > > > > kmov{d|q} %k0, %{eax|rax}
> > > > > test %{eax|rax}
> > > > > ```
> > > > >
> > > > > It adds macro s.t any GPR can get the proper width with:
> > > > >     `V{upper_case_GPR_name}`
> > > > >
> > > > > and any mask insn can get the proper width with:
> > > > >     `{mask_insn_without_postfix}V`
> > > > >
> > > > > This commit does not change libc.so
> > > > >
> > > > > Tested build on x86-64
> > > > > ---
> > > > >  sysdeps/x86_64/multiarch/reg-macros.h         | 166 ++++++++++++++++++
> > > > >  .../multiarch/scripts/gen-reg-macros.py       | 123 +++++++++++++
> > > > >  2 files changed, 289 insertions(+)
> > > > >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> > > > >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > >
> > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > new file mode 100644
> > > > > index 0000000000..16168b6fda
> > > > > --- /dev/null
> > > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > @@ -0,0 +1,166 @@
> > > > > +/* This file was generated by: gen-reg-macros.py.
> > > > > +
> > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > +   This file is part of the GNU C Library.
> > > > > +
> > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > +   License as published by the Free Software Foundation; either
> > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > +
> > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > +   Lesser General Public License for more details.
> > > > > +
> > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > +   License along with the GNU C Library; if not, see
> > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > +
> > > > > +#ifndef _REG_MACROS_H
> > > > > +#define _REG_MACROS_H  1
> > > > > +
> > > > > +#define rax_8  al
> > > > > +#define rax_16 ax
> > > > > +#define rax_32 eax
> > > > > +#define rax_64 rax
> > > > > +#define rbx_8  bl
> > > > > +#define rbx_16 bx
> > > > > +#define rbx_32 ebx
> > > > > +#define rbx_64 rbx
> > > > > +#define rcx_8  cl
> > > > > +#define rcx_16 cx
> > > > > +#define rcx_32 ecx
> > > > > +#define rcx_64 rcx
> > > > > +#define rdx_8  dl
> > > > > +#define rdx_16 dx
> > > > > +#define rdx_32 edx
> > > > > +#define rdx_64 rdx
> > > > > +#define rbp_8  bpl
> > > > > +#define rbp_16 bp
> > > > > +#define rbp_32 ebp
> > > > > +#define rbp_64 rbp
> > > > > +#define rsp_8  spl
> > > > > +#define rsp_16 sp
> > > > > +#define rsp_32 esp
> > > > > +#define rsp_64 rsp
> > > > > +#define rsi_8  sil
> > > > > +#define rsi_16 si
> > > > > +#define rsi_32 esi
> > > > > +#define rsi_64 rsi
> > > > > +#define rdi_8  dil
> > > > > +#define rdi_16 di
> > > > > +#define rdi_32 edi
> > > > > +#define rdi_64 rdi
> > > > > +#define r8_8   r8b
> > > > > +#define r8_16  r8w
> > > > > +#define r8_32  r8d
> > > > > +#define r8_64  r8
> > > > > +#define r9_8   r9b
> > > > > +#define r9_16  r9w
> > > > > +#define r9_32  r9d
> > > > > +#define r9_64  r9
> > > > > +#define r10_8  r10b
> > > > > +#define r10_16 r10w
> > > > > +#define r10_32 r10d
> > > > > +#define r10_64 r10
> > > > > +#define r11_8  r11b
> > > > > +#define r11_16 r11w
> > > > > +#define r11_32 r11d
> > > > > +#define r11_64 r11
> > > > > +#define r12_8  r12b
> > > > > +#define r12_16 r12w
> > > > > +#define r12_32 r12d
> > > > > +#define r12_64 r12
> > > > > +#define r13_8  r13b
> > > > > +#define r13_16 r13w
> > > > > +#define r13_32 r13d
> > > > > +#define r13_64 r13
> > > > > +#define r14_8  r14b
> > > > > +#define r14_16 r14w
> > > > > +#define r14_32 r14d
> > > > > +#define r14_64 r14
> > > > > +#define r15_8  r15b
> > > > > +#define r15_16 r15w
> > > > > +#define r15_32 r15d
> > > > > +#define r15_64 r15
> > > > > +
> > > > > +#define kmov_8 kmovb
> > > > > +#define kmov_16        kmovw
> > > > > +#define kmov_32        kmovd
> > > > > +#define kmov_64        kmovq
> > > > > +#define kortest_8      kortestb
> > > > > +#define kortest_16     kortestw
> > > > > +#define kortest_32     kortestd
> > > > > +#define kortest_64     kortestq
> > > > > +#define kor_8  korb
> > > > > +#define kor_16 korw
> > > > > +#define kor_32 kord
> > > > > +#define kor_64 korq
> > > > > +#define ktest_8        ktestb
> > > > > +#define ktest_16       ktestw
> > > > > +#define ktest_32       ktestd
> > > > > +#define ktest_64       ktestq
> > > > > +#define kand_8 kandb
> > > > > +#define kand_16        kandw
> > > > > +#define kand_32        kandd
> > > > > +#define kand_64        kandq
> > > > > +#define kxor_8 kxorb
> > > > > +#define kxor_16        kxorw
> > > > > +#define kxor_32        kxord
> > > > > +#define kxor_64        kxorq
> > > > > +#define knot_8 knotb
> > > > > +#define knot_16        knotw
> > > > > +#define knot_32        knotd
> > > > > +#define knot_64        knotq
> > > > > +#define kxnor_8        kxnorb
> > > > > +#define kxnor_16       kxnorw
> > > > > +#define kxnor_32       kxnord
> > > > > +#define kxnor_64       kxnorq
> > > > > +#define kunpack_8      kunpackbw
> > > > > +#define kunpack_16     kunpackwd
> > > > > +#define kunpack_32     kunpackdq
> > > > > +
> > > > > +/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
> > > > > +#define VRAX   VGPR(rax)
> > > > > +#define VRBX   VGPR(rbx)
> > > > > +#define VRCX   VGPR(rcx)
> > > > > +#define VRDX   VGPR(rdx)
> > > > > +#define VRBP   VGPR(rbp)
> > > > > +#define VRSP   VGPR(rsp)
> > > > > +#define VRSI   VGPR(rsi)
> > > > > +#define VRDI   VGPR(rdi)
> > > > > +#define VR8    VGPR(r8)
> > > > > +#define VR9    VGPR(r9)
> > > > > +#define VR10   VGPR(r10)
> > > > > +#define VR11   VGPR(r11)
> > > > > +#define VR12   VGPR(r12)
> > > > > +#define VR13   VGPR(r13)
> > > > > +#define VR14   VGPR(r14)
> > > > > +#define VR15   VGPR(r15)
> > > > > +
> > > > > +/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
> > > > > +#define KMOV   VKINSN(kmov)
> > > > > +#define KORTEST        VKINSN(kortest)
> > > > > +#define KOR    VKINSN(kor)
> > > > > +#define KTEST  VKINSN(ktest)
> > > > > +#define KAND   VKINSN(kand)
> > > > > +#define KXOR   VKINSN(kxor)
> > > > > +#define KNOT   VKINSN(knot)
> > > > > +#define KXNOR  VKINSN(kxnor)
> > > > > +#define KUNPACK        VKINSN(kunpack)
> > > > > +
> > > > > +#ifndef REG_WIDTH
> > > > > +# define REG_WIDTH VEC_SIZE
> > > > > +#endif
> > > >
> > > > Which files will define REG_WIDTH?  What values will it be for
> > > > YMM and ZMM vectors?
> > >
> > > for non-wide char evex or avx2/sse2 REG_WIDTH = VEC_SIZE
> > > so for YMM REG_WIDTH = 32, for ZMM REG_WIDTH = 64.
> > >
> > > For wchar impls REG_WIDTH will often be 32 irrelivant of YMM/ZMM.
> >
> > Then we should have
> >
> > #ifdef USE_WIDE_CHAR
> > # define REG_WIDTH 32
> > #else
> > # define REG_WIDTH VEC_SIZE
> > #endif
> >
>
> It may not be universal. It may be that some wide-char impls will want
> REG_WIDTH == 8/16 if they rely heavily on `inc` to do zero test or

I think we can define a macro for it if needed.

> for some reason or another uses the full VEC_SIZE (as wcslen-evex512
> currently does).

Will REG_WIDTH == 32 work for wcslen-evex512?

> Also don't really see what it saves to give up the granularity.
> Either way to specify a seperate reg width the wchar impl will
> need to define something else. Seems reasonable for that
> something else to just be REG_WIDTH directly as opposed to
> USE_WIDE_CHAR.
>
> What do you think?
> > > >
> > > > > +#define VPASTER(x, y)  x##_##y
> > > > > +#define VEVALUATOR(x, y)       VPASTER(x, y)
> > > > > +
> > > > > +#define VGPR_SZ(reg_name, reg_size)    VEVALUATOR(reg_name, reg_size)
> > > > > +#define VKINSN_SZ(insn, reg_size)      VEVALUATOR(insn, reg_size)
> > > > > +
> > > > > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> > > > > +#define VKINSN(mask_insn)      VKINSN_SZ(mask_insn, REG_WIDTH)
> > > > > +
> > > > > +#endif
> > > > > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > > new file mode 100644
> > > > > index 0000000000..c7296a8104
> > > > > --- /dev/null
> > > > > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > > @@ -0,0 +1,123 @@
> > > > > +#!/usr/bin/python3
> > > > > +# Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > +# This file is part of the GNU C Library.
> > > > > +#
> > > > > +# The GNU C Library is free software; you can redistribute it and/or
> > > > > +# modify it under the terms of the GNU Lesser General Public
> > > > > +# License as published by the Free Software Foundation; either
> > > > > +# version 2.1 of the License, or (at your option) any later version.
> > > > > +#
> > > > > +# The GNU C Library is distributed in the hope that it will be useful,
> > > > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > +# Lesser General Public License for more details.
> > > > > +#
> > > > > +# You should have received a copy of the GNU Lesser General Public
> > > > > +# License along with the GNU C Library; if not, see
> > > > > +# <https://www.gnu.org/licenses/>.
> > > > > +"""Generate macros for getting GPR name of a certain size
> > > > > +
> > > > > +Inputs: None
> > > > > +Output: Prints header fill to stdout
> > > > > +
> > > > > +API:
> > > > > +    VGPR(reg_name)
> > > > > +        - Get register name VEC_SIZE component of `reg_name`
> > > > > +    VGPR_SZ(reg_name, reg_size)
> > > > > +        - Get register name `reg_size` component of `reg_name`
> > > > > +"""
> > > > > +
> > > > > +import sys
> > > > > +import os
> > > > > +from datetime import datetime
> > > > > +
> > > > > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> > > > > +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> > > > > +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> > > > > +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> > > > > +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> > > > > +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> > > > > +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> > > > > +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> > > > > +
> > > > > +mask_insns = [
> > > > > +    "kmov",
> > > > > +    "kortest",
> > > > > +    "kor",
> > > > > +    "ktest",
> > > > > +    "kand",
> > > > > +    "kxor",
> > > > > +    "knot",
> > > > > +    "kxnor",
> > > > > +]
> > > > > +mask_insns_ext = ["b", "w", "d", "q"]
> > > > > +
> > > > > +cr = """
> > > > > +   Copyright (C) {} Free Software Foundation, Inc.
> > > > > +   This file is part of the GNU C Library.
> > > > > +
> > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > +   License as published by the Free Software Foundation; either
> > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > +
> > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > +   Lesser General Public License for more details.
> > > > > +
> > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > +   License along with the GNU C Library; if not, see
> > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > +"""
> > > > > +
> > > > > +print("/* This file was generated by: {}.".format(os.path.basename(
> > > > > +    sys.argv[0])))
> > > > > +print(cr.format(datetime.today().year))
> > > > > +
> > > > > +print("#ifndef _REG_MACROS_H")
> > > > > +print("#define _REG_MACROS_H\t1")
> > > > > +print("")
> > > > > +for reg in registers:
> > > > > +    for i in range(0, 4):
> > > > > +        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
> > > > > +
> > > > > +print("")
> > > > > +for mask_insn in mask_insns:
> > > > > +    for i in range(0, 4):
> > > > > +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> > > > > +                                           mask_insns_ext[i]))
> > > > > +for i in range(0, 3):
> > > > > +    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
> > > > > +                                                   mask_insns_ext[i + 1]))
> > > > > +mask_insns.append("kunpack")
> > > > > +
> > > > > +print("")
> > > > > +print(
> > > > > +    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
> > > > > +for reg in registers:
> > > > > +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> > > > > +
> > > > > +print("")
> > > > > +
> > > > > +print(
> > > > > +    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
> > > > > +)
> > > > > +for mask_insn in mask_insns:
> > > > > +    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
> > > > > +print("")
> > > > > +
> > > > > +print("#ifndef REG_WIDTH")
> > > > > +print("# define REG_WIDTH VEC_SIZE")
> > > > > +print("#endif")
> > > > > +print("")
> > > > > +print("#define VPASTER(x, y)\tx##_##y")
> > > > > +print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
> > > > > +print("")
> > > > > +print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
> > > > > +print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
> > > > > +print("")
> > > > > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> > > > > +print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
> > > > > +
> > > > > +print("\n#endif")
> > > > > --
> > > > > 2.34.1
> > > > >
> > > >
> > > >
> > > > --
> > > > H.J.
> >
> >
> >
> > --
> > H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v5 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 22:41             ` H.J. Lu
@ 2022-10-14 23:15               ` Noah Goldstein
  2022-10-14 23:22                 ` H.J. Lu
  0 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 23:15 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 5:41 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Oct 14, 2022 at 3:27 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Oct 14, 2022 at 5:06 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > >  On Fri, Oct 14, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Fri, Oct 14, 2022 at 4:28 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Fri, Oct 14, 2022 at 2:15 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > This is to make it easier to do think like:
> > > > > > ```
> > > > > > vpcmpb %VEC(0), %VEC(1), %k0
> > > > > > kmov{d|q} %k0, %{eax|rax}
> > > > > > test %{eax|rax}
> > > > > > ```
> > > > > >
> > > > > > It adds macro s.t any GPR can get the proper width with:
> > > > > >     `V{upper_case_GPR_name}`
> > > > > >
> > > > > > and any mask insn can get the proper width with:
> > > > > >     `{mask_insn_without_postfix}V`
> > > > > >
> > > > > > This commit does not change libc.so
> > > > > >
> > > > > > Tested build on x86-64
> > > > > > ---
> > > > > >  sysdeps/x86_64/multiarch/reg-macros.h         | 166 ++++++++++++++++++
> > > > > >  .../multiarch/scripts/gen-reg-macros.py       | 123 +++++++++++++
> > > > > >  2 files changed, 289 insertions(+)
> > > > > >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> > > > > >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > > >
> > > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > new file mode 100644
> > > > > > index 0000000000..16168b6fda
> > > > > > --- /dev/null
> > > > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > @@ -0,0 +1,166 @@
> > > > > > +/* This file was generated by: gen-reg-macros.py.
> > > > > > +
> > > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > +   This file is part of the GNU C Library.
> > > > > > +
> > > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > > +   License as published by the Free Software Foundation; either
> > > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > > +
> > > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > +   Lesser General Public License for more details.
> > > > > > +
> > > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > > +   License along with the GNU C Library; if not, see
> > > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > > +
> > > > > > +#ifndef _REG_MACROS_H
> > > > > > +#define _REG_MACROS_H  1
> > > > > > +
> > > > > > +#define rax_8  al
> > > > > > +#define rax_16 ax
> > > > > > +#define rax_32 eax
> > > > > > +#define rax_64 rax
> > > > > > +#define rbx_8  bl
> > > > > > +#define rbx_16 bx
> > > > > > +#define rbx_32 ebx
> > > > > > +#define rbx_64 rbx
> > > > > > +#define rcx_8  cl
> > > > > > +#define rcx_16 cx
> > > > > > +#define rcx_32 ecx
> > > > > > +#define rcx_64 rcx
> > > > > > +#define rdx_8  dl
> > > > > > +#define rdx_16 dx
> > > > > > +#define rdx_32 edx
> > > > > > +#define rdx_64 rdx
> > > > > > +#define rbp_8  bpl
> > > > > > +#define rbp_16 bp
> > > > > > +#define rbp_32 ebp
> > > > > > +#define rbp_64 rbp
> > > > > > +#define rsp_8  spl
> > > > > > +#define rsp_16 sp
> > > > > > +#define rsp_32 esp
> > > > > > +#define rsp_64 rsp
> > > > > > +#define rsi_8  sil
> > > > > > +#define rsi_16 si
> > > > > > +#define rsi_32 esi
> > > > > > +#define rsi_64 rsi
> > > > > > +#define rdi_8  dil
> > > > > > +#define rdi_16 di
> > > > > > +#define rdi_32 edi
> > > > > > +#define rdi_64 rdi
> > > > > > +#define r8_8   r8b
> > > > > > +#define r8_16  r8w
> > > > > > +#define r8_32  r8d
> > > > > > +#define r8_64  r8
> > > > > > +#define r9_8   r9b
> > > > > > +#define r9_16  r9w
> > > > > > +#define r9_32  r9d
> > > > > > +#define r9_64  r9
> > > > > > +#define r10_8  r10b
> > > > > > +#define r10_16 r10w
> > > > > > +#define r10_32 r10d
> > > > > > +#define r10_64 r10
> > > > > > +#define r11_8  r11b
> > > > > > +#define r11_16 r11w
> > > > > > +#define r11_32 r11d
> > > > > > +#define r11_64 r11
> > > > > > +#define r12_8  r12b
> > > > > > +#define r12_16 r12w
> > > > > > +#define r12_32 r12d
> > > > > > +#define r12_64 r12
> > > > > > +#define r13_8  r13b
> > > > > > +#define r13_16 r13w
> > > > > > +#define r13_32 r13d
> > > > > > +#define r13_64 r13
> > > > > > +#define r14_8  r14b
> > > > > > +#define r14_16 r14w
> > > > > > +#define r14_32 r14d
> > > > > > +#define r14_64 r14
> > > > > > +#define r15_8  r15b
> > > > > > +#define r15_16 r15w
> > > > > > +#define r15_32 r15d
> > > > > > +#define r15_64 r15
> > > > > > +
> > > > > > +#define kmov_8 kmovb
> > > > > > +#define kmov_16        kmovw
> > > > > > +#define kmov_32        kmovd
> > > > > > +#define kmov_64        kmovq
> > > > > > +#define kortest_8      kortestb
> > > > > > +#define kortest_16     kortestw
> > > > > > +#define kortest_32     kortestd
> > > > > > +#define kortest_64     kortestq
> > > > > > +#define kor_8  korb
> > > > > > +#define kor_16 korw
> > > > > > +#define kor_32 kord
> > > > > > +#define kor_64 korq
> > > > > > +#define ktest_8        ktestb
> > > > > > +#define ktest_16       ktestw
> > > > > > +#define ktest_32       ktestd
> > > > > > +#define ktest_64       ktestq
> > > > > > +#define kand_8 kandb
> > > > > > +#define kand_16        kandw
> > > > > > +#define kand_32        kandd
> > > > > > +#define kand_64        kandq
> > > > > > +#define kxor_8 kxorb
> > > > > > +#define kxor_16        kxorw
> > > > > > +#define kxor_32        kxord
> > > > > > +#define kxor_64        kxorq
> > > > > > +#define knot_8 knotb
> > > > > > +#define knot_16        knotw
> > > > > > +#define knot_32        knotd
> > > > > > +#define knot_64        knotq
> > > > > > +#define kxnor_8        kxnorb
> > > > > > +#define kxnor_16       kxnorw
> > > > > > +#define kxnor_32       kxnord
> > > > > > +#define kxnor_64       kxnorq
> > > > > > +#define kunpack_8      kunpackbw
> > > > > > +#define kunpack_16     kunpackwd
> > > > > > +#define kunpack_32     kunpackdq
> > > > > > +
> > > > > > +/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
> > > > > > +#define VRAX   VGPR(rax)
> > > > > > +#define VRBX   VGPR(rbx)
> > > > > > +#define VRCX   VGPR(rcx)
> > > > > > +#define VRDX   VGPR(rdx)
> > > > > > +#define VRBP   VGPR(rbp)
> > > > > > +#define VRSP   VGPR(rsp)
> > > > > > +#define VRSI   VGPR(rsi)
> > > > > > +#define VRDI   VGPR(rdi)
> > > > > > +#define VR8    VGPR(r8)
> > > > > > +#define VR9    VGPR(r9)
> > > > > > +#define VR10   VGPR(r10)
> > > > > > +#define VR11   VGPR(r11)
> > > > > > +#define VR12   VGPR(r12)
> > > > > > +#define VR13   VGPR(r13)
> > > > > > +#define VR14   VGPR(r14)
> > > > > > +#define VR15   VGPR(r15)
> > > > > > +
> > > > > > +/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
> > > > > > +#define KMOV   VKINSN(kmov)
> > > > > > +#define KORTEST        VKINSN(kortest)
> > > > > > +#define KOR    VKINSN(kor)
> > > > > > +#define KTEST  VKINSN(ktest)
> > > > > > +#define KAND   VKINSN(kand)
> > > > > > +#define KXOR   VKINSN(kxor)
> > > > > > +#define KNOT   VKINSN(knot)
> > > > > > +#define KXNOR  VKINSN(kxnor)
> > > > > > +#define KUNPACK        VKINSN(kunpack)
> > > > > > +
> > > > > > +#ifndef REG_WIDTH
> > > > > > +# define REG_WIDTH VEC_SIZE
> > > > > > +#endif
> > > > >
> > > > > Which files will define REG_WIDTH?  What values will it be for
> > > > > YMM and ZMM vectors?
> > > >
> > > > for non-wide char evex or avx2/sse2 REG_WIDTH = VEC_SIZE
> > > > so for YMM REG_WIDTH = 32, for ZMM REG_WIDTH = 64.
> > > >
> > > > For wchar impls REG_WIDTH will often be 32 irrelivant of YMM/ZMM.
> > >
> > > Then we should have
> > >
> > > #ifdef USE_WIDE_CHAR
> > > # define REG_WIDTH 32
> > > #else
> > > # define REG_WIDTH VEC_SIZE
> > > #endif
> > >
> >
> > It may not be universal. It may be that some wide-char impls will want
> > REG_WIDTH == 8/16 if they rely heavily on `inc` to do zero test or
>
> I think we can define a macro for it if needed.

We can, but don't you think just REG_WIDTH is more direct?

>
> > for some reason or another uses the full VEC_SIZE (as wcslen-evex512
> > currently does).
>
> Will REG_WIDTH == 32 work for wcslen-evex512?
>

I believe so, but I am trying to make these patches zero-effect. I think a
separate patch to actually make substantive changes makes more sense.
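
To make the granularity point concrete, a purely hypothetical sketch
(no patch in this series does this) of a wide-char file pinning the
width directly:
```
/* Hypothetical wcslen-evex512 setup: 64-byte vectors but 32-bit GPR /
   mask-insn widths (one ZMM only holds 16 wchar_t, so 32 bits of mask
   is enough).  */
#define REG_WIDTH 32	/* Before reg-macros.h, which only defaults it.  */
#include "x86-evex512-vecs.h"
#include "reg-macros.h"

	/* KMOV -> kmovd, VRAX -> eax, even though VEC_SIZE == 64.  */
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
```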
> > Also don't really see what it saves to give up the granularity.
> > Either way to specify a seperate reg width the wchar impl will
> > need to define something else. Seems reasonable for that
> > something else to just be REG_WIDTH directly as opposed to
> > USE_WIDE_CHAR.
> >
> > What do you think?
> > > > >
> > > > > > +#define VPASTER(x, y)  x##_##y
> > > > > > +#define VEVALUATOR(x, y)       VPASTER(x, y)
> > > > > > +
> > > > > > +#define VGPR_SZ(reg_name, reg_size)    VEVALUATOR(reg_name, reg_size)
> > > > > > +#define VKINSN_SZ(insn, reg_size)      VEVALUATOR(insn, reg_size)
> > > > > > +
> > > > > > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> > > > > > +#define VKINSN(mask_insn)      VKINSN_SZ(mask_insn, REG_WIDTH)
> > > > > > +
> > > > > > +#endif
> > > > > > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > > > new file mode 100644
> > > > > > index 0000000000..c7296a8104
> > > > > > --- /dev/null
> > > > > > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > > > @@ -0,0 +1,123 @@
> > > > > > +#!/usr/bin/python3
> > > > > > +# Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > +# This file is part of the GNU C Library.
> > > > > > +#
> > > > > > +# The GNU C Library is free software; you can redistribute it and/or
> > > > > > +# modify it under the terms of the GNU Lesser General Public
> > > > > > +# License as published by the Free Software Foundation; either
> > > > > > +# version 2.1 of the License, or (at your option) any later version.
> > > > > > +#
> > > > > > +# The GNU C Library is distributed in the hope that it will be useful,
> > > > > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > +# Lesser General Public License for more details.
> > > > > > +#
> > > > > > +# You should have received a copy of the GNU Lesser General Public
> > > > > > +# License along with the GNU C Library; if not, see
> > > > > > +# <https://www.gnu.org/licenses/>.
> > > > > > +"""Generate macros for getting GPR name of a certain size
> > > > > > +
> > > > > > +Inputs: None
> > > > > > +Output: Prints header fill to stdout
> > > > > > +
> > > > > > +API:
> > > > > > +    VGPR(reg_name)
> > > > > > +        - Get register name VEC_SIZE component of `reg_name`
> > > > > > +    VGPR_SZ(reg_name, reg_size)
> > > > > > +        - Get register name `reg_size` component of `reg_name`
> > > > > > +"""
> > > > > > +
> > > > > > +import sys
> > > > > > +import os
> > > > > > +from datetime import datetime
> > > > > > +
> > > > > > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> > > > > > +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> > > > > > +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> > > > > > +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> > > > > > +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> > > > > > +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> > > > > > +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> > > > > > +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> > > > > > +
> > > > > > +mask_insns = [
> > > > > > +    "kmov",
> > > > > > +    "kortest",
> > > > > > +    "kor",
> > > > > > +    "ktest",
> > > > > > +    "kand",
> > > > > > +    "kxor",
> > > > > > +    "knot",
> > > > > > +    "kxnor",
> > > > > > +]
> > > > > > +mask_insns_ext = ["b", "w", "d", "q"]
> > > > > > +
> > > > > > +cr = """
> > > > > > +   Copyright (C) {} Free Software Foundation, Inc.
> > > > > > +   This file is part of the GNU C Library.
> > > > > > +
> > > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > > +   License as published by the Free Software Foundation; either
> > > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > > +
> > > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > +   Lesser General Public License for more details.
> > > > > > +
> > > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > > +   License along with the GNU C Library; if not, see
> > > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > > +"""
> > > > > > +
> > > > > > +print("/* This file was generated by: {}.".format(os.path.basename(
> > > > > > +    sys.argv[0])))
> > > > > > +print(cr.format(datetime.today().year))
> > > > > > +
> > > > > > +print("#ifndef _REG_MACROS_H")
> > > > > > +print("#define _REG_MACROS_H\t1")
> > > > > > +print("")
> > > > > > +for reg in registers:
> > > > > > +    for i in range(0, 4):
> > > > > > +        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
> > > > > > +
> > > > > > +print("")
> > > > > > +for mask_insn in mask_insns:
> > > > > > +    for i in range(0, 4):
> > > > > > +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> > > > > > +                                           mask_insns_ext[i]))
> > > > > > +for i in range(0, 3):
> > > > > > +    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
> > > > > > +                                                   mask_insns_ext[i + 1]))
> > > > > > +mask_insns.append("kunpack")
> > > > > > +
> > > > > > +print("")
> > > > > > +print(
> > > > > > +    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
> > > > > > +for reg in registers:
> > > > > > +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> > > > > > +
> > > > > > +print("")
> > > > > > +
> > > > > > +print(
> > > > > > +    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
> > > > > > +)
> > > > > > +for mask_insn in mask_insns:
> > > > > > +    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
> > > > > > +print("")
> > > > > > +
> > > > > > +print("#ifndef REG_WIDTH")
> > > > > > +print("# define REG_WIDTH VEC_SIZE")
> > > > > > +print("#endif")
> > > > > > +print("")
> > > > > > +print("#define VPASTER(x, y)\tx##_##y")
> > > > > > +print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
> > > > > > +print("")
> > > > > > +print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
> > > > > > +print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
> > > > > > +print("")
> > > > > > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> > > > > > +print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
> > > > > > +
> > > > > > +print("\n#endif")
> > > > > > --
> > > > > > 2.34.1
> > > > > >
> > > > >
> > > > >
> > > > > --
> > > > > H.J.
> > >
> > >
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v5 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 23:15               ` Noah Goldstein
@ 2022-10-14 23:22                 ` H.J. Lu
  2022-10-14 23:25                   ` Noah Goldstein
  0 siblings, 1 reply; 72+ messages in thread
From: H.J. Lu @ 2022-10-14 23:22 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

 On Fri, Oct 14, 2022 at 4:15 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Oct 14, 2022 at 5:41 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Oct 14, 2022 at 3:27 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Fri, Oct 14, 2022 at 5:06 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > >  On Fri, Oct 14, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > On Fri, Oct 14, 2022 at 4:28 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Oct 14, 2022 at 2:15 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > > >
> > > > > > > This is to make it easier to do think like:
> > > > > > > ```
> > > > > > > vpcmpb %VEC(0), %VEC(1), %k0
> > > > > > > kmov{d|q} %k0, %{eax|rax}
> > > > > > > test %{eax|rax}
> > > > > > > ```
> > > > > > >
> > > > > > > It adds macro s.t any GPR can get the proper width with:
> > > > > > >     `V{upper_case_GPR_name}`
> > > > > > >
> > > > > > > and any mask insn can get the proper width with:
> > > > > > >     `{mask_insn_without_postfix}V`
> > > > > > >
> > > > > > > This commit does not change libc.so
> > > > > > >
> > > > > > > Tested build on x86-64
> > > > > > > ---
> > > > > > >  sysdeps/x86_64/multiarch/reg-macros.h         | 166 ++++++++++++++++++
> > > > > > >  .../multiarch/scripts/gen-reg-macros.py       | 123 +++++++++++++
> > > > > > >  2 files changed, 289 insertions(+)
> > > > > > >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > > > >
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > > new file mode 100644
> > > > > > > index 0000000000..16168b6fda
> > > > > > > --- /dev/null
> > > > > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > > @@ -0,0 +1,166 @@
> > > > > > > +/* This file was generated by: gen-reg-macros.py.
> > > > > > > +
> > > > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > +   This file is part of the GNU C Library.
> > > > > > > +
> > > > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > > > +   License as published by the Free Software Foundation; either
> > > > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > > > +
> > > > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > +   Lesser General Public License for more details.
> > > > > > > +
> > > > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > > > +   License along with the GNU C Library; if not, see
> > > > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > > > +
> > > > > > > +#ifndef _REG_MACROS_H
> > > > > > > +#define _REG_MACROS_H  1
> > > > > > > +
> > > > > > > +#define rax_8  al
> > > > > > > +#define rax_16 ax
> > > > > > > +#define rax_32 eax
> > > > > > > +#define rax_64 rax
> > > > > > > +#define rbx_8  bl
> > > > > > > +#define rbx_16 bx
> > > > > > > +#define rbx_32 ebx
> > > > > > > +#define rbx_64 rbx
> > > > > > > +#define rcx_8  cl
> > > > > > > +#define rcx_16 cx
> > > > > > > +#define rcx_32 ecx
> > > > > > > +#define rcx_64 rcx
> > > > > > > +#define rdx_8  dl
> > > > > > > +#define rdx_16 dx
> > > > > > > +#define rdx_32 edx
> > > > > > > +#define rdx_64 rdx
> > > > > > > +#define rbp_8  bpl
> > > > > > > +#define rbp_16 bp
> > > > > > > +#define rbp_32 ebp
> > > > > > > +#define rbp_64 rbp
> > > > > > > +#define rsp_8  spl
> > > > > > > +#define rsp_16 sp
> > > > > > > +#define rsp_32 esp
> > > > > > > +#define rsp_64 rsp
> > > > > > > +#define rsi_8  sil
> > > > > > > +#define rsi_16 si
> > > > > > > +#define rsi_32 esi
> > > > > > > +#define rsi_64 rsi
> > > > > > > +#define rdi_8  dil
> > > > > > > +#define rdi_16 di
> > > > > > > +#define rdi_32 edi
> > > > > > > +#define rdi_64 rdi
> > > > > > > +#define r8_8   r8b
> > > > > > > +#define r8_16  r8w
> > > > > > > +#define r8_32  r8d
> > > > > > > +#define r8_64  r8
> > > > > > > +#define r9_8   r9b
> > > > > > > +#define r9_16  r9w
> > > > > > > +#define r9_32  r9d
> > > > > > > +#define r9_64  r9
> > > > > > > +#define r10_8  r10b
> > > > > > > +#define r10_16 r10w
> > > > > > > +#define r10_32 r10d
> > > > > > > +#define r10_64 r10
> > > > > > > +#define r11_8  r11b
> > > > > > > +#define r11_16 r11w
> > > > > > > +#define r11_32 r11d
> > > > > > > +#define r11_64 r11
> > > > > > > +#define r12_8  r12b
> > > > > > > +#define r12_16 r12w
> > > > > > > +#define r12_32 r12d
> > > > > > > +#define r12_64 r12
> > > > > > > +#define r13_8  r13b
> > > > > > > +#define r13_16 r13w
> > > > > > > +#define r13_32 r13d
> > > > > > > +#define r13_64 r13
> > > > > > > +#define r14_8  r14b
> > > > > > > +#define r14_16 r14w
> > > > > > > +#define r14_32 r14d
> > > > > > > +#define r14_64 r14
> > > > > > > +#define r15_8  r15b
> > > > > > > +#define r15_16 r15w
> > > > > > > +#define r15_32 r15d
> > > > > > > +#define r15_64 r15
> > > > > > > +
> > > > > > > +#define kmov_8 kmovb
> > > > > > > +#define kmov_16        kmovw
> > > > > > > +#define kmov_32        kmovd
> > > > > > > +#define kmov_64        kmovq
> > > > > > > +#define kortest_8      kortestb
> > > > > > > +#define kortest_16     kortestw
> > > > > > > +#define kortest_32     kortestd
> > > > > > > +#define kortest_64     kortestq
> > > > > > > +#define kor_8  korb
> > > > > > > +#define kor_16 korw
> > > > > > > +#define kor_32 kord
> > > > > > > +#define kor_64 korq
> > > > > > > +#define ktest_8        ktestb
> > > > > > > +#define ktest_16       ktestw
> > > > > > > +#define ktest_32       ktestd
> > > > > > > +#define ktest_64       ktestq
> > > > > > > +#define kand_8 kandb
> > > > > > > +#define kand_16        kandw
> > > > > > > +#define kand_32        kandd
> > > > > > > +#define kand_64        kandq
> > > > > > > +#define kxor_8 kxorb
> > > > > > > +#define kxor_16        kxorw
> > > > > > > +#define kxor_32        kxord
> > > > > > > +#define kxor_64        kxorq
> > > > > > > +#define knot_8 knotb
> > > > > > > +#define knot_16        knotw
> > > > > > > +#define knot_32        knotd
> > > > > > > +#define knot_64        knotq
> > > > > > > +#define kxnor_8        kxnorb
> > > > > > > +#define kxnor_16       kxnorw
> > > > > > > +#define kxnor_32       kxnord
> > > > > > > +#define kxnor_64       kxnorq
> > > > > > > +#define kunpack_8      kunpackbw
> > > > > > > +#define kunpack_16     kunpackwd
> > > > > > > +#define kunpack_32     kunpackdq
> > > > > > > +
> > > > > > > +/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
> > > > > > > +#define VRAX   VGPR(rax)
> > > > > > > +#define VRBX   VGPR(rbx)
> > > > > > > +#define VRCX   VGPR(rcx)
> > > > > > > +#define VRDX   VGPR(rdx)
> > > > > > > +#define VRBP   VGPR(rbp)
> > > > > > > +#define VRSP   VGPR(rsp)
> > > > > > > +#define VRSI   VGPR(rsi)
> > > > > > > +#define VRDI   VGPR(rdi)
> > > > > > > +#define VR8    VGPR(r8)
> > > > > > > +#define VR9    VGPR(r9)
> > > > > > > +#define VR10   VGPR(r10)
> > > > > > > +#define VR11   VGPR(r11)
> > > > > > > +#define VR12   VGPR(r12)
> > > > > > > +#define VR13   VGPR(r13)
> > > > > > > +#define VR14   VGPR(r14)
> > > > > > > +#define VR15   VGPR(r15)
> > > > > > > +
> > > > > > > +/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
> > > > > > > +#define KMOV   VKINSN(kmov)
> > > > > > > +#define KORTEST        VKINSN(kortest)
> > > > > > > +#define KOR    VKINSN(kor)
> > > > > > > +#define KTEST  VKINSN(ktest)
> > > > > > > +#define KAND   VKINSN(kand)
> > > > > > > +#define KXOR   VKINSN(kxor)
> > > > > > > +#define KNOT   VKINSN(knot)
> > > > > > > +#define KXNOR  VKINSN(kxnor)
> > > > > > > +#define KUNPACK        VKINSN(kunpack)
> > > > > > > +
> > > > > > > +#ifndef REG_WIDTH
> > > > > > > +# define REG_WIDTH VEC_SIZE
> > > > > > > +#endif
> > > > > >
> > > > > > Which files will define REG_WIDTH?  What values will it be for
> > > > > > YMM and ZMM vectors?
> > > > >
> > > > > for non-wide char evex or avx2/sse2 REG_WIDTH = VEC_SIZE
> > > > > so for YMM REG_WIDTH = 32, for ZMM REG_WIDTH = 64.
> > > > >
> > > > > For wchar impls REG_WIDTH will often be 32 irrelivant of YMM/ZMM.
> > > >
> > > > Then we should have
> > > >
> > > > #ifdef USE_WIDE_CHAR
> > > > # define REG_WIDTH 32
> > > > #else
> > > > # define REG_WIDTH VEC_SIZE
> > > > #endif
> > > >
> > >
> > > It may not be universal. It may be that some wide-char impls will want
> > > REG_WIDTH == 8/16 if they rely heavily on `inc` to do zero test or
> >
> > I think we can define a macro for it if needed.
>
> We can but don't you think just REG_WIDTH is more direct?

It is very likely that 8-bit/16-bit registers will be used only for specific
operations.  The majority of operations will use 32-bit registers.  Things like

#ifndef REG_WIDTH
# define REG_WIDTH VEC_SIZE
#endif

may lead to questions.
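
For illustration only (a hypothetical snippet, not part of this patch), the
two options for a wide-char implementation would look something like:

/* Pick the width explicitly.  */
#define REG_WIDTH 32
#include "reg-macros.h"

versus

/* Only state intent; reg-macros.h then picks 32.  */
#define USE_WIDE_CHAR 1
#include "reg-macros.h"

Either way KMOV %k0, %VRCX assembles as kmovd %k0, %ecx.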

> >
> > > for some reason or another uses the full VEC_SIZE (as wcslen-evex512
> > > currently does).
> >
> > Will REG_WIDTH == 32 work for wcslen-evex512?
> >
>
> I believe so but am trying to make these patch zero-affect. I think a seperate
> patch to actually make substantive changes make more sense.

USE_WIDE_CHAR is currently undefined, so there is no impact.

> > > Also don't really see what it saves to give up the granularity.
> > > Either way to specify a seperate reg width the wchar impl will
> > > need to define something else. Seems reasonable for that
> > > something else to just be REG_WIDTH directly as opposed to
> > > USE_WIDE_CHAR.
> > >
> > > What do you think?
> > > > > >
> > > > > > > +#define VPASTER(x, y)  x##_##y
> > > > > > > +#define VEVALUATOR(x, y)       VPASTER(x, y)
> > > > > > > +
> > > > > > > +#define VGPR_SZ(reg_name, reg_size)    VEVALUATOR(reg_name, reg_size)
> > > > > > > +#define VKINSN_SZ(insn, reg_size)      VEVALUATOR(insn, reg_size)
> > > > > > > +
> > > > > > > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> > > > > > > +#define VKINSN(mask_insn)      VKINSN_SZ(mask_insn, REG_WIDTH)
> > > > > > > +
> > > > > > > +#endif
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > > > > new file mode 100644
> > > > > > > index 0000000000..c7296a8104
> > > > > > > --- /dev/null
> > > > > > > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > > > > @@ -0,0 +1,123 @@
> > > > > > > +#!/usr/bin/python3
> > > > > > > +# Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > +# This file is part of the GNU C Library.
> > > > > > > +#
> > > > > > > +# The GNU C Library is free software; you can redistribute it and/or
> > > > > > > +# modify it under the terms of the GNU Lesser General Public
> > > > > > > +# License as published by the Free Software Foundation; either
> > > > > > > +# version 2.1 of the License, or (at your option) any later version.
> > > > > > > +#
> > > > > > > +# The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > +# Lesser General Public License for more details.
> > > > > > > +#
> > > > > > > +# You should have received a copy of the GNU Lesser General Public
> > > > > > > +# License along with the GNU C Library; if not, see
> > > > > > > +# <https://www.gnu.org/licenses/>.
> > > > > > > +"""Generate macros for getting GPR name of a certain size
> > > > > > > +
> > > > > > > +Inputs: None
> > > > > > > +Output: Prints header fill to stdout
> > > > > > > +
> > > > > > > +API:
> > > > > > > +    VGPR(reg_name)
> > > > > > > +        - Get register name VEC_SIZE component of `reg_name`
> > > > > > > +    VGPR_SZ(reg_name, reg_size)
> > > > > > > +        - Get register name `reg_size` component of `reg_name`
> > > > > > > +"""
> > > > > > > +
> > > > > > > +import sys
> > > > > > > +import os
> > > > > > > +from datetime import datetime
> > > > > > > +
> > > > > > > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> > > > > > > +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> > > > > > > +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> > > > > > > +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> > > > > > > +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> > > > > > > +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> > > > > > > +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> > > > > > > +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> > > > > > > +
> > > > > > > +mask_insns = [
> > > > > > > +    "kmov",
> > > > > > > +    "kortest",
> > > > > > > +    "kor",
> > > > > > > +    "ktest",
> > > > > > > +    "kand",
> > > > > > > +    "kxor",
> > > > > > > +    "knot",
> > > > > > > +    "kxnor",
> > > > > > > +]
> > > > > > > +mask_insns_ext = ["b", "w", "d", "q"]
> > > > > > > +
> > > > > > > +cr = """
> > > > > > > +   Copyright (C) {} Free Software Foundation, Inc.
> > > > > > > +   This file is part of the GNU C Library.
> > > > > > > +
> > > > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > > > +   License as published by the Free Software Foundation; either
> > > > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > > > +
> > > > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > +   Lesser General Public License for more details.
> > > > > > > +
> > > > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > > > +   License along with the GNU C Library; if not, see
> > > > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > > > +"""
> > > > > > > +
> > > > > > > +print("/* This file was generated by: {}.".format(os.path.basename(
> > > > > > > +    sys.argv[0])))
> > > > > > > +print(cr.format(datetime.today().year))
> > > > > > > +
> > > > > > > +print("#ifndef _REG_MACROS_H")
> > > > > > > +print("#define _REG_MACROS_H\t1")
> > > > > > > +print("")
> > > > > > > +for reg in registers:
> > > > > > > +    for i in range(0, 4):
> > > > > > > +        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
> > > > > > > +
> > > > > > > +print("")
> > > > > > > +for mask_insn in mask_insns:
> > > > > > > +    for i in range(0, 4):
> > > > > > > +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> > > > > > > +                                           mask_insns_ext[i]))
> > > > > > > +for i in range(0, 3):
> > > > > > > +    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
> > > > > > > +                                                   mask_insns_ext[i + 1]))
> > > > > > > +mask_insns.append("kunpack")
> > > > > > > +
> > > > > > > +print("")
> > > > > > > +print(
> > > > > > > +    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
> > > > > > > +for reg in registers:
> > > > > > > +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> > > > > > > +
> > > > > > > +print("")
> > > > > > > +
> > > > > > > +print(
> > > > > > > +    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
> > > > > > > +)
> > > > > > > +for mask_insn in mask_insns:
> > > > > > > +    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
> > > > > > > +print("")
> > > > > > > +
> > > > > > > +print("#ifndef REG_WIDTH")
> > > > > > > +print("# define REG_WIDTH VEC_SIZE")
> > > > > > > +print("#endif")
> > > > > > > +print("")
> > > > > > > +print("#define VPASTER(x, y)\tx##_##y")
> > > > > > > +print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
> > > > > > > +print("")
> > > > > > > +print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
> > > > > > > +print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
> > > > > > > +print("")
> > > > > > > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> > > > > > > +print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
> > > > > > > +
> > > > > > > +print("\n#endif")
> > > > > > > --
> > > > > > > 2.34.1
> > > > > > >
> > > > > >
> > > > > >
> > > > > > --
> > > > > > H.J.
> > > >
> > > >
> > > >
> > > > --
> > > > H.J.
> >
> >
> >
> > --
> > H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v5 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE
  2022-10-14 23:22                 ` H.J. Lu
@ 2022-10-14 23:25                   ` Noah Goldstein
  0 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-14 23:25 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 6:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
>  On Fri, Oct 14, 2022 at 4:15 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Oct 14, 2022 at 5:41 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Oct 14, 2022 at 3:27 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Fri, Oct 14, 2022 at 5:06 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > >  On Fri, Oct 14, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Oct 14, 2022 at 4:28 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > On Fri, Oct 14, 2022 at 2:15 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > > > >
> > > > > > > > This is to make it easier to do think like:
> > > > > > > > ```
> > > > > > > > vpcmpb %VEC(0), %VEC(1), %k0
> > > > > > > > kmov{d|q} %k0, %{eax|rax}
> > > > > > > > test %{eax|rax}
> > > > > > > > ```
> > > > > > > >
> > > > > > > > It adds macro s.t any GPR can get the proper width with:
> > > > > > > >     `V{upper_case_GPR_name}`
> > > > > > > >
> > > > > > > > and any mask insn can get the proper width with:
> > > > > > > >     `{mask_insn_without_postfix}V`
> > > > > > > >
> > > > > > > > This commit does not change libc.so
> > > > > > > >
> > > > > > > > Tested build on x86-64
> > > > > > > > ---
> > > > > > > >  sysdeps/x86_64/multiarch/reg-macros.h         | 166 ++++++++++++++++++
> > > > > > > >  .../multiarch/scripts/gen-reg-macros.py       | 123 +++++++++++++
> > > > > > > >  2 files changed, 289 insertions(+)
> > > > > > > >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > > >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > > > > >
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > > > new file mode 100644
> > > > > > > > index 0000000000..16168b6fda
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> > > > > > > > @@ -0,0 +1,166 @@
> > > > > > > > +/* This file was generated by: gen-reg-macros.py.
> > > > > > > > +
> > > > > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > > +   This file is part of the GNU C Library.
> > > > > > > > +
> > > > > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > > > > +   License as published by the Free Software Foundation; either
> > > > > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > > > > +
> > > > > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > > +   Lesser General Public License for more details.
> > > > > > > > +
> > > > > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > > > > +   License along with the GNU C Library; if not, see
> > > > > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > > > > +
> > > > > > > > +#ifndef _REG_MACROS_H
> > > > > > > > +#define _REG_MACROS_H  1
> > > > > > > > +
> > > > > > > > +#define rax_8  al
> > > > > > > > +#define rax_16 ax
> > > > > > > > +#define rax_32 eax
> > > > > > > > +#define rax_64 rax
> > > > > > > > +#define rbx_8  bl
> > > > > > > > +#define rbx_16 bx
> > > > > > > > +#define rbx_32 ebx
> > > > > > > > +#define rbx_64 rbx
> > > > > > > > +#define rcx_8  cl
> > > > > > > > +#define rcx_16 cx
> > > > > > > > +#define rcx_32 ecx
> > > > > > > > +#define rcx_64 rcx
> > > > > > > > +#define rdx_8  dl
> > > > > > > > +#define rdx_16 dx
> > > > > > > > +#define rdx_32 edx
> > > > > > > > +#define rdx_64 rdx
> > > > > > > > +#define rbp_8  bpl
> > > > > > > > +#define rbp_16 bp
> > > > > > > > +#define rbp_32 ebp
> > > > > > > > +#define rbp_64 rbp
> > > > > > > > +#define rsp_8  spl
> > > > > > > > +#define rsp_16 sp
> > > > > > > > +#define rsp_32 esp
> > > > > > > > +#define rsp_64 rsp
> > > > > > > > +#define rsi_8  sil
> > > > > > > > +#define rsi_16 si
> > > > > > > > +#define rsi_32 esi
> > > > > > > > +#define rsi_64 rsi
> > > > > > > > +#define rdi_8  dil
> > > > > > > > +#define rdi_16 di
> > > > > > > > +#define rdi_32 edi
> > > > > > > > +#define rdi_64 rdi
> > > > > > > > +#define r8_8   r8b
> > > > > > > > +#define r8_16  r8w
> > > > > > > > +#define r8_32  r8d
> > > > > > > > +#define r8_64  r8
> > > > > > > > +#define r9_8   r9b
> > > > > > > > +#define r9_16  r9w
> > > > > > > > +#define r9_32  r9d
> > > > > > > > +#define r9_64  r9
> > > > > > > > +#define r10_8  r10b
> > > > > > > > +#define r10_16 r10w
> > > > > > > > +#define r10_32 r10d
> > > > > > > > +#define r10_64 r10
> > > > > > > > +#define r11_8  r11b
> > > > > > > > +#define r11_16 r11w
> > > > > > > > +#define r11_32 r11d
> > > > > > > > +#define r11_64 r11
> > > > > > > > +#define r12_8  r12b
> > > > > > > > +#define r12_16 r12w
> > > > > > > > +#define r12_32 r12d
> > > > > > > > +#define r12_64 r12
> > > > > > > > +#define r13_8  r13b
> > > > > > > > +#define r13_16 r13w
> > > > > > > > +#define r13_32 r13d
> > > > > > > > +#define r13_64 r13
> > > > > > > > +#define r14_8  r14b
> > > > > > > > +#define r14_16 r14w
> > > > > > > > +#define r14_32 r14d
> > > > > > > > +#define r14_64 r14
> > > > > > > > +#define r15_8  r15b
> > > > > > > > +#define r15_16 r15w
> > > > > > > > +#define r15_32 r15d
> > > > > > > > +#define r15_64 r15
> > > > > > > > +
> > > > > > > > +#define kmov_8 kmovb
> > > > > > > > +#define kmov_16        kmovw
> > > > > > > > +#define kmov_32        kmovd
> > > > > > > > +#define kmov_64        kmovq
> > > > > > > > +#define kortest_8      kortestb
> > > > > > > > +#define kortest_16     kortestw
> > > > > > > > +#define kortest_32     kortestd
> > > > > > > > +#define kortest_64     kortestq
> > > > > > > > +#define kor_8  korb
> > > > > > > > +#define kor_16 korw
> > > > > > > > +#define kor_32 kord
> > > > > > > > +#define kor_64 korq
> > > > > > > > +#define ktest_8        ktestb
> > > > > > > > +#define ktest_16       ktestw
> > > > > > > > +#define ktest_32       ktestd
> > > > > > > > +#define ktest_64       ktestq
> > > > > > > > +#define kand_8 kandb
> > > > > > > > +#define kand_16        kandw
> > > > > > > > +#define kand_32        kandd
> > > > > > > > +#define kand_64        kandq
> > > > > > > > +#define kxor_8 kxorb
> > > > > > > > +#define kxor_16        kxorw
> > > > > > > > +#define kxor_32        kxord
> > > > > > > > +#define kxor_64        kxorq
> > > > > > > > +#define knot_8 knotb
> > > > > > > > +#define knot_16        knotw
> > > > > > > > +#define knot_32        knotd
> > > > > > > > +#define knot_64        knotq
> > > > > > > > +#define kxnor_8        kxnorb
> > > > > > > > +#define kxnor_16       kxnorw
> > > > > > > > +#define kxnor_32       kxnord
> > > > > > > > +#define kxnor_64       kxnorq
> > > > > > > > +#define kunpack_8      kunpackbw
> > > > > > > > +#define kunpack_16     kunpackwd
> > > > > > > > +#define kunpack_32     kunpackdq
> > > > > > > > +
> > > > > > > > +/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
> > > > > > > > +#define VRAX   VGPR(rax)
> > > > > > > > +#define VRBX   VGPR(rbx)
> > > > > > > > +#define VRCX   VGPR(rcx)
> > > > > > > > +#define VRDX   VGPR(rdx)
> > > > > > > > +#define VRBP   VGPR(rbp)
> > > > > > > > +#define VRSP   VGPR(rsp)
> > > > > > > > +#define VRSI   VGPR(rsi)
> > > > > > > > +#define VRDI   VGPR(rdi)
> > > > > > > > +#define VR8    VGPR(r8)
> > > > > > > > +#define VR9    VGPR(r9)
> > > > > > > > +#define VR10   VGPR(r10)
> > > > > > > > +#define VR11   VGPR(r11)
> > > > > > > > +#define VR12   VGPR(r12)
> > > > > > > > +#define VR13   VGPR(r13)
> > > > > > > > +#define VR14   VGPR(r14)
> > > > > > > > +#define VR15   VGPR(r15)
> > > > > > > > +
> > > > > > > > +/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
> > > > > > > > +#define KMOV   VKINSN(kmov)
> > > > > > > > +#define KORTEST        VKINSN(kortest)
> > > > > > > > +#define KOR    VKINSN(kor)
> > > > > > > > +#define KTEST  VKINSN(ktest)
> > > > > > > > +#define KAND   VKINSN(kand)
> > > > > > > > +#define KXOR   VKINSN(kxor)
> > > > > > > > +#define KNOT   VKINSN(knot)
> > > > > > > > +#define KXNOR  VKINSN(kxnor)
> > > > > > > > +#define KUNPACK        VKINSN(kunpack)
> > > > > > > > +
> > > > > > > > +#ifndef REG_WIDTH
> > > > > > > > +# define REG_WIDTH VEC_SIZE
> > > > > > > > +#endif
> > > > > > >
> > > > > > > Which files will define REG_WIDTH?  What values will it be for
> > > > > > > YMM and ZMM vectors?
> > > > > >
> > > > > > for non-wide char evex or avx2/sse2 REG_WIDTH = VEC_SIZE
> > > > > > so for YMM REG_WIDTH = 32, for ZMM REG_WIDTH = 64.
> > > > > >
> > > > > > For wchar impls REG_WIDTH will often be 32 irrelivant of YMM/ZMM.
> > > > >
> > > > > Then we should have
> > > > >
> > > > > #ifdef USE_WIDE_CHAR
> > > > > # define REG_WIDTH 32
> > > > > #else
> > > > > # define REG_WIDTH VEC_SIZE
> > > > > #endif
> > > > >
> > > >
> > > > It may not be universal. It may be that some wide-char impls will want
> > > > REG_WIDTH == 8/16 if they rely heavily on `inc` to do zero test or
> > >
> > > I think we can define a macro for it if needed.
> >
> > We can but don't you think just REG_WIDTH is more direct?
>
> It is very likely that 8-bit/16-bit registers will be used only for specific
> operations.  Majority operations will be in 32-bit.  Things like
>
> #ifndef REG_WIDTH
> # define REG_WIDTH VEC_SIZE
> #endif
>
> may lead to questions.
>
> > >
> > > > for some reason or another uses the full VEC_SIZE (as wcslen-evex512
> > > > currently does).
> > >
> > > Will REG_WIDTH == 32 work for wcslen-evex512?
> > >
> >
> > I believe so but am trying to make these patch zero-affect. I think a seperate
> > patch to actually make substantive changes make more sense.
>
> USE_WIDE_CHAR is undefined currently.   There is no impact.
>
> > > > Also don't really see what it saves to give up the granularity.
> > > > Either way to specify a seperate reg width the wchar impl will
> > > > need to define something else. Seems reasonable for that
> > > > something else to just be REG_WIDTH directly as opposed to
> > > > USE_WIDE_CHAR.
> > > >
> > > > What do you think?
> > > > > > >
> > > > > > > > +#define VPASTER(x, y)  x##_##y
> > > > > > > > +#define VEVALUATOR(x, y)       VPASTER(x, y)
> > > > > > > > +
> > > > > > > > +#define VGPR_SZ(reg_name, reg_size)    VEVALUATOR(reg_name, reg_size)
> > > > > > > > +#define VKINSN_SZ(insn, reg_size)      VEVALUATOR(insn, reg_size)
> > > > > > > > +
> > > > > > > > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> > > > > > > > +#define VKINSN(mask_insn)      VKINSN_SZ(mask_insn, REG_WIDTH)
> > > > > > > > +
> > > > > > > > +#endif
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > > > > > new file mode 100644
> > > > > > > > index 0000000000..c7296a8104
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > > > > > > > @@ -0,0 +1,123 @@
> > > > > > > > +#!/usr/bin/python3
> > > > > > > > +# Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > > +# This file is part of the GNU C Library.
> > > > > > > > +#
> > > > > > > > +# The GNU C Library is free software; you can redistribute it and/or
> > > > > > > > +# modify it under the terms of the GNU Lesser General Public
> > > > > > > > +# License as published by the Free Software Foundation; either
> > > > > > > > +# version 2.1 of the License, or (at your option) any later version.
> > > > > > > > +#
> > > > > > > > +# The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > > +# Lesser General Public License for more details.
> > > > > > > > +#
> > > > > > > > +# You should have received a copy of the GNU Lesser General Public
> > > > > > > > +# License along with the GNU C Library; if not, see
> > > > > > > > +# <https://www.gnu.org/licenses/>.
> > > > > > > > +"""Generate macros for getting GPR name of a certain size
> > > > > > > > +
> > > > > > > > +Inputs: None
> > > > > > > > +Output: Prints header fill to stdout
> > > > > > > > +
> > > > > > > > +API:
> > > > > > > > +    VGPR(reg_name)
> > > > > > > > +        - Get register name VEC_SIZE component of `reg_name`
> > > > > > > > +    VGPR_SZ(reg_name, reg_size)
> > > > > > > > +        - Get register name `reg_size` component of `reg_name`
> > > > > > > > +"""
> > > > > > > > +
> > > > > > > > +import sys
> > > > > > > > +import os
> > > > > > > > +from datetime import datetime
> > > > > > > > +
> > > > > > > > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> > > > > > > > +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> > > > > > > > +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> > > > > > > > +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> > > > > > > > +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> > > > > > > > +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> > > > > > > > +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> > > > > > > > +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> > > > > > > > +
> > > > > > > > +mask_insns = [
> > > > > > > > +    "kmov",
> > > > > > > > +    "kortest",
> > > > > > > > +    "kor",
> > > > > > > > +    "ktest",
> > > > > > > > +    "kand",
> > > > > > > > +    "kxor",
> > > > > > > > +    "knot",
> > > > > > > > +    "kxnor",
> > > > > > > > +]
> > > > > > > > +mask_insns_ext = ["b", "w", "d", "q"]
> > > > > > > > +
> > > > > > > > +cr = """
> > > > > > > > +   Copyright (C) {} Free Software Foundation, Inc.
> > > > > > > > +   This file is part of the GNU C Library.
> > > > > > > > +
> > > > > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > > > > +   License as published by the Free Software Foundation; either
> > > > > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > > > > +
> > > > > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > > +   Lesser General Public License for more details.
> > > > > > > > +
> > > > > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > > > > +   License along with the GNU C Library; if not, see
> > > > > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > > > > +"""
> > > > > > > > +
> > > > > > > > +print("/* This file was generated by: {}.".format(os.path.basename(
> > > > > > > > +    sys.argv[0])))
> > > > > > > > +print(cr.format(datetime.today().year))
> > > > > > > > +
> > > > > > > > +print("#ifndef _REG_MACROS_H")
> > > > > > > > +print("#define _REG_MACROS_H\t1")
> > > > > > > > +print("")
> > > > > > > > +for reg in registers:
> > > > > > > > +    for i in range(0, 4):
> > > > > > > > +        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
> > > > > > > > +
> > > > > > > > +print("")
> > > > > > > > +for mask_insn in mask_insns:
> > > > > > > > +    for i in range(0, 4):
> > > > > > > > +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> > > > > > > > +                                           mask_insns_ext[i]))
> > > > > > > > +for i in range(0, 3):
> > > > > > > > +    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
> > > > > > > > +                                                   mask_insns_ext[i + 1]))
> > > > > > > > +mask_insns.append("kunpack")
> > > > > > > > +
> > > > > > > > +print("")
> > > > > > > > +print(
> > > > > > > > +    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
> > > > > > > > +for reg in registers:
> > > > > > > > +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> > > > > > > > +
> > > > > > > > +print("")
> > > > > > > > +
> > > > > > > > +print(
> > > > > > > > +    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
> > > > > > > > +)
> > > > > > > > +for mask_insn in mask_insns:
> > > > > > > > +    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
> > > > > > > > +print("")
> > > > > > > > +
> > > > > > > > +print("#ifndef REG_WIDTH")
> > > > > > > > +print("# define REG_WIDTH VEC_SIZE")
> > > > > > > > +print("#endif")
> > > > > > > > +print("")
> > > > > > > > +print("#define VPASTER(x, y)\tx##_##y")
> > > > > > > > +print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
> > > > > > > > +print("")
> > > > > > > > +print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
> > > > > > > > +print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
> > > > > > > > +print("")
> > > > > > > > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> > > > > > > > +print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
> > > > > > > > +
> > > > > > > > +print("\n#endif")
> > > > > > > > --
> > > > > > > > 2.34.1
> > > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > --
> > > > > > > H.J.
> > > > >
> > > > >
> > > > >
> > > > > --
> > > > > H.J.
> > >
> > >
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.

kk

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls
  2022-10-14 16:40 [PATCH v1 1/3] x86: Update evex256/512 vec macros Noah Goldstein
                   ` (7 preceding siblings ...)
  2022-10-14 22:39 ` [PATCH v6 1/7] x86: Update and move evex256/512 vec macros Noah Goldstein
@ 2022-10-15  0:06 ` Noah Goldstein
  2022-10-15  0:06   ` [PATCH v8 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
                     ` (5 more replies)
  2022-10-15  0:20 ` [PATCH v9 " Noah Goldstein
  2022-10-15  3:00 ` [PATCH v10 " Noah Goldstein
  10 siblings, 6 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  0:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

1) Copy the vec header files so that backporting will be easier.
2) Make section only define if there is not a previous definition
3) Add `VEC_lo` definition for proper reg-width but in the
   ymm/zmm0-15 range.
4) Add macros for accessing GPRs based on VEC_SIZE
        This is to make it easier to do things like:
        ```
            vpcmpb %VEC(0), %VEC(1), %k0
            kmov{d|q} %k0, %{eax|rax}
            test %{eax|rax}
        ```
        It adds macros s.t. any GPR can get the proper width with:
            `V{upper_case_GPR_name}`

        and any mask insn can get the proper width with:
            `{upper_case_mask_insn_without_postfix}`
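
        For illustration (a hypothetical snippet, not taken from the diff
        below), with the evex256 config and USE_WIDE_CHAR undefined,
        REG_WIDTH defaults to VEC_SIZE (32), so:
        ```
            KMOV    %k0, %VRAX            /* expands to kmovd %k0, %eax */
            test    %VRAX, %VRAX          /* expands to test %eax, %eax */
            inc     %VGPR_SZ(rax, 8)      /* expands to inc %al */
        ```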

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/reg-macros.h         | 168 ++++++++++++++++++
 .../multiarch/scripts/gen-reg-macros.py       | 125 +++++++++++++
 sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h   |  35 ++++
 sysdeps/x86_64/multiarch/x86-avx-vecs.h       |  47 +++++
 .../x86_64/multiarch/x86-evex-vecs-common.h   |  39 ++++
 sysdeps/x86_64/multiarch/x86-evex256-vecs.h   |  38 ++++
 sysdeps/x86_64/multiarch/x86-evex512-vecs.h   |  38 ++++
 sysdeps/x86_64/multiarch/x86-sse2-vecs.h      |  47 +++++
 sysdeps/x86_64/multiarch/x86-vec-macros.h     |  90 ++++++++++
 9 files changed, 627 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
 create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-evex256-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-evex512-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-sse2-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-vec-macros.h

diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
new file mode 100644
index 0000000000..c8ea330256
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/reg-macros.h
@@ -0,0 +1,168 @@
+/* This file was generated by: gen-reg-macros.py.
+
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _REG_MACROS_H
+#define _REG_MACROS_H	1
+
+#define rax_8	al
+#define rax_16	ax
+#define rax_32	eax
+#define rax_64	rax
+#define rbx_8	bl
+#define rbx_16	bx
+#define rbx_32	ebx
+#define rbx_64	rbx
+#define rcx_8	cl
+#define rcx_16	cx
+#define rcx_32	ecx
+#define rcx_64	rcx
+#define rdx_8	dl
+#define rdx_16	dx
+#define rdx_32	edx
+#define rdx_64	rdx
+#define rbp_8	bpl
+#define rbp_16	bp
+#define rbp_32	ebp
+#define rbp_64	rbp
+#define rsp_8	spl
+#define rsp_16	sp
+#define rsp_32	esp
+#define rsp_64	rsp
+#define rsi_8	sil
+#define rsi_16	si
+#define rsi_32	esi
+#define rsi_64	rsi
+#define rdi_8	dil
+#define rdi_16	di
+#define rdi_32	edi
+#define rdi_64	rdi
+#define r8_8	r8b
+#define r8_16	r8w
+#define r8_32	r8d
+#define r8_64	r8
+#define r9_8	r9b
+#define r9_16	r9w
+#define r9_32	r9d
+#define r9_64	r9
+#define r10_8	r10b
+#define r10_16	r10w
+#define r10_32	r10d
+#define r10_64	r10
+#define r11_8	r11b
+#define r11_16	r11w
+#define r11_32	r11d
+#define r11_64	r11
+#define r12_8	r12b
+#define r12_16	r12w
+#define r12_32	r12d
+#define r12_64	r12
+#define r13_8	r13b
+#define r13_16	r13w
+#define r13_32	r13d
+#define r13_64	r13
+#define r14_8	r14b
+#define r14_16	r14w
+#define r14_32	r14d
+#define r14_64	r14
+#define r15_8	r15b
+#define r15_16	r15w
+#define r15_32	r15d
+#define r15_64	r15
+
+#define kmov_8	kmovb
+#define kmov_16	kmovw
+#define kmov_32	kmovd
+#define kmov_64	kmovq
+#define kortest_8	kortestb
+#define kortest_16	kortestw
+#define kortest_32	kortestd
+#define kortest_64	kortestq
+#define kor_8	korb
+#define kor_16	korw
+#define kor_32	kord
+#define kor_64	korq
+#define ktest_8	ktestb
+#define ktest_16	ktestw
+#define ktest_32	ktestd
+#define ktest_64	ktestq
+#define kand_8	kandb
+#define kand_16	kandw
+#define kand_32	kandd
+#define kand_64	kandq
+#define kxor_8	kxorb
+#define kxor_16	kxorw
+#define kxor_32	kxord
+#define kxor_64	kxorq
+#define knot_8	knotb
+#define knot_16	knotw
+#define knot_32	knotd
+#define knot_64	knotq
+#define kxnor_8	kxnorb
+#define kxnor_16	kxnorw
+#define kxnor_32	kxnord
+#define kxnor_64	kxnorq
+#define kunpack_8	kunpackbw
+#define kunpack_16	kunpackwd
+#define kunpack_32	kunpackdq
+
+/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
+#define VRAX	VGPR(rax)
+#define VRBX	VGPR(rbx)
+#define VRCX	VGPR(rcx)
+#define VRDX	VGPR(rdx)
+#define VRBP	VGPR(rbp)
+#define VRSP	VGPR(rsp)
+#define VRSI	VGPR(rsi)
+#define VRDI	VGPR(rdi)
+#define VR8	VGPR(r8)
+#define VR9	VGPR(r9)
+#define VR10	VGPR(r10)
+#define VR11	VGPR(r11)
+#define VR12	VGPR(r12)
+#define VR13	VGPR(r13)
+#define VR14	VGPR(r14)
+#define VR15	VGPR(r15)
+
+/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
+#define KMOV 	VKINSN(kmov)
+#define KORTEST 	VKINSN(kortest)
+#define KOR 	VKINSN(kor)
+#define KTEST 	VKINSN(ktest)
+#define KAND 	VKINSN(kand)
+#define KXOR 	VKINSN(kxor)
+#define KNOT 	VKINSN(knot)
+#define KXNOR 	VKINSN(kxnor)
+#define KUNPACK 	VKINSN(kunpack)
+
+#ifdef USE_WIDE_CHAR
+# define REG_WIDTH 32
+#else
+# define REG_WIDTH VEC_SIZE
+#endif
+
+#define VPASTER(x, y)	x##_##y
+#define VEVALUATOR(x, y)	VPASTER(x, y)
+
+#define VGPR_SZ(reg_name, reg_size)	VEVALUATOR(reg_name, reg_size)
+#define VKINSN_SZ(insn, reg_size)	VEVALUATOR(insn, reg_size)
+
+#define VGPR(reg_name)	VGPR_SZ(reg_name, REG_WIDTH)
+#define VKINSN(mask_insn)	VKINSN_SZ(mask_insn, REG_WIDTH)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
new file mode 100644
index 0000000000..6a05f27ff4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
@@ -0,0 +1,125 @@
+#!/usr/bin/python3
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+"""Generate macros for getting GPR name of a certain size
+
+Inputs: None
+Output: Prints header fill to stdout
+
+API:
+    VGPR(reg_name)
+        - Get register name VEC_SIZE component of `reg_name`
+    VGPR_SZ(reg_name, reg_size)
+        - Get register name `reg_size` component of `reg_name`
+"""
+
+import sys
+import os
+from datetime import datetime
+
+registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
+             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
+             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
+             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
+             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
+             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
+             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
+             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
+
+mask_insns = [
+    "kmov",
+    "kortest",
+    "kor",
+    "ktest",
+    "kand",
+    "kxor",
+    "knot",
+    "kxnor",
+]
+mask_insns_ext = ["b", "w", "d", "q"]
+
+cr = """
+   Copyright (C) {} Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+"""
+
+print("/* This file was generated by: {}.".format(os.path.basename(
+    sys.argv[0])))
+print(cr.format(datetime.today().year))
+
+print("#ifndef _REG_MACROS_H")
+print("#define _REG_MACROS_H\t1")
+print("")
+for reg in registers:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
+
+print("")
+for mask_insn in mask_insns:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
+                                           mask_insns_ext[i]))
+for i in range(0, 3):
+    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
+                                                   mask_insns_ext[i + 1]))
+mask_insns.append("kunpack")
+
+print("")
+print(
+    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
+for reg in registers:
+    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
+
+print("")
+
+print(
+    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
+)
+for mask_insn in mask_insns:
+    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
+print("")
+
+print("#ifdef USE_WIDE_CHAR")
+print("# define REG_WIDTH 32")
+print("#else")
+print("# define REG_WIDTH VEC_SIZE")
+print("#endif")
+print("")
+print("#define VPASTER(x, y)\tx##_##y")
+print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
+print("")
+print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
+print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
+print("")
+print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
+print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
+
+print("\n#endif")
diff --git a/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
new file mode 100644
index 0000000000..0b326c8a70
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
@@ -0,0 +1,35 @@
+/* Common config for AVX-RTM VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX_RTM_VECS_H
+#define _X86_AVX_RTM_VECS_H			1
+
+#define COND_VZEROUPPER			COND_VZEROUPPER_XTEST
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN		jmp L(return_vzeroupper)
+
+#define USE_WITH_RTM			1
+#include "x86-avx-vecs.h"
+
+#undef SECTION
+#define SECTION(p)				p##.avx.rtm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
new file mode 100644
index 0000000000..dca1089060
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for AVX VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX_VECS_H
+#define _X86_AVX_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			32
+#include "x86-vec-macros.h"
+
+#define USE_WITH_AVX		1
+#define SECTION(p)			p##.avx
+
+/* 4-byte mov instructions with AVX2.  */
+#define MOV_SIZE			4
+/* 1 (ret) + 3 (vzeroupper).  */
+#define RET_SIZE			4
+#define VZEROUPPER			vzeroupper
+
+#define VMOVU				vmovdqu
+#define VMOVA				vmovdqa
+#define VMOVNT				vmovntdq
+
+/* Often need to access xmm portion.  */
+#define VMM_128				VMM_any_xmm
+#define VMM					VMM_any_ymm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
new file mode 100644
index 0000000000..f331e9d8ec
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
@@ -0,0 +1,39 @@
+/* Common config for EVEX256 and EVEX512 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_EVEX_VECS_COMMON_H
+#define _X86_EVEX_VECS_COMMON_H			1
+
+#include "x86-vec-macros.h"
+
+/* 6-byte mov instructions with EVEX.  */
+#define MOV_SIZE			6
+/* No vzeroupper needed.  */
+#define RET_SIZE			1
+#define VZEROUPPER
+
+#define VMOVU				vmovdqu64
+#define VMOVA				vmovdqa64
+#define VMOVNT				vmovntdq
+
+#define VMM_128				VMM_hi_xmm
+#define VMM_256				VMM_hi_ymm
+#define VMM_512				VMM_hi_zmm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-evex256-vecs.h b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
new file mode 100644
index 0000000000..8337b95504
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
@@ -0,0 +1,38 @@
+/* Common config for EVEX256 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _EVEX256_VECS_H
+#define _EVEX256_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			32
+#include "x86-evex-vecs-common.h"
+
+#define USE_WITH_EVEX256	1
+
+#ifndef SECTION
+# define SECTION(p)			p##.evex
+#endif
+
+#define VMM					VMM_256
+#define VMM_lo				VMM_any_ymm
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-evex512-vecs.h b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
new file mode 100644
index 0000000000..7dc5c23ad0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
@@ -0,0 +1,38 @@
+/* Common config for EVEX512 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _EVEX512_VECS_H
+#define _EVEX512_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			64
+#include "x86-evex-vecs-common.h"
+
+#define USE_WITH_EVEX512	1
+
+#ifndef SECTION
+# define SECTION(p)			p##.evex512
+#endif
+
+#define VMM					VMM_512
+#define VMM_lo				VMM_any_zmm
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-sse2-vecs.h b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
new file mode 100644
index 0000000000..b8bbd5dc29
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for SSE2 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_SSE2_VECS_H
+#define _X86_SSE2_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			16
+#include "x86-vec-macros.h"
+
+#define USE_WITH_SSE2		1
+#define SECTION(p)			p
+
+/* 3-byte mov instructions with SSE2.  */
+#define MOV_SIZE			3
+/* No vzeroupper needed.  */
+#define RET_SIZE			1
+#define VZEROUPPER
+
+#define VMOVU				movups
+#define VMOVA				movaps
+#define VMOVNT				movntdq
+
+#define VMM_128				VMM_any_xmm
+#define VMM					VMM_any_xmm
+
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-vec-macros.h b/sysdeps/x86_64/multiarch/x86-vec-macros.h
new file mode 100644
index 0000000000..7d6bb31d55
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-vec-macros.h
@@ -0,0 +1,90 @@
+/* Macro helpers for VEC_{type}({vec_num})
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_VEC_MACROS_H
+#define _X86_VEC_MACROS_H			1
+
+#ifndef VEC_SIZE
+# error "Never include this file directly. Always include a vector config."
+#endif
+
+/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
+   VMM(N) values.  */
+#define VMM_hi_xmm0				xmm16
+#define VMM_hi_xmm1				xmm17
+#define VMM_hi_xmm2				xmm18
+#define VMM_hi_xmm3				xmm19
+#define VMM_hi_xmm4				xmm20
+#define VMM_hi_xmm5				xmm21
+#define VMM_hi_xmm6				xmm22
+#define VMM_hi_xmm7				xmm23
+#define VMM_hi_xmm8				xmm24
+#define VMM_hi_xmm9				xmm25
+#define VMM_hi_xmm10			xmm26
+#define VMM_hi_xmm11			xmm27
+#define VMM_hi_xmm12			xmm28
+#define VMM_hi_xmm13			xmm29
+#define VMM_hi_xmm14			xmm30
+#define VMM_hi_xmm15			xmm31
+
+#define VMM_hi_ymm0				ymm16
+#define VMM_hi_ymm1				ymm17
+#define VMM_hi_ymm2				ymm18
+#define VMM_hi_ymm3				ymm19
+#define VMM_hi_ymm4				ymm20
+#define VMM_hi_ymm5				ymm21
+#define VMM_hi_ymm6				ymm22
+#define VMM_hi_ymm7				ymm23
+#define VMM_hi_ymm8				ymm24
+#define VMM_hi_ymm9				ymm25
+#define VMM_hi_ymm10			ymm26
+#define VMM_hi_ymm11			ymm27
+#define VMM_hi_ymm12			ymm28
+#define VMM_hi_ymm13			ymm29
+#define VMM_hi_ymm14			ymm30
+#define VMM_hi_ymm15			ymm31
+
+#define VMM_hi_zmm0				zmm16
+#define VMM_hi_zmm1				zmm17
+#define VMM_hi_zmm2				zmm18
+#define VMM_hi_zmm3				zmm19
+#define VMM_hi_zmm4				zmm20
+#define VMM_hi_zmm5				zmm21
+#define VMM_hi_zmm6				zmm22
+#define VMM_hi_zmm7				zmm23
+#define VMM_hi_zmm8				zmm24
+#define VMM_hi_zmm9				zmm25
+#define VMM_hi_zmm10			zmm26
+#define VMM_hi_zmm11			zmm27
+#define VMM_hi_zmm12			zmm28
+#define VMM_hi_zmm13			zmm29
+#define VMM_hi_zmm14			zmm30
+#define VMM_hi_zmm15			zmm31
+
+#define PRIMITIVE_VMM(vec, num)		vec##num
+
+#define VMM_any_xmm(i)			PRIMITIVE_VMM(xmm, i)
+#define VMM_any_ymm(i)			PRIMITIVE_VMM(ymm, i)
+#define VMM_any_zmm(i)			PRIMITIVE_VMM(zmm, i)
+
+#define VMM_hi_xmm(i)			PRIMITIVE_VMM(VMM_hi_xmm, i)
+#define VMM_hi_ymm(i)			PRIMITIVE_VMM(VMM_hi_ymm, i)
+#define VMM_hi_zmm(i)			PRIMITIVE_VMM(VMM_hi_zmm, i)
+
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v8 2/6] x86: Update memrchr to use new VEC macros
  2022-10-15  0:06 ` [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls Noah Goldstein
@ 2022-10-15  0:06   ` Noah Goldstein
  2022-10-15  0:06   ` [PATCH v8 3/6] x86: Update memmove " Noah Goldstein
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  0:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Replace %VEC(n) -> %VMM(n)
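
For reference, with "x86-evex256-vecs.h" included the rename is purely
textual; per the VMM / VMM_256 / VMM_hi_ymm definitions in the new
headers it resolves as, e.g.:
```
	/* %VMM(0) -> %VMM_256(0) -> %VMM_hi_ymm(0) -> %ymm16 */
	vpcmpb	$0, (%rdi), %VMM(0), %k0
	kmovd	%k0, %ecx
```
so the emitted instructions stay the same; only the source-level
spelling changes.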

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/memrchr-evex.S | 42 ++++++++++++-------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index ea3a0a0a60..550b328c5a 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -21,7 +21,7 @@
 #if ISA_SHOULD_BUILD (4)
 
 # include <sysdep.h>
-# include "evex256-vecs.h"
+# include "x86-evex256-vecs.h"
 # if VEC_SIZE != 32
 #  error "VEC_SIZE != 32 unimplemented"
 # endif
@@ -31,7 +31,7 @@
 # endif
 
 # define PAGE_SIZE			4096
-# define VECMATCH			VEC(0)
+# define VMMMATCH			VMM(0)
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN(MEMRCHR, 6)
@@ -47,7 +47,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 	   correct page cross check and 2) it correctly sets up end ptr to be
 	   subtract by lzcnt aligned.  */
 	leaq	-1(%rdi, %rdx), %rax
-	vpbroadcastb %esi, %VECMATCH
+	vpbroadcastb %esi, %VMMMATCH
 
 	/* Check if we can load 1x VEC without cross a page.  */
 	testl	$(PAGE_SIZE - VEC_SIZE), %eax
@@ -55,7 +55,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 
 	/* Don't use rax for pointer here because EVEX has better encoding with
 	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
@@ -96,7 +96,7 @@ L(more_1x_vec):
 	movq	%rax, %rdx
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	subq	%rdi, %rdx
@@ -115,7 +115,7 @@ L(last_2x_vec):
 
 	/* Don't use rax for pointer here because EVEX has better encoding with
 	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
 	lzcntq	%rcx, %rcx
@@ -131,7 +131,7 @@ L(last_2x_vec):
 L(page_cross):
 	movq	%rax, %rsi
 	andq	$-VEC_SIZE, %rsi
-	vpcmpb	$0, (%rsi), %VECMATCH, %k0
+	vpcmpb	$0, (%rsi), %VMMMATCH, %k0
 	kmovd	%k0, %r8d
 	/* Shift out negative alignment (because we are starting from endptr and
 	   working backwards).  */
@@ -165,13 +165,13 @@ L(more_2x_vec):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x0_dec)
 
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1)
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	subq	$(VEC_SIZE * 4), %rdx
@@ -185,7 +185,7 @@ L(last_vec):
 
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	lzcntl	%ecx, %ecx
 	subq	$(VEC_SIZE * 3 + 1), %rax
@@ -220,7 +220,7 @@ L(more_4x_vec):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x2)
 
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	testl	%ecx, %ecx
@@ -243,17 +243,17 @@ L(more_4x_vec):
 L(loop_4x_vec):
 	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
 	   on).  */
-	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
 
 	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
-	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
-	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
-	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+	vpxorq	(VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
+	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
 
 	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
 	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
-	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
-	vptestnmb %VEC(3), %VEC(3), %k2
+	vpminub	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	vptestnmb %VMM(3), %VMM(3), %k2
 
 	/* Any 1s and we found CHAR.  */
 	kortestd %k2, %k4
@@ -270,7 +270,7 @@ L(loop_4x_vec):
 L(last_4x_vec):
 
 	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	cmpl	$(VEC_SIZE * 2), %edx
@@ -280,14 +280,14 @@ L(last_4x_vec):
 	jnz	L(ret_vec_x0_dec)
 
 
-	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1)
 
 	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	cmpl	$(VEC_SIZE * 3), %edx
@@ -309,7 +309,7 @@ L(loop_end):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x0_end)
 
-	vptestnmb %VEC(2), %VEC(2), %k0
+	vptestnmb %VMM(2), %VMM(2), %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1_end)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v8 3/6] x86: Update memmove to use new VEC macros
  2022-10-15  0:06 ` [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls Noah Goldstein
  2022-10-15  0:06   ` [PATCH v8 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
@ 2022-10-15  0:06   ` Noah Goldstein
  2022-10-15  0:06   ` [PATCH v8 4/6] x86: Update memset " Noah Goldstein
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  0:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Replace %VEC(n) -> %VMM(n)
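
A sketch of how one copy now reads across the configs (assuming the
VMOVU / VMM definitions from the per-config vec headers):
```
	VMOVU	(%rsi), %VMM(0)
	/* sse2:    movups    (%rsi), %xmm0
	   avx:     vmovdqu   (%rsi), %ymm0
	   evex:    vmovdqu64 (%rsi), %ymm16
	   evex512: vmovdqu64 (%rsi), %zmm16  */
```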

This commit does not change libc.so

Tested build on x86-64
---
 .../memmove-avx-unaligned-erms-rtm.S          |  15 +-
 .../multiarch/memmove-avx-unaligned-erms.S    |   9 +-
 .../multiarch/memmove-avx512-unaligned-erms.S |  30 +-
 .../multiarch/memmove-evex-unaligned-erms.S   |  30 +-
 .../multiarch/memmove-sse2-unaligned-erms.S   |  11 +-
 .../multiarch/memmove-vec-unaligned-erms.S    | 262 +++++++++---------
 6 files changed, 135 insertions(+), 222 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 67a55f0c85..c2a95dc247 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -1,16 +1,9 @@
-#if IS_IN (libc)
-# define VEC_SIZE	32
-# define VEC(i)		ymm##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
-# define MOV_SIZE	4
-# define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+#include <isa-level.h>
 
-# define VZEROUPPER_RETURN jmp	 L(return)
+#if ISA_SHOULD_BUILD (3)
+
+# include "x86-avx-rtm-vecs.h"
 
-# define SECTION(p)		p##.avx.rtm
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm
 
 # include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index a14b155667..4e4b4635f9 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -2,14 +2,7 @@
 
 #if ISA_SHOULD_BUILD (3)
 
-# define VEC_SIZE	32
-# define VEC(i)		ymm##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
-# define MOV_SIZE	4
-
-# define SECTION(p)		p##.avx
+# include "x86-avx-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 8d1568a7ba..cca97e38f8 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -2,35 +2,7 @@
 
 #if ISA_SHOULD_BUILD (4)
 
-# define VEC_SIZE	64
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define VEC0		zmm16
-# define VEC1		zmm17
-# define VEC2		zmm18
-# define VEC3		zmm19
-# define VEC4		zmm20
-# define VEC5		zmm21
-# define VEC6		zmm22
-# define VEC7		zmm23
-# define VEC8		zmm24
-# define VEC9		zmm25
-# define VEC10		zmm26
-# define VEC11		zmm27
-# define VEC12		zmm28
-# define VEC13		zmm29
-# define VEC14		zmm30
-# define VEC15		zmm31
-# define VEC(i)		VEC##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE	6
-
-# define SECTION(p)		p##.evex512
+# include "x86-evex512-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
index 2373017358..1f7b5715f7 100644
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -2,35 +2,7 @@
 
 #if ISA_SHOULD_BUILD (4)
 
-# define VEC_SIZE	32
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define VEC0		ymm16
-# define VEC1		ymm17
-# define VEC2		ymm18
-# define VEC3		ymm19
-# define VEC4		ymm20
-# define VEC5		ymm21
-# define VEC6		ymm22
-# define VEC7		ymm23
-# define VEC8		ymm24
-# define VEC9		ymm25
-# define VEC10		ymm26
-# define VEC11		ymm27
-# define VEC12		ymm28
-# define VEC13		ymm29
-# define VEC14		ymm30
-# define VEC15		ymm31
-# define VEC(i)		VEC##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE	6
-
-# define SECTION(p)		p##.evex
+# include "x86-evex256-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
index 422a079902..8431bcd000 100644
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -22,18 +22,9 @@
    so we need this to build for ISA V2 builds. */
 #if ISA_SHOULD_BUILD (2)
 
-# include <sysdep.h>
+# include "x86-sse2-vecs.h"
 
-# define VEC_SIZE	16
-# define VEC(i)		xmm##i
 # define PREFETCHNT	prefetchnta
-# define VMOVNT		movntdq
-/* Use movups and movaps for smaller code sizes.  */
-# define VMOVU		movups
-# define VMOVA		movaps
-# define MOV_SIZE	3
-
-# define SECTION(p)		p
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 04747133b7..5b758cae5e 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -60,14 +60,6 @@
 # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
 #endif
 
-#ifndef XMM0
-# define XMM0				xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0				ymm0
-#endif
-
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER vzeroupper
@@ -225,13 +217,13 @@ L(start):
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 	/* Load regardless.  */
-	VMOVU	(%rsi), %VEC(0)
+	VMOVU	(%rsi), %VMM(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VMM(1)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi,%rdx)
 #if !(defined USE_MULTIARCH && IS_IN (libc))
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
@@ -270,15 +262,15 @@ L(start_erms):
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 	/* Load regardless.  */
-	VMOVU	(%rsi), %VEC(0)
+	VMOVU	(%rsi), %VMM(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
 	 */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
-L(return):
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rdx)
+L(return_vzeroupper):
 # if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 # else
@@ -359,10 +351,10 @@ L(between_16_31):
 	.p2align 4,, 10
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
-	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi, %rdx), %YMM1
-	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi, %rdx)
+	VMOVU	(%rsi), %VMM_256(0)
+	VMOVU	-32(%rsi, %rdx), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -32(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
@@ -380,12 +372,12 @@ L(last_4x_vec):
 	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
 
 	/* VEC(0) and VEC(1) have already been loaded.  */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VMM(3), -(VEC_SIZE * 2)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4
@@ -400,24 +392,24 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
 	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
 	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(7)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VMM(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4,, 4
@@ -466,14 +458,14 @@ L(more_8x_vec_forward):
 	 */
 
 	/* First vec was already loaded into VEC(0).  */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(5)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
 	/* Save begining of dst.  */
 	movq	%rdi, %rcx
 	/* Align dst to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(7)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(8)
 
 	/* Subtract dst from src. Add back after dst aligned.  */
 	subq	%rcx, %rsi
@@ -488,25 +480,25 @@ L(more_8x_vec_forward):
 	.p2align 4,, 11
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
-	VMOVU	(%rsi), %VEC(1)
-	VMOVU	VEC_SIZE(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
+	VMOVU	(%rsi), %VMM(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
 	subq	$-(VEC_SIZE * 4), %rsi
-	VMOVA	%VEC(1), (%rdi)
-	VMOVA	%VEC(2), VEC_SIZE(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(1), (%rdi)
+	VMOVA	%VMM(2), VEC_SIZE(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(4), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
-	VMOVU	%VEC(7), VEC_SIZE(%rdx)
-	VMOVU	%VEC(8), (%rdx)
+	VMOVU	%VMM(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VMM(7), VEC_SIZE(%rdx)
+	VMOVU	%VMM(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(0), (%rcx)
+	VMOVU	%VMM(0), (%rcx)
 	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
 	 */
 L(nop_backward):
@@ -523,12 +515,12 @@ L(more_8x_vec_backward):
 	   addresses.  */
 
 	/* First vec was also loaded into VEC(0).  */
-	VMOVU	VEC_SIZE(%rsi), %VEC(5)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	VMOVU	VEC_SIZE(%rsi), %VMM(5)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(6)
 	/* Begining of region for 4x backward copy stored in rcx.  */
 	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(8)
 	/* Subtract dst from src. Add back after dst aligned.  */
 	subq	%rdi, %rsi
 	/* Align dst.  */
@@ -540,25 +532,25 @@ L(more_8x_vec_backward):
 	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
-	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VMM(3)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VMM(4)
 	addq	$(VEC_SIZE * -4), %rsi
-	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
-	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
-	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
+	VMOVA	%VMM(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VMM(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA	%VMM(4), (VEC_SIZE * 0)(%rcx)
 	addq	$(VEC_SIZE * -4), %rcx
 	cmpq	%rcx, %rdi
 	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(5), VEC_SIZE(%rdi)
+	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
+	VMOVU	%VMM(8), -VEC_SIZE(%rdx, %rdi)
 	VZEROUPPER_RETURN
 
 #if defined USE_MULTIARCH && IS_IN (libc)
@@ -568,7 +560,7 @@ L(loop_4x_vec_backward):
 # if ALIGN_MOVSB
 L(skip_short_movsb_check):
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  endif
 #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 #   error Unsupported MOVSB_ALIGN_TO
@@ -597,9 +589,9 @@ L(skip_short_movsb_check):
 
 	rep	movsb
 
-	VMOVU	%VEC(0), (%r8)
+	VMOVU	%VMM(0), (%r8)
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	%VEC(1), VEC_SIZE(%r8)
+	VMOVU	%VMM(1), VEC_SIZE(%r8)
 #  endif
 	VZEROUPPER_RETURN
 # endif
@@ -640,7 +632,7 @@ L(movsb):
 # endif
 # if ALIGN_MOVSB
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  endif
 #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 #   error Unsupported MOVSB_ALIGN_TO
@@ -664,9 +656,9 @@ L(movsb_align_dst):
 	rep	movsb
 
 	/* Store VECs loaded for aligning.  */
-	VMOVU	%VEC(0), (%r8)
+	VMOVU	%VMM(0), (%r8)
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	%VEC(1), VEC_SIZE(%r8)
+	VMOVU	%VMM(1), VEC_SIZE(%r8)
 #  endif
 	VZEROUPPER_RETURN
 # else	/* !ALIGN_MOVSB.  */
@@ -701,18 +693,18 @@ L(large_memcpy_2x):
 
 	/* First vec was also loaded into VEC(0).  */
 # if VEC_SIZE < 64
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  if VEC_SIZE < 32
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 #  endif
 # endif
-	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VMM(0), (%rdi)
 # if VEC_SIZE < 64
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
 #  if VEC_SIZE < 32
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
 #  endif
 # endif
 
@@ -761,12 +753,12 @@ L(loop_large_memcpy_2x_inner):
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
 	/* Load vectors from rsi.  */
-	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 	subq	$-LARGE_LOAD_SIZE, %rsi
 	/* Non-temporal store vectors to rdi.  */
-	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 	subq	$-LARGE_LOAD_SIZE, %rdi
 	decl	%ecx
 	jnz	L(loop_large_memcpy_2x_inner)
@@ -785,31 +777,31 @@ L(loop_large_memcpy_2x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VMM(0)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 	subq	$-(VEC_SIZE * 4), %rsi
 	addl	$-(VEC_SIZE * 4), %edx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(0), (%rdi)
+	VMOVA	%VMM(1), VEC_SIZE(%rdi)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpl	$(VEC_SIZE * 4), %edx
 	ja	L(loop_large_memcpy_2x_tail)
 
 L(large_memcpy_2x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
-
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
-	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)
+
+	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4
@@ -831,16 +823,16 @@ L(loop_large_memcpy_4x_inner):
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
 	/* Load vectors from rsi.  */
-	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
 	subq	$-LARGE_LOAD_SIZE, %rsi
 	/* Non-temporal store vectors to rdi.  */
-	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
-	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
-	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
 	subq	$-LARGE_LOAD_SIZE, %rdi
 	decl	%ecx
 	jnz	L(loop_large_memcpy_4x_inner)
@@ -858,31 +850,31 @@ L(loop_large_memcpy_4x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VMM(0)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 	subq	$-(VEC_SIZE * 4), %rsi
 	addl	$-(VEC_SIZE * 4), %edx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(0), (%rdi)
+	VMOVA	%VMM(1), VEC_SIZE(%rdi)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpl	$(VEC_SIZE * 4), %edx
 	ja	L(loop_large_memcpy_4x_tail)
 
 L(large_memcpy_4x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
-
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
-	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)
+
+	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v8 4/6] x86: Update memset to use new VEC macros
  2022-10-15  0:06 ` [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls Noah Goldstein
  2022-10-15  0:06   ` [PATCH v8 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
  2022-10-15  0:06   ` [PATCH v8 3/6] x86: Update memmove " Noah Goldstein
@ 2022-10-15  0:06   ` Noah Goldstein
  2022-10-15  0:06   ` [PATCH v8 5/6] x86: Remove now unused vec header macros Noah Goldstein
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  0:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Replace %VEC(n) -> %VMM(n)
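
The fixed-width accessors take over from the old ad-hoc XMM0/YMM0
defines in the sub-VEC_SIZE paths, e.g. (sketch, per the hunks below):
```
	MOVQ	%VMM_128(0), %SET_REG64		/* was: MOVQ %XMM0, ...  */
	VMOVU	%VMM_256(0), (%LESS_VEC_REG)	/* was: VMOVU %YMM0, ... */
```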

This commit does not change libc.so

Tested build on x86-64
---
 .../memset-avx2-unaligned-erms-rtm.S          |  8 +--
 .../multiarch/memset-avx2-unaligned-erms.S    | 14 +---
 .../multiarch/memset-avx512-unaligned-erms.S  | 20 +-----
 .../multiarch/memset-evex-unaligned-erms.S    | 20 +-----
 .../multiarch/memset-sse2-unaligned-erms.S    | 10 +--
 .../multiarch/memset-vec-unaligned-erms.S     | 70 ++++++++-----------
 6 files changed, 43 insertions(+), 99 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
index 8ac3e479bb..bc8605faf3 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -1,10 +1,6 @@
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+#include "x86-avx-rtm-vecs.h"
 
-#define VZEROUPPER_RETURN jmp	 L(return)
-
-#define SECTION(p) p##.avx.rtm
 #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 
-#include "memset-avx2-unaligned-erms.S"
+# include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index a9054a9122..47cf5072a4 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -4,14 +4,9 @@
 
 # define USE_WITH_AVX2	1
 
-# define VEC_SIZE	32
-# define MOV_SIZE	4
-# define RET_SIZE	4
-
-# define VEC(i)		ymm##i
-
-# define VMOVU     vmovdqu
-# define VMOVA     vmovdqa
+# ifndef VEC_SIZE
+#  include "x86-avx-vecs.h"
+# endif
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
@@ -26,9 +21,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
 # define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
-# ifndef SECTION
-#  define SECTION(p)		p##.avx
-# endif
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 47623b8ee8..84145b6c27 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -4,26 +4,14 @@
 
 # define USE_WITH_AVX512	1
 
-# define VEC_SIZE	64
-# define MOV_SIZE	6
-# define RET_SIZE	1
-
-# define XMM0		xmm16
-# define YMM0		ymm16
-# define VEC0		zmm16
-# define VEC(i)		VEC##i
-
-# define VMOVU     vmovdqu64
-# define VMOVA     vmovdqa64
-
-# define VZEROUPPER
+# include "x86-evex512-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastb d, %VEC0; \
+  vpbroadcastb d, %VMM(0); \
   movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastd d, %VEC0; \
+  vpbroadcastd d, %VMM(0); \
   movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -32,8 +20,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p##.evex512
-
 #ifndef MEMSET_SYMBOL
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index ac4b2d2d50..1f03b26bf8 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -4,26 +4,14 @@
 
 # define USE_WITH_EVEX	1
 
-# define VEC_SIZE	32
-# define MOV_SIZE	6
-# define RET_SIZE	1
-
-# define XMM0		xmm16
-# define YMM0		ymm16
-# define VEC0		ymm16
-# define VEC(i)		VEC##i
-
-# define VMOVU     vmovdqu64
-# define VMOVA     vmovdqa64
-
-# define VZEROUPPER
+# include "x86-evex256-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastb d, %VEC0; \
+  vpbroadcastb d, %VMM(0); \
   movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastd d, %VEC0; \
+  vpbroadcastd d, %VMM(0); \
   movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -32,8 +20,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p##.evex
-
 #ifndef MEMSET_SYMBOL
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 44f9b8888b..34b245d8ca 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -26,13 +26,7 @@
 # include <sysdep.h>
 # define USE_WITH_SSE2	1
 
-# define VEC_SIZE	16
-# define MOV_SIZE	3
-# define RET_SIZE	1
-
-# define VEC(i)		xmm##i
-# define VMOVU     movups
-# define VMOVA     movaps
+# include "x86-sse2-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
@@ -52,8 +46,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p
-
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_sse2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 905d0fa464..03de0ab907 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,14 +34,6 @@
 # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
 #endif
 
-#ifndef XMM0
-# define XMM0				xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0				ymm0
-#endif
-
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
@@ -150,8 +142,8 @@ L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VMM(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VMM(0), (%rdi)
 	VZEROUPPER_RETURN
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
@@ -175,19 +167,19 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
-	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -2)(%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi)
 #endif
 	VZEROUPPER_RETURN
 
@@ -221,7 +213,7 @@ L(less_vec_from_wmemset):
 	bzhil	%edx, %ecx, %ecx
 	kmovd	%ecx, %k1
 # endif
-	vmovdqu8 %VEC(0), (%rax){%k1}
+	vmovdqu8 %VMM(0), (%rax){%k1}
 	VZEROUPPER_RETURN
 
 # if defined USE_MULTIARCH && IS_IN (libc)
@@ -249,8 +241,8 @@ L(stosb_more_2x_vec):
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
 	/* Store next 2x vec regardless.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * 1)(%rdi)
 
 
 	/* Two different methods of setting up pointers / compare. The two
@@ -278,8 +270,8 @@ L(more_2x_vec):
 #endif
 
 	/* Store next 2x vec regardless.  */
-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rax)
+	VMOVU	%VMM(0), (VEC_SIZE * 3)(%rax)
 
 
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
@@ -304,20 +296,20 @@ L(more_2x_vec):
 	andq	$(VEC_SIZE * -2), %LOOP_REG
 	.p2align 4
 L(loop):
-	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
 	subq	$-(VEC_SIZE * 4), %LOOP_REG
 	cmpq	%END_REG, %LOOP_REG
 	jb	L(loop)
 	.p2align 4,, MOV_SIZE
 L(last_4x_vec):
-	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
-L(return):
+	VMOVU	%VMM(0), LOOP_4X_OFFSET(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return_vzeroupper):
 #if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
@@ -355,7 +347,7 @@ L(cross_page):
 	jge	L(between_16_31)
 #endif
 #ifndef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, %SET_REG64
+	MOVQ	%VMM_128(0), %SET_REG64
 #endif
 	cmpl	$8, %edx
 	jge	L(between_8_15)
@@ -374,8 +366,8 @@ L(between_0_0):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%LESS_VEC_REG)
-	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
+	VMOVU	%VMM_256(0), (%LESS_VEC_REG)
+	VMOVU	%VMM_256(0), -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
@@ -383,8 +375,8 @@ L(between_32_63):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%LESS_VEC_REG)
-	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	VMOVU	%VMM_128(0), (%LESS_VEC_REG)
+	VMOVU	%VMM_128(0), -16(%LESS_VEC_REG, %rdx)
 	ret
 #endif
 
@@ -394,8 +386,8 @@ L(between_16_31):
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, (%rdi)
-	MOVQ	%XMM0, -8(%rdi, %rdx)
+	MOVQ	%VMM_128(0), (%rdi)
+	MOVQ	%VMM_128(0), -8(%rdi, %rdx)
 #else
 	movq	%SET_REG64, (%LESS_VEC_REG)
 	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
@@ -408,8 +400,8 @@ L(between_8_15):
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVD	%XMM0, (%rdi)
-	MOVD	%XMM0, -4(%rdi, %rdx)
+	MOVD	%VMM_128(0), (%rdi)
+	MOVD	%VMM_128(0), -4(%rdi, %rdx)
 #else
 	movl	%SET_REG32, (%LESS_VEC_REG)
 	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v8 5/6] x86: Remove now unused vec header macros.
  2022-10-15  0:06 ` [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-10-15  0:06   ` [PATCH v8 4/6] x86: Update memset " Noah Goldstein
@ 2022-10-15  0:06   ` Noah Goldstein
  2022-10-15  0:06   ` [PATCH v8 6/6] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
  2022-10-15  0:12   ` [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls H.J. Lu
  5 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  0:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/avx-rtm-vecs.h     | 35 --------
 sysdeps/x86_64/multiarch/avx-vecs.h         | 47 -----------
 sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 ---------
 sysdeps/x86_64/multiarch/evex256-vecs.h     | 35 --------
 sysdeps/x86_64/multiarch/evex512-vecs.h     | 35 --------
 sysdeps/x86_64/multiarch/sse2-vecs.h        | 47 -----------
 sysdeps/x86_64/multiarch/vec-macros.h       | 90 ---------------------
 7 files changed, 328 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h
 delete mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/vec-macros.h

diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
deleted file mode 100644
index 6ca9f5e6ba..0000000000
--- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Common config for AVX-RTM VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _AVX_RTM_VECS_H
-#define _AVX_RTM_VECS_H			1
-
-#define COND_VZEROUPPER			COND_VZEROUPPER_XTEST
-#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
-	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN		jmp L(return_vzeroupper)
-
-#define USE_WITH_RTM			1
-#include "avx-vecs.h"
-
-#undef SECTION
-#define SECTION(p)				p##.avx.rtm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h
deleted file mode 100644
index 89680f5db8..0000000000
--- a/sysdeps/x86_64/multiarch/avx-vecs.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Common config for AVX VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _AVX_VECS_H
-#define _AVX_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			32
-#include "vec-macros.h"
-
-#define USE_WITH_AVX		1
-#define SECTION(p)			p##.avx
-
-/* 4-byte mov instructions with AVX2.  */
-#define MOV_SIZE			4
-/* 1 (ret) + 3 (vzeroupper).  */
-#define RET_SIZE			4
-#define VZEROUPPER			vzeroupper
-
-#define VMOVU				vmovdqu
-#define VMOVA				vmovdqa
-#define VMOVNT				vmovntdq
-
-/* Often need to access xmm portion.  */
-#define VEC_xmm				VEC_any_xmm
-#define VEC					VEC_any_ymm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h
deleted file mode 100644
index 99806ebcd7..0000000000
--- a/sysdeps/x86_64/multiarch/evex-vecs-common.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Common config for EVEX256 and EVEX512 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _EVEX_VECS_COMMON_H
-#define _EVEX_VECS_COMMON_H			1
-
-#include "vec-macros.h"
-
-/* 6-byte mov instructions with EVEX.  */
-#define MOV_SIZE			6
-/* No vzeroupper needed.  */
-#define RET_SIZE			1
-#define VZEROUPPER
-
-#define VMOVU				vmovdqu64
-#define VMOVA				vmovdqa64
-#define VMOVNT				vmovntdq
-
-#define VEC_xmm				VEC_hi_xmm
-#define VEC_ymm				VEC_hi_ymm
-#define VEC_zmm				VEC_hi_zmm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
deleted file mode 100644
index 222ba46dc7..0000000000
--- a/sysdeps/x86_64/multiarch/evex256-vecs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Common config for EVEX256 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _EVEX256_VECS_H
-#define _EVEX256_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			32
-#include "evex-vecs-common.h"
-
-#define USE_WITH_EVEX256	1
-#define SECTION(p)			p##.evex
-
-#define VEC					VEC_ymm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
deleted file mode 100644
index d1784d5368..0000000000
--- a/sysdeps/x86_64/multiarch/evex512-vecs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Common config for EVEX512 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _EVEX512_VECS_H
-#define _EVEX512_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			64
-#include "evex-vecs-common.h"
-
-#define USE_WITH_EVEX512	1
-#define SECTION(p)			p##.evex512
-
-#define VEC					VEC_zmm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h
deleted file mode 100644
index 2b77a59d56..0000000000
--- a/sysdeps/x86_64/multiarch/sse2-vecs.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Common config for SSE2 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _SSE2_VECS_H
-#define _SSE2_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			16
-#include "vec-macros.h"
-
-#define USE_WITH_SSE2		1
-#define SECTION(p)			p
-
-/* 3-byte mov instructions with SSE2.  */
-#define MOV_SIZE			3
-/* No vzeroupper needed.  */
-#define RET_SIZE			1
-#define VZEROUPPER
-
-#define VMOVU				movups
-#define VMOVA				movaps
-#define VMOVNT				movntdq
-
-#define VEC_xmm				VEC_any_xmm
-#define VEC					VEC_any_xmm
-
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h
deleted file mode 100644
index 9f3ffecede..0000000000
--- a/sysdeps/x86_64/multiarch/vec-macros.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Macro helpers for VEC_{type}({vec_num})
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _VEC_MACROS_H
-#define _VEC_MACROS_H			1
-
-#ifndef VEC_SIZE
-# error "Never include this file directly. Always include a vector config."
-#endif
-
-/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
-   VEC(N) values.  */
-#define VEC_hi_xmm0				xmm16
-#define VEC_hi_xmm1				xmm17
-#define VEC_hi_xmm2				xmm18
-#define VEC_hi_xmm3				xmm19
-#define VEC_hi_xmm4				xmm20
-#define VEC_hi_xmm5				xmm21
-#define VEC_hi_xmm6				xmm22
-#define VEC_hi_xmm7				xmm23
-#define VEC_hi_xmm8				xmm24
-#define VEC_hi_xmm9				xmm25
-#define VEC_hi_xmm10			xmm26
-#define VEC_hi_xmm11			xmm27
-#define VEC_hi_xmm12			xmm28
-#define VEC_hi_xmm13			xmm29
-#define VEC_hi_xmm14			xmm30
-#define VEC_hi_xmm15			xmm31
-
-#define VEC_hi_ymm0				ymm16
-#define VEC_hi_ymm1				ymm17
-#define VEC_hi_ymm2				ymm18
-#define VEC_hi_ymm3				ymm19
-#define VEC_hi_ymm4				ymm20
-#define VEC_hi_ymm5				ymm21
-#define VEC_hi_ymm6				ymm22
-#define VEC_hi_ymm7				ymm23
-#define VEC_hi_ymm8				ymm24
-#define VEC_hi_ymm9				ymm25
-#define VEC_hi_ymm10			ymm26
-#define VEC_hi_ymm11			ymm27
-#define VEC_hi_ymm12			ymm28
-#define VEC_hi_ymm13			ymm29
-#define VEC_hi_ymm14			ymm30
-#define VEC_hi_ymm15			ymm31
-
-#define VEC_hi_zmm0				zmm16
-#define VEC_hi_zmm1				zmm17
-#define VEC_hi_zmm2				zmm18
-#define VEC_hi_zmm3				zmm19
-#define VEC_hi_zmm4				zmm20
-#define VEC_hi_zmm5				zmm21
-#define VEC_hi_zmm6				zmm22
-#define VEC_hi_zmm7				zmm23
-#define VEC_hi_zmm8				zmm24
-#define VEC_hi_zmm9				zmm25
-#define VEC_hi_zmm10			zmm26
-#define VEC_hi_zmm11			zmm27
-#define VEC_hi_zmm12			zmm28
-#define VEC_hi_zmm13			zmm29
-#define VEC_hi_zmm14			zmm30
-#define VEC_hi_zmm15			zmm31
-
-#define PRIMITIVE_VEC(vec, num)		vec##num
-
-#define VEC_any_xmm(i)			PRIMITIVE_VEC(xmm, i)
-#define VEC_any_ymm(i)			PRIMITIVE_VEC(ymm, i)
-#define VEC_any_zmm(i)			PRIMITIVE_VEC(zmm, i)
-
-#define VEC_hi_xmm(i)			PRIMITIVE_VEC(VEC_hi_xmm, i)
-#define VEC_hi_ymm(i)			PRIMITIVE_VEC(VEC_hi_ymm, i)
-#define VEC_hi_zmm(i)			PRIMITIVE_VEC(VEC_hi_zmm, i)
-
-#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v8 6/6] x86: Update strlen-evex-base to use new reg/vec macros.
  2022-10-15  0:06 ` [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-10-15  0:06   ` [PATCH v8 5/6] x86: Remove now unused vec header macros Noah Goldstein
@ 2022-10-15  0:06   ` Noah Goldstein
  2022-10-15  0:12   ` [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls H.J. Lu
  5 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  0:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

To avoid duplicating the VMM / GPR / mask insn macros in all incoming
evex512 files, use the macros defined in 'reg-macros.h' and
'{vec}-macros.h'.
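
As a rough sketch of the resulting usage (assumed names, not an excerpt
from the diff below; VPCMP and the L(...) label are provided by the
including implementation file):

```
#include "x86-evex512-vecs.h"
#include "reg-macros.h"

	/* With VEC_SIZE == 64: VMM(0) is a zmm register, KMOV expands to
	   kmovq and VRAX to rax (32-bit forms under USE_WIDE_CHAR).  */
	VPCMP	$0, (%rdi), %VMM(0), %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jz	L(no_match_in_vec)
```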

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 116 +++++++-------------
 sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
 2 files changed, 44 insertions(+), 76 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 418e9f8411..c832b15a48 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -36,42 +36,10 @@
 #  define CHAR_SIZE	1
 # endif
 
-# define XMM0		xmm16
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# if VEC_SIZE == 64
-#  define KMOV		kmovq
-#  define KORTEST	kortestq
-#  define RAX		rax
-#  define RCX		rcx
-#  define RDX		rdx
-#  define SHR		shrq
-#  define TEXTSUFFIX	evex512
-#  define VMM0		zmm16
-#  define VMM1		zmm17
-#  define VMM2		zmm18
-#  define VMM3		zmm19
-#  define VMM4		zmm20
-#  define VMOVA		vmovdqa64
-# elif VEC_SIZE == 32
-/* Currently Unused.  */
-#  define KMOV		kmovd
-#  define KORTEST	kortestd
-#  define RAX		eax
-#  define RCX		ecx
-#  define RDX		edx
-#  define SHR		shrl
-#  define TEXTSUFFIX	evex256
-#  define VMM0		ymm16
-#  define VMM1		ymm17
-#  define VMM2		ymm18
-#  define VMM3		ymm19
-#  define VMM4		ymm20
-#  define VMOVA		vmovdqa32
-# endif
-
-	.section .text.TEXTSUFFIX, "ax", @progbits
+	.section SECTION(.text),"ax",@progbits
 /* Aligning entry point to 64 byte, provides better performance for
    one vector length string.  */
 ENTRY_P2ALIGN (STRLEN, 6)
@@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
 # endif
 
 	movl	%edi, %eax
-	vpxorq	%XMM0, %XMM0, %XMM0
+	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(page_cross)
 
 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM0, %k0
-	KMOV	%k0, %RAX
-	test	%RAX, %RAX
+	VPCMP	$0, (%rdi), %VMM(0), %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
@@ -120,7 +88,7 @@ L(align_more):
 	movq	%rax, %rdx
 	subq	%rdi, %rdx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RDX
+	shr	$2, %VRDX
 #  endif
 	/* At this point rdx contains [w]chars already compared.  */
 	subq	%rsi, %rdx
@@ -131,9 +99,9 @@ L(align_more):
 # endif
 
 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
 # ifdef USE_AS_STRNLEN
@@ -141,9 +109,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, VEC_SIZE(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
 # ifdef USE_AS_STRNLEN
@@ -151,9 +119,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 
 # ifdef USE_AS_STRNLEN
@@ -161,9 +129,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x4)
 
 # ifdef USE_AS_STRNLEN
@@ -179,7 +147,7 @@ L(align_more):
 # ifdef USE_AS_STRNLEN
 	subq	%rax, %rcx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RCX
+	shr	$2, %VRCX
 #  endif
 	/* rcx contains number of [w]char will be recompared due to
 	   alignment fixes.  rdx must be incremented by rcx to offset
@@ -199,42 +167,42 @@ L(loop_entry):
 # endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
 
-	VPTESTN	%VMM2, %VMM2, %k0
-	VPTESTN	%VMM4, %VMM4, %k1
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k1
 
 	subq	$-(VEC_SIZE * 4), %rax
 	KORTEST	%k0, %k1
 	jz	L(loop)
 
-	VPTESTN	%VMM1, %VMM1, %k2
-	KMOV	%k2, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VMM(1), %VMM(1), %k2
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
-	KMOV	%k0, %RCX
+	KMOV	%k0, %VRCX
 	/* At this point, if k0 is non zero, null char must be in the
 	   second vector.  */
-	test	%RCX, %RCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
-	VPTESTN	%VMM3, %VMM3, %k3
-	KMOV	%k3, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VMM(3), %VMM(3), %k3
+	KMOV	%k3, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 	/* At this point null [w]char must be in the fourth vector so no
 	   need to check.  */
-	KMOV	%k1, %RCX
+	KMOV	%k1, %VRCX
 
 	/* Fourth, third, second vector terminating are pretty much
 	   same, implemented this way to avoid branching and reuse code
 	   from pre loop exit condition.  */
 L(ret_vec_x4):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 3), %rax
@@ -250,7 +218,7 @@ L(ret_vec_x4):
 	ret
 
 L(ret_vec_x3):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 2), %rax
@@ -268,7 +236,7 @@ L(ret_vec_x3):
 L(ret_vec_x2):
 	subq	$-VEC_SIZE, %rax
 L(ret_vec_x1):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
@@ -289,13 +257,13 @@ L(page_cross):
 	/* ecx contains number of w[char] to be skipped as a result
 	   of address alignment.  */
 	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
-	KMOV	%k0, %RAX
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRAX
 	/* Ignore number of character for alignment adjustment.  */
-	SHR	%cl, %RAX
+	shr	%cl, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
index 116f8981c8..10c3415c8a 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -2,6 +2,6 @@
 # define STRLEN		__strlen_evex512
 #endif
 
-#define VEC_SIZE	64
-
+#include "x86-evex512-vecs.h"
+#include "reg-macros.h"
 #include "strlen-evex-base.S"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls
  2022-10-15  0:06 ` [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls Noah Goldstein
                     ` (4 preceding siblings ...)
  2022-10-15  0:06   ` [PATCH v8 6/6] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
@ 2022-10-15  0:12   ` H.J. Lu
  2022-10-15  0:20     ` Noah Goldstein
  5 siblings, 1 reply; 72+ messages in thread
From: H.J. Lu @ 2022-10-15  0:12 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1) Copy so that backport will be easier.
> 2) Make section only define if there is not a previous definition
> 3) Add `VEC_lo` definition for proper reg-width but in the
>    ymm/zmm0-15 range.
> 4) Add macros for accessing GPRs based on VEC_SIZE
>         This is to make it easier to do think like:
>         ```
>             vpcmpb %VEC(0), %VEC(1), %k0
>             kmov{d|q} %k0, %{eax|rax}
>             test %{eax|rax}
>         ```
>         It adds macro s.t any GPR can get the proper width with:
>             `V{upper_case_GPR_name}`
>
>         and any mask insn can get the proper width with:
>             `{mask_insn_without_postfix}V`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This comment is incorrect.

>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/reg-macros.h         | 168 ++++++++++++++++++
>  .../multiarch/scripts/gen-reg-macros.py       | 125 +++++++++++++
>  sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h   |  35 ++++
>  sysdeps/x86_64/multiarch/x86-avx-vecs.h       |  47 +++++
>  .../x86_64/multiarch/x86-evex-vecs-common.h   |  39 ++++
>  sysdeps/x86_64/multiarch/x86-evex256-vecs.h   |  38 ++++
>  sysdeps/x86_64/multiarch/x86-evex512-vecs.h   |  38 ++++
>  sysdeps/x86_64/multiarch/x86-sse2-vecs.h      |  47 +++++
>  sysdeps/x86_64/multiarch/x86-vec-macros.h     |  90 ++++++++++
>  9 files changed, 627 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
>  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
>  create mode 100644 sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-avx-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-evex256-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-evex512-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-sse2-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-vec-macros.h
>
> diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> new file mode 100644
> index 0000000000..c8ea330256
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> @@ -0,0 +1,168 @@
> +/* This file was generated by: gen-reg-macros.py.
> +
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _REG_MACROS_H
> +#define _REG_MACROS_H  1
> +
> +#define rax_8  al
> +#define rax_16 ax
> +#define rax_32 eax
> +#define rax_64 rax
> +#define rbx_8  bl
> +#define rbx_16 bx
> +#define rbx_32 ebx
> +#define rbx_64 rbx
> +#define rcx_8  cl
> +#define rcx_16 cx
> +#define rcx_32 ecx
> +#define rcx_64 rcx
> +#define rdx_8  dl
> +#define rdx_16 dx
> +#define rdx_32 edx
> +#define rdx_64 rdx
> +#define rbp_8  bpl
> +#define rbp_16 bp
> +#define rbp_32 ebp
> +#define rbp_64 rbp
> +#define rsp_8  spl
> +#define rsp_16 sp
> +#define rsp_32 esp
> +#define rsp_64 rsp
> +#define rsi_8  sil
> +#define rsi_16 si
> +#define rsi_32 esi
> +#define rsi_64 rsi
> +#define rdi_8  dil
> +#define rdi_16 di
> +#define rdi_32 edi
> +#define rdi_64 rdi
> +#define r8_8   r8b
> +#define r8_16  r8w
> +#define r8_32  r8d
> +#define r8_64  r8
> +#define r9_8   r9b
> +#define r9_16  r9w
> +#define r9_32  r9d
> +#define r9_64  r9
> +#define r10_8  r10b
> +#define r10_16 r10w
> +#define r10_32 r10d
> +#define r10_64 r10
> +#define r11_8  r11b
> +#define r11_16 r11w
> +#define r11_32 r11d
> +#define r11_64 r11
> +#define r12_8  r12b
> +#define r12_16 r12w
> +#define r12_32 r12d
> +#define r12_64 r12
> +#define r13_8  r13b
> +#define r13_16 r13w
> +#define r13_32 r13d
> +#define r13_64 r13
> +#define r14_8  r14b
> +#define r14_16 r14w
> +#define r14_32 r14d
> +#define r14_64 r14
> +#define r15_8  r15b
> +#define r15_16 r15w
> +#define r15_32 r15d
> +#define r15_64 r15
> +
> +#define kmov_8 kmovb
> +#define kmov_16        kmovw
> +#define kmov_32        kmovd
> +#define kmov_64        kmovq
> +#define kortest_8      kortestb
> +#define kortest_16     kortestw
> +#define kortest_32     kortestd
> +#define kortest_64     kortestq
> +#define kor_8  korb
> +#define kor_16 korw
> +#define kor_32 kord
> +#define kor_64 korq
> +#define ktest_8        ktestb
> +#define ktest_16       ktestw
> +#define ktest_32       ktestd
> +#define ktest_64       ktestq
> +#define kand_8 kandb
> +#define kand_16        kandw
> +#define kand_32        kandd
> +#define kand_64        kandq
> +#define kxor_8 kxorb
> +#define kxor_16        kxorw
> +#define kxor_32        kxord
> +#define kxor_64        kxorq
> +#define knot_8 knotb
> +#define knot_16        knotw
> +#define knot_32        knotd
> +#define knot_64        knotq
> +#define kxnor_8        kxnorb
> +#define kxnor_16       kxnorw
> +#define kxnor_32       kxnord
> +#define kxnor_64       kxnorq
> +#define kunpack_8      kunpackbw
> +#define kunpack_16     kunpackwd
> +#define kunpack_32     kunpackdq
> +
> +/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
> +#define VRAX   VGPR(rax)
> +#define VRBX   VGPR(rbx)
> +#define VRCX   VGPR(rcx)
> +#define VRDX   VGPR(rdx)
> +#define VRBP   VGPR(rbp)
> +#define VRSP   VGPR(rsp)
> +#define VRSI   VGPR(rsi)
> +#define VRDI   VGPR(rdi)
> +#define VR8    VGPR(r8)
> +#define VR9    VGPR(r9)
> +#define VR10   VGPR(r10)
> +#define VR11   VGPR(r11)
> +#define VR12   VGPR(r12)
> +#define VR13   VGPR(r13)
> +#define VR14   VGPR(r14)
> +#define VR15   VGPR(r15)
> +
> +/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
> +#define KMOV   VKINSN(kmov)
> +#define KORTEST        VKINSN(kortest)
> +#define KOR    VKINSN(kor)
> +#define KTEST  VKINSN(ktest)
> +#define KAND   VKINSN(kand)
> +#define KXOR   VKINSN(kxor)
> +#define KNOT   VKINSN(knot)
> +#define KXNOR  VKINSN(kxnor)
> +#define KUNPACK        VKINSN(kunpack)

These aren't register macros.  Should reg-macros.h be renamed, like
vec-macros.h?

> +
> +#ifdef USE_WIDE_CHAR
> +# define REG_WIDTH 32
> +#else
> +# define REG_WIDTH VEC_SIZE
> +#endif
> +
> +#define VPASTER(x, y)  x##_##y
> +#define VEVALUATOR(x, y)       VPASTER(x, y)
> +
> +#define VGPR_SZ(reg_name, reg_size)    VEVALUATOR(reg_name, reg_size)
> +#define VKINSN_SZ(insn, reg_size)      VEVALUATOR(insn, reg_size)
> +
> +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> +#define VKINSN(mask_insn)      VKINSN_SZ(mask_insn, REG_WIDTH)
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> new file mode 100644
> index 0000000000..6a05f27ff4
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> @@ -0,0 +1,125 @@
> +#!/usr/bin/python3
> +# Copyright (C) 2022 Free Software Foundation, Inc.
> +# This file is part of the GNU C Library.
> +#
> +# The GNU C Library is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU Lesser General Public
> +# License as published by the Free Software Foundation; either
> +# version 2.1 of the License, or (at your option) any later version.
> +#
> +# The GNU C Library is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +# Lesser General Public License for more details.
> +#
> +# You should have received a copy of the GNU Lesser General Public
> +# License along with the GNU C Library; if not, see
> +# <https://www.gnu.org/licenses/>.
> +"""Generate macros for getting GPR name of a certain size
> +
> +Inputs: None
> +Output: Prints header fill to stdout
> +
> +API:
> +    VGPR(reg_name)
> +        - Get register name VEC_SIZE component of `reg_name`
> +    VGPR_SZ(reg_name, reg_size)
> +        - Get register name `reg_size` component of `reg_name`
> +"""
> +
> +import sys
> +import os
> +from datetime import datetime
> +
> +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> +
> +mask_insns = [
> +    "kmov",
> +    "kortest",
> +    "kor",
> +    "ktest",
> +    "kand",
> +    "kxor",
> +    "knot",
> +    "kxnor",
> +]
> +mask_insns_ext = ["b", "w", "d", "q"]
> +
> +cr = """
> +   Copyright (C) {} Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +"""
> +
> +print("/* This file was generated by: {}.".format(os.path.basename(
> +    sys.argv[0])))
> +print(cr.format(datetime.today().year))
> +
> +print("#ifndef _REG_MACROS_H")
> +print("#define _REG_MACROS_H\t1")
> +print("")
> +for reg in registers:
> +    for i in range(0, 4):
> +        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
> +
> +print("")
> +for mask_insn in mask_insns:
> +    for i in range(0, 4):
> +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> +                                           mask_insns_ext[i]))
> +for i in range(0, 3):
> +    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
> +                                                   mask_insns_ext[i + 1]))
> +mask_insns.append("kunpack")
> +
> +print("")
> +print(
> +    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
> +for reg in registers:
> +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> +
> +print("")
> +
> +print(
> +    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
> +)
> +for mask_insn in mask_insns:
> +    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
> +print("")
> +
> +print("#ifdef USE_WIDE_CHAR")
> +print("# define REG_WIDTH 32")
> +print("#else")
> +print("# define REG_WIDTH VEC_SIZE")
> +print("#endif")
> +print("")
> +print("#define VPASTER(x, y)\tx##_##y")
> +print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
> +print("")
> +print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
> +print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
> +print("")
> +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> +print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
> +
> +print("\n#endif")
> diff --git a/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
> new file mode 100644
> index 0000000000..0b326c8a70
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
> @@ -0,0 +1,35 @@
> +/* Common config for AVX-RTM VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_AVX_RTM_VECS_H
> +#define _X86_AVX_RTM_VECS_H                    1
> +
> +#define COND_VZEROUPPER                        COND_VZEROUPPER_XTEST
> +#define ZERO_UPPER_VEC_REGISTERS_RETURN        \
> +       ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +
> +#define VZEROUPPER_RETURN              jmp L(return_vzeroupper)
> +
> +#define USE_WITH_RTM                   1
> +#include "x86-avx-vecs.h"
> +
> +#undef SECTION
> +#define SECTION(p)                             p##.avx.rtm
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> new file mode 100644
> index 0000000000..dca1089060
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> @@ -0,0 +1,47 @@
> +/* Common config for AVX VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_AVX_VECS_H
> +#define _X86_AVX_VECS_H                        1
> +
> +#ifdef VEC_SIZE
> +# error "Multiple VEC configs included!"
> +#endif
> +
> +#define VEC_SIZE                       32
> +#include "x86-vec-macros.h"
> +
> +#define USE_WITH_AVX           1
> +#define SECTION(p)                     p##.avx
> +
> +/* 4-byte mov instructions with AVX2.  */
> +#define MOV_SIZE                       4
> +/* 1 (ret) + 3 (vzeroupper).  */
> +#define RET_SIZE                       4
> +#define VZEROUPPER                     vzeroupper
> +
> +#define VMOVU                          vmovdqu
> +#define VMOVA                          vmovdqa
> +#define VMOVNT                         vmovntdq
> +
> +/* Often need to access xmm portion.  */
> +#define VMM_128                                VMM_any_xmm
> +#define VMM                                    VMM_any_ymm
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
> new file mode 100644
> index 0000000000..f331e9d8ec
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
> @@ -0,0 +1,39 @@
> +/* Common config for EVEX256 and EVEX512 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_EVEX_VECS_COMMON_H
> +#define _X86_EVEX_VECS_COMMON_H                        1
> +
> +#include "x86-vec-macros.h"
> +
> +/* 6-byte mov instructions with EVEX.  */
> +#define MOV_SIZE                       6
> +/* No vzeroupper needed.  */
> +#define RET_SIZE                       1
> +#define VZEROUPPER
> +
> +#define VMOVU                          vmovdqu64
> +#define VMOVA                          vmovdqa64
> +#define VMOVNT                         vmovntdq
> +
> +#define VMM_128                                VMM_hi_xmm
> +#define VMM_256                                VMM_hi_ymm
> +#define VMM_512                                VMM_hi_zmm
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-evex256-vecs.h b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
> new file mode 100644
> index 0000000000..8337b95504
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
> @@ -0,0 +1,38 @@
> +/* Common config for EVEX256 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _EVEX256_VECS_H
> +#define _EVEX256_VECS_H                        1
> +
> +#ifdef VEC_SIZE
> +# error "Multiple VEC configs included!"
> +#endif
> +
> +#define VEC_SIZE                       32
> +#include "x86-evex-vecs-common.h"
> +
> +#define USE_WITH_EVEX256       1
> +
> +#ifndef SECTION
> +# define SECTION(p)                    p##.evex
> +#endif
> +
> +#define VMM                                    VMM_256
> +#define VMM_lo                         VMM_any_ymm
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-evex512-vecs.h b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
> new file mode 100644
> index 0000000000..7dc5c23ad0
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
> @@ -0,0 +1,38 @@
> +/* Common config for EVEX512 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _EVEX512_VECS_H
> +#define _EVEX512_VECS_H                        1
> +
> +#ifdef VEC_SIZE
> +# error "Multiple VEC configs included!"
> +#endif
> +
> +#define VEC_SIZE                       64
> +#include "x86-evex-vecs-common.h"
> +
> +#define USE_WITH_EVEX512       1
> +
> +#ifndef SECTION
> +# define SECTION(p)                    p##.evex512
> +#endif
> +
> +#define VMM                                    VMM_512
> +#define VMM_lo                         VMM_any_zmm
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-sse2-vecs.h b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
> new file mode 100644
> index 0000000000..b8bbd5dc29
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
> @@ -0,0 +1,47 @@
> +/* Common config for SSE2 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_SSE2_VECS_H
> +#define _X86_SSE2_VECS_H                       1
> +
> +#ifdef VEC_SIZE
> +# error "Multiple VEC configs included!"
> +#endif
> +
> +#define VEC_SIZE                       16
> +#include "x86-vec-macros.h"
> +
> +#define USE_WITH_SSE2          1
> +#define SECTION(p)                     p
> +
> +/* 3-byte mov instructions with SSE2.  */
> +#define MOV_SIZE                       3
> +/* No vzeroupper needed.  */
> +#define RET_SIZE                       1
> +#define VZEROUPPER
> +
> +#define VMOVU                          movups
> +#define VMOVA                          movaps
> +#define VMOVNT                         movntdq
> +
> +#define VMM_128                                VMM_any_xmm
> +#define VMM                                    VMM_any_xmm
> +
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-vec-macros.h b/sysdeps/x86_64/multiarch/x86-vec-macros.h
> new file mode 100644
> index 0000000000..7d6bb31d55
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-vec-macros.h
> @@ -0,0 +1,90 @@
> +/* Macro helpers for VEC_{type}({vec_num})
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_VEC_MACROS_H
> +#define _X86_VEC_MACROS_H                      1
> +
> +#ifndef VEC_SIZE
> +# error "Never include this file directly. Always include a vector config."
> +#endif
> +
> +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
> +   VMM(N) values.  */
> +#define VMM_hi_xmm0                            xmm16
> +#define VMM_hi_xmm1                            xmm17
> +#define VMM_hi_xmm2                            xmm18
> +#define VMM_hi_xmm3                            xmm19
> +#define VMM_hi_xmm4                            xmm20
> +#define VMM_hi_xmm5                            xmm21
> +#define VMM_hi_xmm6                            xmm22
> +#define VMM_hi_xmm7                            xmm23
> +#define VMM_hi_xmm8                            xmm24
> +#define VMM_hi_xmm9                            xmm25
> +#define VMM_hi_xmm10                   xmm26
> +#define VMM_hi_xmm11                   xmm27
> +#define VMM_hi_xmm12                   xmm28
> +#define VMM_hi_xmm13                   xmm29
> +#define VMM_hi_xmm14                   xmm30
> +#define VMM_hi_xmm15                   xmm31
> +
> +#define VMM_hi_ymm0                            ymm16
> +#define VMM_hi_ymm1                            ymm17
> +#define VMM_hi_ymm2                            ymm18
> +#define VMM_hi_ymm3                            ymm19
> +#define VMM_hi_ymm4                            ymm20
> +#define VMM_hi_ymm5                            ymm21
> +#define VMM_hi_ymm6                            ymm22
> +#define VMM_hi_ymm7                            ymm23
> +#define VMM_hi_ymm8                            ymm24
> +#define VMM_hi_ymm9                            ymm25
> +#define VMM_hi_ymm10                   ymm26
> +#define VMM_hi_ymm11                   ymm27
> +#define VMM_hi_ymm12                   ymm28
> +#define VMM_hi_ymm13                   ymm29
> +#define VMM_hi_ymm14                   ymm30
> +#define VMM_hi_ymm15                   ymm31
> +
> +#define VMM_hi_zmm0                            zmm16
> +#define VMM_hi_zmm1                            zmm17
> +#define VMM_hi_zmm2                            zmm18
> +#define VMM_hi_zmm3                            zmm19
> +#define VMM_hi_zmm4                            zmm20
> +#define VMM_hi_zmm5                            zmm21
> +#define VMM_hi_zmm6                            zmm22
> +#define VMM_hi_zmm7                            zmm23
> +#define VMM_hi_zmm8                            zmm24
> +#define VMM_hi_zmm9                            zmm25
> +#define VMM_hi_zmm10                   zmm26
> +#define VMM_hi_zmm11                   zmm27
> +#define VMM_hi_zmm12                   zmm28
> +#define VMM_hi_zmm13                   zmm29
> +#define VMM_hi_zmm14                   zmm30
> +#define VMM_hi_zmm15                   zmm31
> +
> +#define PRIMITIVE_VMM(vec, num)                vec##num
> +
> +#define VMM_any_xmm(i)                 PRIMITIVE_VMM(xmm, i)
> +#define VMM_any_ymm(i)                 PRIMITIVE_VMM(ymm, i)
> +#define VMM_any_zmm(i)                 PRIMITIVE_VMM(zmm, i)
> +
> +#define VMM_hi_xmm(i)                  PRIMITIVE_VMM(VMM_hi_xmm, i)
> +#define VMM_hi_ymm(i)                  PRIMITIVE_VMM(VMM_hi_ymm, i)
> +#define VMM_hi_zmm(i)                  PRIMITIVE_VMM(VMM_hi_zmm, i)
> +
> +#endif
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v9 1/6] x86: Update VEC macros to complete API for evex/evex512 impls
  2022-10-14 16:40 [PATCH v1 1/3] x86: Update evex256/512 vec macros Noah Goldstein
                   ` (8 preceding siblings ...)
  2022-10-15  0:06 ` [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls Noah Goldstein
@ 2022-10-15  0:20 ` Noah Goldstein
  2022-10-15  0:20   ` [PATCH v9 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
                     ` (5 more replies)
  2022-10-15  3:00 ` [PATCH v10 " Noah Goldstein
  10 siblings, 6 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  0:20 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

1) Copy so that backporting will be easier.
2) Only define the section if there is not a previous definition.
3) Add `VMM_lo` definition for proper reg-width but in the
   ymm/zmm0-15 range.
4) Add macros for accessing GPRs based on VEC_SIZE
        This is to make it easier to do things like:
        ```
            vpcmpb %VEC(0), %VEC(1), %k0
            kmov{d|q} %k0, %{eax|rax}
            test %{eax|rax}
        ```
        It adds macros s.t. any GPR can get the proper width with:
            `V{upcase_GPR_name}`

        and any mask insn can get the proper width with:
            `{upcase_mask_insn_without_postfix}`
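
        A sketch of how these resolve (REG_WIDTH follows VEC_SIZE
        unless USE_WIDE_CHAR pins it to 32; L(zero) is only a
        placeholder label):
        ```
            KMOV    %k0, %VRCX  /* VEC_SIZE == 64 -> kmovq %k0, %rcx
                                   VEC_SIZE == 32 -> kmovd %k0, %ecx  */
            test    %VRCX, %VRCX
            jz      L(zero)
            /* Explicit widths stay available, e.g. VGPR_SZ(rcx, 32)
               is always ecx regardless of VEC_SIZE.  */
        ```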

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/reg-macros.h         | 168 ++++++++++++++++++
 .../multiarch/scripts/gen-reg-macros.py       | 133 ++++++++++++++
 sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h   |  35 ++++
 sysdeps/x86_64/multiarch/x86-avx-vecs.h       |  47 +++++
 .../x86_64/multiarch/x86-evex-vecs-common.h   |  39 ++++
 sysdeps/x86_64/multiarch/x86-evex256-vecs.h   |  38 ++++
 sysdeps/x86_64/multiarch/x86-evex512-vecs.h   |  38 ++++
 sysdeps/x86_64/multiarch/x86-sse2-vecs.h      |  47 +++++
 sysdeps/x86_64/multiarch/x86-vec-macros.h     |  90 ++++++++++
 9 files changed, 635 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
 create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-evex256-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-evex512-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-sse2-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-vec-macros.h

diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
new file mode 100644
index 0000000000..c8ea330256
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/reg-macros.h
@@ -0,0 +1,168 @@
+/* This file was generated by: gen-reg-macros.py.
+
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _REG_MACROS_H
+#define _REG_MACROS_H	1
+
+#define rax_8	al
+#define rax_16	ax
+#define rax_32	eax
+#define rax_64	rax
+#define rbx_8	bl
+#define rbx_16	bx
+#define rbx_32	ebx
+#define rbx_64	rbx
+#define rcx_8	cl
+#define rcx_16	cx
+#define rcx_32	ecx
+#define rcx_64	rcx
+#define rdx_8	dl
+#define rdx_16	dx
+#define rdx_32	edx
+#define rdx_64	rdx
+#define rbp_8	bpl
+#define rbp_16	bp
+#define rbp_32	ebp
+#define rbp_64	rbp
+#define rsp_8	spl
+#define rsp_16	sp
+#define rsp_32	esp
+#define rsp_64	rsp
+#define rsi_8	sil
+#define rsi_16	si
+#define rsi_32	esi
+#define rsi_64	rsi
+#define rdi_8	dil
+#define rdi_16	di
+#define rdi_32	edi
+#define rdi_64	rdi
+#define r8_8	r8b
+#define r8_16	r8w
+#define r8_32	r8d
+#define r8_64	r8
+#define r9_8	r9b
+#define r9_16	r9w
+#define r9_32	r9d
+#define r9_64	r9
+#define r10_8	r10b
+#define r10_16	r10w
+#define r10_32	r10d
+#define r10_64	r10
+#define r11_8	r11b
+#define r11_16	r11w
+#define r11_32	r11d
+#define r11_64	r11
+#define r12_8	r12b
+#define r12_16	r12w
+#define r12_32	r12d
+#define r12_64	r12
+#define r13_8	r13b
+#define r13_16	r13w
+#define r13_32	r13d
+#define r13_64	r13
+#define r14_8	r14b
+#define r14_16	r14w
+#define r14_32	r14d
+#define r14_64	r14
+#define r15_8	r15b
+#define r15_16	r15w
+#define r15_32	r15d
+#define r15_64	r15
+
+#define kmov_8	kmovb
+#define kmov_16	kmovw
+#define kmov_32	kmovd
+#define kmov_64	kmovq
+#define kortest_8	kortestb
+#define kortest_16	kortestw
+#define kortest_32	kortestd
+#define kortest_64	kortestq
+#define kor_8	korb
+#define kor_16	korw
+#define kor_32	kord
+#define kor_64	korq
+#define ktest_8	ktestb
+#define ktest_16	ktestw
+#define ktest_32	ktestd
+#define ktest_64	ktestq
+#define kand_8	kandb
+#define kand_16	kandw
+#define kand_32	kandd
+#define kand_64	kandq
+#define kxor_8	kxorb
+#define kxor_16	kxorw
+#define kxor_32	kxord
+#define kxor_64	kxorq
+#define knot_8	knotb
+#define knot_16	knotw
+#define knot_32	knotd
+#define knot_64	knotq
+#define kxnor_8	kxnorb
+#define kxnor_16	kxnorw
+#define kxnor_32	kxnord
+#define kxnor_64	kxnorq
+#define kunpack_8	kunpackbw
+#define kunpack_16	kunpackwd
+#define kunpack_32	kunpackdq
+
+/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
+#define VRAX	VGPR(rax)
+#define VRBX	VGPR(rbx)
+#define VRCX	VGPR(rcx)
+#define VRDX	VGPR(rdx)
+#define VRBP	VGPR(rbp)
+#define VRSP	VGPR(rsp)
+#define VRSI	VGPR(rsi)
+#define VRDI	VGPR(rdi)
+#define VR8	VGPR(r8)
+#define VR9	VGPR(r9)
+#define VR10	VGPR(r10)
+#define VR11	VGPR(r11)
+#define VR12	VGPR(r12)
+#define VR13	VGPR(r13)
+#define VR14	VGPR(r14)
+#define VR15	VGPR(r15)
+
+/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
+#define KMOV 	VKINSN(kmov)
+#define KORTEST 	VKINSN(kortest)
+#define KOR 	VKINSN(kor)
+#define KTEST 	VKINSN(ktest)
+#define KAND 	VKINSN(kand)
+#define KXOR 	VKINSN(kxor)
+#define KNOT 	VKINSN(knot)
+#define KXNOR 	VKINSN(kxnor)
+#define KUNPACK 	VKINSN(kunpack)
+
+#ifdef USE_WIDE_CHAR
+# define REG_WIDTH 32
+#else
+# define REG_WIDTH VEC_SIZE
+#endif
+
+#define VPASTER(x, y)	x##_##y
+#define VEVALUATOR(x, y)	VPASTER(x, y)
+
+#define VGPR_SZ(reg_name, reg_size)	VEVALUATOR(reg_name, reg_size)
+#define VKINSN_SZ(insn, reg_size)	VEVALUATOR(insn, reg_size)
+
+#define VGPR(reg_name)	VGPR_SZ(reg_name, REG_WIDTH)
+#define VKINSN(mask_insn)	VKINSN_SZ(mask_insn, REG_WIDTH)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
new file mode 100644
index 0000000000..9fb6903212
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
@@ -0,0 +1,133 @@
+#!/usr/bin/python3
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+"""Generate macros for getting GPR name of a certain size
+
+Inputs: None
+Output: Prints header file to stdout
+
+API:
+    V{upcase_GPR_name}
+        - Get register name REG_WIDTH component of `upcase_GPR_name`
+    {upcase_mask_insn_without_postfix}
+        - Get proper REG_WIDTH mask insn for `upcase_mask_insn_without_postfix`
+    VGPR(reg_name)
+        - Get register name REG_WIDTH component of `reg_name`
+    VKINSN(mask_insn)
+        - Get proper REG_WIDTH mask insn for `mask_insn`
+    VGPR_SZ(reg_name, reg_size)
+        - Get register name `reg_size` component of `reg_name`
+    VKINSN_SZ(mask_insn, insn_size)
+        - Get proper `insn_size` mask insn for `mask_insn`
+"""
+
+import sys
+import os
+from datetime import datetime
+
+registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
+             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
+             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
+             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
+             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
+             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
+             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
+             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
+
+mask_insns = [
+    "kmov",
+    "kortest",
+    "kor",
+    "ktest",
+    "kand",
+    "kxor",
+    "knot",
+    "kxnor",
+]
+mask_insns_ext = ["b", "w", "d", "q"]
+
+cr = """
+   Copyright (C) {} Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+"""
+
+print("/* This file was generated by: {}.".format(os.path.basename(
+    sys.argv[0])))
+print(cr.format(datetime.today().year))
+
+print("#ifndef _REG_MACROS_H")
+print("#define _REG_MACROS_H\t1")
+print("")
+for reg in registers:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
+
+print("")
+for mask_insn in mask_insns:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
+                                           mask_insns_ext[i]))
+for i in range(0, 3):
+    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
+                                                   mask_insns_ext[i + 1]))
+mask_insns.append("kunpack")
+
+print("")
+print(
+    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
+for reg in registers:
+    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
+
+print("")
+
+print(
+    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
+)
+for mask_insn in mask_insns:
+    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
+print("")
+
+print("#ifdef USE_WIDE_CHAR")
+print("# define REG_WIDTH 32")
+print("#else")
+print("# define REG_WIDTH VEC_SIZE")
+print("#endif")
+print("")
+print("#define VPASTER(x, y)\tx##_##y")
+print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
+print("")
+print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
+print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
+print("")
+print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
+print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
+
+print("\n#endif")
diff --git a/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
new file mode 100644
index 0000000000..0b326c8a70
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
@@ -0,0 +1,35 @@
+/* Common config for AVX-RTM VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX_RTM_VECS_H
+#define _X86_AVX_RTM_VECS_H			1
+
+#define COND_VZEROUPPER			COND_VZEROUPPER_XTEST
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN		jmp L(return_vzeroupper)
+
+#define USE_WITH_RTM			1
+#include "x86-avx-vecs.h"
+
+#undef SECTION
+#define SECTION(p)				p##.avx.rtm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
new file mode 100644
index 0000000000..dca1089060
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for AVX VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX_VECS_H
+#define _X86_AVX_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			32
+#include "x86-vec-macros.h"
+
+#define USE_WITH_AVX		1
+#define SECTION(p)			p##.avx
+
+/* 4-byte mov instructions with AVX2.  */
+#define MOV_SIZE			4
+/* 1 (ret) + 3 (vzeroupper).  */
+#define RET_SIZE			4
+#define VZEROUPPER			vzeroupper
+
+#define VMOVU				vmovdqu
+#define VMOVA				vmovdqa
+#define VMOVNT				vmovntdq
+
+/* Often need to access xmm portion.  */
+#define VMM_128				VMM_any_xmm
+#define VMM					VMM_any_ymm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
new file mode 100644
index 0000000000..f331e9d8ec
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
@@ -0,0 +1,39 @@
+/* Common config for EVEX256 and EVEX512 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_EVEX_VECS_COMMON_H
+#define _X86_EVEX_VECS_COMMON_H			1
+
+#include "x86-vec-macros.h"
+
+/* 6-byte mov instructions with EVEX.  */
+#define MOV_SIZE			6
+/* No vzeroupper needed.  */
+#define RET_SIZE			1
+#define VZEROUPPER
+
+#define VMOVU				vmovdqu64
+#define VMOVA				vmovdqa64
+#define VMOVNT				vmovntdq
+
+#define VMM_128				VMM_hi_xmm
+#define VMM_256				VMM_hi_ymm
+#define VMM_512				VMM_hi_zmm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-evex256-vecs.h b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
new file mode 100644
index 0000000000..8337b95504
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
@@ -0,0 +1,38 @@
+/* Common config for EVEX256 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _EVEX256_VECS_H
+#define _EVEX256_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			32
+#include "x86-evex-vecs-common.h"
+
+#define USE_WITH_EVEX256	1
+
+#ifndef SECTION
+# define SECTION(p)			p##.evex
+#endif
+
+#define VMM					VMM_256
+#define VMM_lo				VMM_any_ymm
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-evex512-vecs.h b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
new file mode 100644
index 0000000000..7dc5c23ad0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
@@ -0,0 +1,38 @@
+/* Common config for EVEX512 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _EVEX512_VECS_H
+#define _EVEX512_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			64
+#include "x86-evex-vecs-common.h"
+
+#define USE_WITH_EVEX512	1
+
+#ifndef SECTION
+# define SECTION(p)			p##.evex512
+#endif
+
+#define VMM					VMM_512
+#define VMM_lo				VMM_any_zmm
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-sse2-vecs.h b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
new file mode 100644
index 0000000000..b8bbd5dc29
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for SSE2 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_SSE2_VECS_H
+#define _X86_SSE2_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			16
+#include "x86-vec-macros.h"
+
+#define USE_WITH_SSE2		1
+#define SECTION(p)			p
+
+/* 3-byte mov instructions with SSE2.  */
+#define MOV_SIZE			3
+/* No vzeroupper needed.  */
+#define RET_SIZE			1
+#define VZEROUPPER
+
+#define VMOVU				movups
+#define VMOVA				movaps
+#define VMOVNT				movntdq
+
+#define VMM_128				VMM_any_xmm
+#define VMM					VMM_any_xmm
+
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-vec-macros.h b/sysdeps/x86_64/multiarch/x86-vec-macros.h
new file mode 100644
index 0000000000..7d6bb31d55
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-vec-macros.h
@@ -0,0 +1,90 @@
+/* Macro helpers for VEC_{type}({vec_num})
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_VEC_MACROS_H
+#define _X86_VEC_MACROS_H			1
+
+#ifndef VEC_SIZE
+# error "Never include this file directly. Always include a vector config."
+#endif
+
+/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
+   VMM(N) values.  */
+#define VMM_hi_xmm0				xmm16
+#define VMM_hi_xmm1				xmm17
+#define VMM_hi_xmm2				xmm18
+#define VMM_hi_xmm3				xmm19
+#define VMM_hi_xmm4				xmm20
+#define VMM_hi_xmm5				xmm21
+#define VMM_hi_xmm6				xmm22
+#define VMM_hi_xmm7				xmm23
+#define VMM_hi_xmm8				xmm24
+#define VMM_hi_xmm9				xmm25
+#define VMM_hi_xmm10			xmm26
+#define VMM_hi_xmm11			xmm27
+#define VMM_hi_xmm12			xmm28
+#define VMM_hi_xmm13			xmm29
+#define VMM_hi_xmm14			xmm30
+#define VMM_hi_xmm15			xmm31
+
+#define VMM_hi_ymm0				ymm16
+#define VMM_hi_ymm1				ymm17
+#define VMM_hi_ymm2				ymm18
+#define VMM_hi_ymm3				ymm19
+#define VMM_hi_ymm4				ymm20
+#define VMM_hi_ymm5				ymm21
+#define VMM_hi_ymm6				ymm22
+#define VMM_hi_ymm7				ymm23
+#define VMM_hi_ymm8				ymm24
+#define VMM_hi_ymm9				ymm25
+#define VMM_hi_ymm10			ymm26
+#define VMM_hi_ymm11			ymm27
+#define VMM_hi_ymm12			ymm28
+#define VMM_hi_ymm13			ymm29
+#define VMM_hi_ymm14			ymm30
+#define VMM_hi_ymm15			ymm31
+
+#define VMM_hi_zmm0				zmm16
+#define VMM_hi_zmm1				zmm17
+#define VMM_hi_zmm2				zmm18
+#define VMM_hi_zmm3				zmm19
+#define VMM_hi_zmm4				zmm20
+#define VMM_hi_zmm5				zmm21
+#define VMM_hi_zmm6				zmm22
+#define VMM_hi_zmm7				zmm23
+#define VMM_hi_zmm8				zmm24
+#define VMM_hi_zmm9				zmm25
+#define VMM_hi_zmm10			zmm26
+#define VMM_hi_zmm11			zmm27
+#define VMM_hi_zmm12			zmm28
+#define VMM_hi_zmm13			zmm29
+#define VMM_hi_zmm14			zmm30
+#define VMM_hi_zmm15			zmm31
+
+#define PRIMITIVE_VMM(vec, num)		vec##num
+
+#define VMM_any_xmm(i)			PRIMITIVE_VMM(xmm, i)
+#define VMM_any_ymm(i)			PRIMITIVE_VMM(ymm, i)
+#define VMM_any_zmm(i)			PRIMITIVE_VMM(zmm, i)
+
+#define VMM_hi_xmm(i)			PRIMITIVE_VMM(VMM_hi_xmm, i)
+#define VMM_hi_ymm(i)			PRIMITIVE_VMM(VMM_hi_ymm, i)
+#define VMM_hi_zmm(i)			PRIMITIVE_VMM(VMM_hi_zmm, i)
+
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v9 2/6] x86: Update memrchr to use new VEC macros
  2022-10-15  0:20 ` [PATCH v9 " Noah Goldstein
@ 2022-10-15  0:20   ` Noah Goldstein
  2022-10-15  2:48     ` H.J. Lu
  2022-10-15  0:20   ` [PATCH v9 3/6] x86: Update memmove " Noah Goldstein
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  0:20 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Replace %VEC(n) -> %VMM(n)
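
As a rough sketch of what the rename means for a typical compare (a hedged
example, not lifted from the function: the (%rdi) operand is illustrative, and
VMM(0) resolving to %ymm16 assumes the x86-evex256-vecs.h config):
```
	vpcmpb	$0, (%rdi), %VMM(0), %k0	/* was %VEC(0) */
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
```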

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/memrchr-evex.S | 42 ++++++++++++-------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index ea3a0a0a60..550b328c5a 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -21,7 +21,7 @@
 #if ISA_SHOULD_BUILD (4)
 
 # include <sysdep.h>
-# include "evex256-vecs.h"
+# include "x86-evex256-vecs.h"
 # if VEC_SIZE != 32
 #  error "VEC_SIZE != 32 unimplemented"
 # endif
@@ -31,7 +31,7 @@
 # endif
 
 # define PAGE_SIZE			4096
-# define VECMATCH			VEC(0)
+# define VMMMATCH			VMM(0)
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN(MEMRCHR, 6)
@@ -47,7 +47,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 	   correct page cross check and 2) it correctly sets up end ptr to be
 	   subtract by lzcnt aligned.  */
 	leaq	-1(%rdi, %rdx), %rax
-	vpbroadcastb %esi, %VECMATCH
+	vpbroadcastb %esi, %VMMMATCH
 
 	/* Check if we can load 1x VEC without cross a page.  */
 	testl	$(PAGE_SIZE - VEC_SIZE), %eax
@@ -55,7 +55,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 
 	/* Don't use rax for pointer here because EVEX has better encoding with
 	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
@@ -96,7 +96,7 @@ L(more_1x_vec):
 	movq	%rax, %rdx
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	subq	%rdi, %rdx
@@ -115,7 +115,7 @@ L(last_2x_vec):
 
 	/* Don't use rax for pointer here because EVEX has better encoding with
 	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
 	lzcntq	%rcx, %rcx
@@ -131,7 +131,7 @@ L(last_2x_vec):
 L(page_cross):
 	movq	%rax, %rsi
 	andq	$-VEC_SIZE, %rsi
-	vpcmpb	$0, (%rsi), %VECMATCH, %k0
+	vpcmpb	$0, (%rsi), %VMMMATCH, %k0
 	kmovd	%k0, %r8d
 	/* Shift out negative alignment (because we are starting from endptr and
 	   working backwards).  */
@@ -165,13 +165,13 @@ L(more_2x_vec):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x0_dec)
 
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1)
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	subq	$(VEC_SIZE * 4), %rdx
@@ -185,7 +185,7 @@ L(last_vec):
 
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	lzcntl	%ecx, %ecx
 	subq	$(VEC_SIZE * 3 + 1), %rax
@@ -220,7 +220,7 @@ L(more_4x_vec):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x2)
 
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	testl	%ecx, %ecx
@@ -243,17 +243,17 @@ L(more_4x_vec):
 L(loop_4x_vec):
 	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
 	   on).  */
-	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
 
 	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
-	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
-	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
-	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+	vpxorq	(VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
+	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
 
 	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
 	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
-	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
-	vptestnmb %VEC(3), %VEC(3), %k2
+	vpminub	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	vptestnmb %VMM(3), %VMM(3), %k2
 
 	/* Any 1s and we found CHAR.  */
 	kortestd %k2, %k4
@@ -270,7 +270,7 @@ L(loop_4x_vec):
 L(last_4x_vec):
 
 	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	cmpl	$(VEC_SIZE * 2), %edx
@@ -280,14 +280,14 @@ L(last_4x_vec):
 	jnz	L(ret_vec_x0_dec)
 
 
-	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1)
 
 	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	cmpl	$(VEC_SIZE * 3), %edx
@@ -309,7 +309,7 @@ L(loop_end):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x0_end)
 
-	vptestnmb %VEC(2), %VEC(2), %k0
+	vptestnmb %VMM(2), %VMM(2), %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1_end)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v9 3/6] x86: Update memmove to use new VEC macros
  2022-10-15  0:20 ` [PATCH v9 " Noah Goldstein
  2022-10-15  0:20   ` [PATCH v9 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
@ 2022-10-15  0:20   ` Noah Goldstein
  2022-10-15  2:52     ` H.J. Lu
  2022-10-15  0:20   ` [PATCH v9 4/6] x86: Update memset " Noah Goldstein
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  0:20 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Replace %VEC(n) -> %VMM(n)
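
A minimal sketch of the spelling change, including the width-specific form now
used where the code previously named %YMM0/%YMM1 directly (operands here are
illustrative; VMOVU comes from whichever x86-*-vecs.h config is included):
```
	VMOVU	(%rsi), %VMM(0)			/* was %VEC(0) */
	VMOVU	%VMM(0), (%rdi)
	VMOVU	-32(%rsi, %rdx), %VMM_256(1)	/* was %YMM1 */
	VMOVU	%VMM_256(1), -32(%rdi, %rdx)
```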

This commit does not change libc.so

Tested build on x86-64
---
 .../memmove-avx-unaligned-erms-rtm.S          |  15 +-
 .../multiarch/memmove-avx-unaligned-erms.S    |   9 +-
 .../multiarch/memmove-avx512-unaligned-erms.S |  30 +-
 .../multiarch/memmove-evex-unaligned-erms.S   |  30 +-
 .../multiarch/memmove-sse2-unaligned-erms.S   |  11 +-
 .../multiarch/memmove-vec-unaligned-erms.S    | 262 +++++++++---------
 6 files changed, 135 insertions(+), 222 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 67a55f0c85..c2a95dc247 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -1,16 +1,9 @@
-#if IS_IN (libc)
-# define VEC_SIZE	32
-# define VEC(i)		ymm##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
-# define MOV_SIZE	4
-# define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+#include <isa-level.h>
 
-# define VZEROUPPER_RETURN jmp	 L(return)
+#if ISA_SHOULD_BUILD (3)
+
+# include "x86-avx-rtm-vecs.h"
 
-# define SECTION(p)		p##.avx.rtm
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm
 
 # include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index a14b155667..4e4b4635f9 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -2,14 +2,7 @@
 
 #if ISA_SHOULD_BUILD (3)
 
-# define VEC_SIZE	32
-# define VEC(i)		ymm##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
-# define MOV_SIZE	4
-
-# define SECTION(p)		p##.avx
+# include "x86-avx-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 8d1568a7ba..cca97e38f8 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -2,35 +2,7 @@
 
 #if ISA_SHOULD_BUILD (4)
 
-# define VEC_SIZE	64
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define VEC0		zmm16
-# define VEC1		zmm17
-# define VEC2		zmm18
-# define VEC3		zmm19
-# define VEC4		zmm20
-# define VEC5		zmm21
-# define VEC6		zmm22
-# define VEC7		zmm23
-# define VEC8		zmm24
-# define VEC9		zmm25
-# define VEC10		zmm26
-# define VEC11		zmm27
-# define VEC12		zmm28
-# define VEC13		zmm29
-# define VEC14		zmm30
-# define VEC15		zmm31
-# define VEC(i)		VEC##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE	6
-
-# define SECTION(p)		p##.evex512
+# include "x86-evex512-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
index 2373017358..1f7b5715f7 100644
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -2,35 +2,7 @@
 
 #if ISA_SHOULD_BUILD (4)
 
-# define VEC_SIZE	32
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define VEC0		ymm16
-# define VEC1		ymm17
-# define VEC2		ymm18
-# define VEC3		ymm19
-# define VEC4		ymm20
-# define VEC5		ymm21
-# define VEC6		ymm22
-# define VEC7		ymm23
-# define VEC8		ymm24
-# define VEC9		ymm25
-# define VEC10		ymm26
-# define VEC11		ymm27
-# define VEC12		ymm28
-# define VEC13		ymm29
-# define VEC14		ymm30
-# define VEC15		ymm31
-# define VEC(i)		VEC##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE	6
-
-# define SECTION(p)		p##.evex
+# include "x86-evex256-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
index 422a079902..8431bcd000 100644
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -22,18 +22,9 @@
    so we need this to build for ISA V2 builds. */
 #if ISA_SHOULD_BUILD (2)
 
-# include <sysdep.h>
+# include "x86-sse2-vecs.h"
 
-# define VEC_SIZE	16
-# define VEC(i)		xmm##i
 # define PREFETCHNT	prefetchnta
-# define VMOVNT		movntdq
-/* Use movups and movaps for smaller code sizes.  */
-# define VMOVU		movups
-# define VMOVA		movaps
-# define MOV_SIZE	3
-
-# define SECTION(p)		p
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 04747133b7..5b758cae5e 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -60,14 +60,6 @@
 # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
 #endif
 
-#ifndef XMM0
-# define XMM0				xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0				ymm0
-#endif
-
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER vzeroupper
@@ -225,13 +217,13 @@ L(start):
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 	/* Load regardless.  */
-	VMOVU	(%rsi), %VEC(0)
+	VMOVU	(%rsi), %VMM(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VMM(1)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi,%rdx)
 #if !(defined USE_MULTIARCH && IS_IN (libc))
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
@@ -270,15 +262,15 @@ L(start_erms):
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 	/* Load regardless.  */
-	VMOVU	(%rsi), %VEC(0)
+	VMOVU	(%rsi), %VMM(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
 	 */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
-L(return):
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rdx)
+L(return_vzeroupper):
 # if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 # else
@@ -359,10 +351,10 @@ L(between_16_31):
 	.p2align 4,, 10
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
-	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi, %rdx), %YMM1
-	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi, %rdx)
+	VMOVU	(%rsi), %VMM_256(0)
+	VMOVU	-32(%rsi, %rdx), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -32(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
@@ -380,12 +372,12 @@ L(last_4x_vec):
 	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
 
 	/* VEC(0) and VEC(1) have already been loaded.  */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VMM(3), -(VEC_SIZE * 2)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4
@@ -400,24 +392,24 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
 	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
 	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(7)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VMM(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4,, 4
@@ -466,14 +458,14 @@ L(more_8x_vec_forward):
 	 */
 
 	/* First vec was already loaded into VEC(0).  */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(5)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
 	/* Save begining of dst.  */
 	movq	%rdi, %rcx
 	/* Align dst to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(7)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(8)
 
 	/* Subtract dst from src. Add back after dst aligned.  */
 	subq	%rcx, %rsi
@@ -488,25 +480,25 @@ L(more_8x_vec_forward):
 	.p2align 4,, 11
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
-	VMOVU	(%rsi), %VEC(1)
-	VMOVU	VEC_SIZE(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
+	VMOVU	(%rsi), %VMM(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
 	subq	$-(VEC_SIZE * 4), %rsi
-	VMOVA	%VEC(1), (%rdi)
-	VMOVA	%VEC(2), VEC_SIZE(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(1), (%rdi)
+	VMOVA	%VMM(2), VEC_SIZE(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(4), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
-	VMOVU	%VEC(7), VEC_SIZE(%rdx)
-	VMOVU	%VEC(8), (%rdx)
+	VMOVU	%VMM(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VMM(7), VEC_SIZE(%rdx)
+	VMOVU	%VMM(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(0), (%rcx)
+	VMOVU	%VMM(0), (%rcx)
 	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
 	 */
 L(nop_backward):
@@ -523,12 +515,12 @@ L(more_8x_vec_backward):
 	   addresses.  */
 
 	/* First vec was also loaded into VEC(0).  */
-	VMOVU	VEC_SIZE(%rsi), %VEC(5)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	VMOVU	VEC_SIZE(%rsi), %VMM(5)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(6)
 	/* Begining of region for 4x backward copy stored in rcx.  */
 	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(8)
 	/* Subtract dst from src. Add back after dst aligned.  */
 	subq	%rdi, %rsi
 	/* Align dst.  */
@@ -540,25 +532,25 @@ L(more_8x_vec_backward):
 	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
-	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VMM(3)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VMM(4)
 	addq	$(VEC_SIZE * -4), %rsi
-	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
-	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
-	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
+	VMOVA	%VMM(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VMM(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA	%VMM(4), (VEC_SIZE * 0)(%rcx)
 	addq	$(VEC_SIZE * -4), %rcx
 	cmpq	%rcx, %rdi
 	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(5), VEC_SIZE(%rdi)
+	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
+	VMOVU	%VMM(8), -VEC_SIZE(%rdx, %rdi)
 	VZEROUPPER_RETURN
 
 #if defined USE_MULTIARCH && IS_IN (libc)
@@ -568,7 +560,7 @@ L(loop_4x_vec_backward):
 # if ALIGN_MOVSB
 L(skip_short_movsb_check):
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  endif
 #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 #   error Unsupported MOVSB_ALIGN_TO
@@ -597,9 +589,9 @@ L(skip_short_movsb_check):
 
 	rep	movsb
 
-	VMOVU	%VEC(0), (%r8)
+	VMOVU	%VMM(0), (%r8)
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	%VEC(1), VEC_SIZE(%r8)
+	VMOVU	%VMM(1), VEC_SIZE(%r8)
 #  endif
 	VZEROUPPER_RETURN
 # endif
@@ -640,7 +632,7 @@ L(movsb):
 # endif
 # if ALIGN_MOVSB
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  endif
 #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 #   error Unsupported MOVSB_ALIGN_TO
@@ -664,9 +656,9 @@ L(movsb_align_dst):
 	rep	movsb
 
 	/* Store VECs loaded for aligning.  */
-	VMOVU	%VEC(0), (%r8)
+	VMOVU	%VMM(0), (%r8)
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	%VEC(1), VEC_SIZE(%r8)
+	VMOVU	%VMM(1), VEC_SIZE(%r8)
 #  endif
 	VZEROUPPER_RETURN
 # else	/* !ALIGN_MOVSB.  */
@@ -701,18 +693,18 @@ L(large_memcpy_2x):
 
 	/* First vec was also loaded into VEC(0).  */
 # if VEC_SIZE < 64
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  if VEC_SIZE < 32
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 #  endif
 # endif
-	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VMM(0), (%rdi)
 # if VEC_SIZE < 64
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
 #  if VEC_SIZE < 32
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
 #  endif
 # endif
 
@@ -761,12 +753,12 @@ L(loop_large_memcpy_2x_inner):
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
 	/* Load vectors from rsi.  */
-	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 	subq	$-LARGE_LOAD_SIZE, %rsi
 	/* Non-temporal store vectors to rdi.  */
-	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 	subq	$-LARGE_LOAD_SIZE, %rdi
 	decl	%ecx
 	jnz	L(loop_large_memcpy_2x_inner)
@@ -785,31 +777,31 @@ L(loop_large_memcpy_2x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VMM(0)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 	subq	$-(VEC_SIZE * 4), %rsi
 	addl	$-(VEC_SIZE * 4), %edx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(0), (%rdi)
+	VMOVA	%VMM(1), VEC_SIZE(%rdi)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpl	$(VEC_SIZE * 4), %edx
 	ja	L(loop_large_memcpy_2x_tail)
 
 L(large_memcpy_2x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
-
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
-	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)
+
+	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4
@@ -831,16 +823,16 @@ L(loop_large_memcpy_4x_inner):
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
 	/* Load vectors from rsi.  */
-	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
 	subq	$-LARGE_LOAD_SIZE, %rsi
 	/* Non-temporal store vectors to rdi.  */
-	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
-	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
-	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
 	subq	$-LARGE_LOAD_SIZE, %rdi
 	decl	%ecx
 	jnz	L(loop_large_memcpy_4x_inner)
@@ -858,31 +850,31 @@ L(loop_large_memcpy_4x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VMM(0)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 	subq	$-(VEC_SIZE * 4), %rsi
 	addl	$-(VEC_SIZE * 4), %edx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(0), (%rdi)
+	VMOVA	%VMM(1), VEC_SIZE(%rdi)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpl	$(VEC_SIZE * 4), %edx
 	ja	L(loop_large_memcpy_4x_tail)
 
 L(large_memcpy_4x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
-
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
-	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)
+
+	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v9 4/6] x86: Update memset to use new VEC macros
  2022-10-15  0:20 ` [PATCH v9 " Noah Goldstein
  2022-10-15  0:20   ` [PATCH v9 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
  2022-10-15  0:20   ` [PATCH v9 3/6] x86: Update memmove " Noah Goldstein
@ 2022-10-15  0:20   ` Noah Goldstein
  2022-10-15  2:53     ` H.J. Lu
  2022-10-15  0:20   ` [PATCH v9 5/6] x86: Remove now unused vec header macros Noah Goldstein
                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  0:20 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Replace %VEC(n) -> %VMM(n)
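
A minimal sketch of the new spelling for the broadcast setup and stores (a
hedged example: %esi as the value register and the plain %rdi addressing are
illustrative, not copied from the function):
```
	vpbroadcastb %esi, %VMM(0)		/* was %VEC0 */
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM(0), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VMM_128(0), (%rdi)		/* 16-byte path, was %XMM0 */
```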

This commit does not change libc.so

Tested build on x86-64
---
 .../memset-avx2-unaligned-erms-rtm.S          |  8 +--
 .../multiarch/memset-avx2-unaligned-erms.S    | 14 +---
 .../multiarch/memset-avx512-unaligned-erms.S  | 20 +-----
 .../multiarch/memset-evex-unaligned-erms.S    | 20 +-----
 .../multiarch/memset-sse2-unaligned-erms.S    | 10 +--
 .../multiarch/memset-vec-unaligned-erms.S     | 70 ++++++++-----------
 6 files changed, 43 insertions(+), 99 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
index 8ac3e479bb..bc8605faf3 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -1,10 +1,6 @@
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+#include "x86-avx-rtm-vecs.h"
 
-#define VZEROUPPER_RETURN jmp	 L(return)
-
-#define SECTION(p) p##.avx.rtm
 #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 
-#include "memset-avx2-unaligned-erms.S"
+# include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index a9054a9122..47cf5072a4 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -4,14 +4,9 @@
 
 # define USE_WITH_AVX2	1
 
-# define VEC_SIZE	32
-# define MOV_SIZE	4
-# define RET_SIZE	4
-
-# define VEC(i)		ymm##i
-
-# define VMOVU     vmovdqu
-# define VMOVA     vmovdqa
+# ifndef VEC_SIZE
+#  include "x86-avx-vecs.h"
+# endif
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
@@ -26,9 +21,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
 # define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
-# ifndef SECTION
-#  define SECTION(p)		p##.avx
-# endif
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 47623b8ee8..84145b6c27 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -4,26 +4,14 @@
 
 # define USE_WITH_AVX512	1
 
-# define VEC_SIZE	64
-# define MOV_SIZE	6
-# define RET_SIZE	1
-
-# define XMM0		xmm16
-# define YMM0		ymm16
-# define VEC0		zmm16
-# define VEC(i)		VEC##i
-
-# define VMOVU     vmovdqu64
-# define VMOVA     vmovdqa64
-
-# define VZEROUPPER
+# include "x86-evex512-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastb d, %VEC0; \
+  vpbroadcastb d, %VMM(0); \
   movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastd d, %VEC0; \
+  vpbroadcastd d, %VMM(0); \
   movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -32,8 +20,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p##.evex512
-
 #ifndef MEMSET_SYMBOL
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index ac4b2d2d50..1f03b26bf8 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -4,26 +4,14 @@
 
 # define USE_WITH_EVEX	1
 
-# define VEC_SIZE	32
-# define MOV_SIZE	6
-# define RET_SIZE	1
-
-# define XMM0		xmm16
-# define YMM0		ymm16
-# define VEC0		ymm16
-# define VEC(i)		VEC##i
-
-# define VMOVU     vmovdqu64
-# define VMOVA     vmovdqa64
-
-# define VZEROUPPER
+# include "x86-evex256-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastb d, %VEC0; \
+  vpbroadcastb d, %VMM(0); \
   movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastd d, %VEC0; \
+  vpbroadcastd d, %VMM(0); \
   movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -32,8 +20,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p##.evex
-
 #ifndef MEMSET_SYMBOL
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 44f9b8888b..34b245d8ca 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -26,13 +26,7 @@
 # include <sysdep.h>
 # define USE_WITH_SSE2	1
 
-# define VEC_SIZE	16
-# define MOV_SIZE	3
-# define RET_SIZE	1
-
-# define VEC(i)		xmm##i
-# define VMOVU     movups
-# define VMOVA     movaps
+# include "x86-sse2-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
@@ -52,8 +46,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p
-
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_sse2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 905d0fa464..03de0ab907 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,14 +34,6 @@
 # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
 #endif
 
-#ifndef XMM0
-# define XMM0				xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0				ymm0
-#endif
-
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
@@ -150,8 +142,8 @@ L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VMM(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VMM(0), (%rdi)
 	VZEROUPPER_RETURN
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
@@ -175,19 +167,19 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
-	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -2)(%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi)
 #endif
 	VZEROUPPER_RETURN
 
@@ -221,7 +213,7 @@ L(less_vec_from_wmemset):
 	bzhil	%edx, %ecx, %ecx
 	kmovd	%ecx, %k1
 # endif
-	vmovdqu8 %VEC(0), (%rax){%k1}
+	vmovdqu8 %VMM(0), (%rax){%k1}
 	VZEROUPPER_RETURN
 
 # if defined USE_MULTIARCH && IS_IN (libc)
@@ -249,8 +241,8 @@ L(stosb_more_2x_vec):
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
 	/* Store next 2x vec regardless.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * 1)(%rdi)
 
 
 	/* Two different methods of setting up pointers / compare. The two
@@ -278,8 +270,8 @@ L(more_2x_vec):
 #endif
 
 	/* Store next 2x vec regardless.  */
-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rax)
+	VMOVU	%VMM(0), (VEC_SIZE * 3)(%rax)
 
 
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
@@ -304,20 +296,20 @@ L(more_2x_vec):
 	andq	$(VEC_SIZE * -2), %LOOP_REG
 	.p2align 4
 L(loop):
-	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
 	subq	$-(VEC_SIZE * 4), %LOOP_REG
 	cmpq	%END_REG, %LOOP_REG
 	jb	L(loop)
 	.p2align 4,, MOV_SIZE
 L(last_4x_vec):
-	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
-L(return):
+	VMOVU	%VMM(0), LOOP_4X_OFFSET(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return_vzeroupper):
 #if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
@@ -355,7 +347,7 @@ L(cross_page):
 	jge	L(between_16_31)
 #endif
 #ifndef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, %SET_REG64
+	MOVQ	%VMM_128(0), %SET_REG64
 #endif
 	cmpl	$8, %edx
 	jge	L(between_8_15)
@@ -374,8 +366,8 @@ L(between_0_0):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%LESS_VEC_REG)
-	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
+	VMOVU	%VMM_256(0), (%LESS_VEC_REG)
+	VMOVU	%VMM_256(0), -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
@@ -383,8 +375,8 @@ L(between_32_63):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%LESS_VEC_REG)
-	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	VMOVU	%VMM_128(0), (%LESS_VEC_REG)
+	VMOVU	%VMM_128(0), -16(%LESS_VEC_REG, %rdx)
 	ret
 #endif
 
@@ -394,8 +386,8 @@ L(between_16_31):
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, (%rdi)
-	MOVQ	%XMM0, -8(%rdi, %rdx)
+	MOVQ	%VMM_128(0), (%rdi)
+	MOVQ	%VMM_128(0), -8(%rdi, %rdx)
 #else
 	movq	%SET_REG64, (%LESS_VEC_REG)
 	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
@@ -408,8 +400,8 @@ L(between_8_15):
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVD	%XMM0, (%rdi)
-	MOVD	%XMM0, -4(%rdi, %rdx)
+	MOVD	%VMM_128(0), (%rdi)
+	MOVD	%VMM_128(0), -4(%rdi, %rdx)
 #else
 	movl	%SET_REG32, (%LESS_VEC_REG)
 	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v9 5/6] x86: Remove now unused vec header macros.
  2022-10-15  0:20 ` [PATCH v9 " Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-10-15  0:20   ` [PATCH v9 4/6] x86: Update memset " Noah Goldstein
@ 2022-10-15  0:20   ` Noah Goldstein
  2022-10-15  2:56     ` H.J. Lu
  2022-10-15  0:21   ` [PATCH v9 6/6] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
  2022-10-15  2:45   ` [PATCH v9 1/6] x86: Update VEC macros to complete API for evex/evex512 impls H.J. Lu
  5 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  0:20 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

This commit does not change libc.so

Tested build on x86-64
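
For reference, anything still including one of the removed headers would just
switch to the x86-prefixed replacements introduced in patch 1/6 (a sketch; the
exact header depends on the desired VEC_SIZE):
```
# include "x86-evex256-vecs.h"	/* instead of the removed "evex256-vecs.h" */
```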
---
 sysdeps/x86_64/multiarch/avx-rtm-vecs.h     | 35 --------
 sysdeps/x86_64/multiarch/avx-vecs.h         | 47 -----------
 sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 ---------
 sysdeps/x86_64/multiarch/evex256-vecs.h     | 35 --------
 sysdeps/x86_64/multiarch/evex512-vecs.h     | 35 --------
 sysdeps/x86_64/multiarch/sse2-vecs.h        | 47 -----------
 sysdeps/x86_64/multiarch/vec-macros.h       | 90 ---------------------
 7 files changed, 328 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h
 delete mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/vec-macros.h

diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
deleted file mode 100644
index 6ca9f5e6ba..0000000000
--- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Common config for AVX-RTM VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _AVX_RTM_VECS_H
-#define _AVX_RTM_VECS_H			1
-
-#define COND_VZEROUPPER			COND_VZEROUPPER_XTEST
-#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
-	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN		jmp L(return_vzeroupper)
-
-#define USE_WITH_RTM			1
-#include "avx-vecs.h"
-
-#undef SECTION
-#define SECTION(p)				p##.avx.rtm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h
deleted file mode 100644
index 89680f5db8..0000000000
--- a/sysdeps/x86_64/multiarch/avx-vecs.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Common config for AVX VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _AVX_VECS_H
-#define _AVX_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			32
-#include "vec-macros.h"
-
-#define USE_WITH_AVX		1
-#define SECTION(p)			p##.avx
-
-/* 4-byte mov instructions with AVX2.  */
-#define MOV_SIZE			4
-/* 1 (ret) + 3 (vzeroupper).  */
-#define RET_SIZE			4
-#define VZEROUPPER			vzeroupper
-
-#define VMOVU				vmovdqu
-#define VMOVA				vmovdqa
-#define VMOVNT				vmovntdq
-
-/* Often need to access xmm portion.  */
-#define VEC_xmm				VEC_any_xmm
-#define VEC					VEC_any_ymm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h
deleted file mode 100644
index 99806ebcd7..0000000000
--- a/sysdeps/x86_64/multiarch/evex-vecs-common.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Common config for EVEX256 and EVEX512 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _EVEX_VECS_COMMON_H
-#define _EVEX_VECS_COMMON_H			1
-
-#include "vec-macros.h"
-
-/* 6-byte mov instructions with EVEX.  */
-#define MOV_SIZE			6
-/* No vzeroupper needed.  */
-#define RET_SIZE			1
-#define VZEROUPPER
-
-#define VMOVU				vmovdqu64
-#define VMOVA				vmovdqa64
-#define VMOVNT				vmovntdq
-
-#define VEC_xmm				VEC_hi_xmm
-#define VEC_ymm				VEC_hi_ymm
-#define VEC_zmm				VEC_hi_zmm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
deleted file mode 100644
index 222ba46dc7..0000000000
--- a/sysdeps/x86_64/multiarch/evex256-vecs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Common config for EVEX256 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _EVEX256_VECS_H
-#define _EVEX256_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			32
-#include "evex-vecs-common.h"
-
-#define USE_WITH_EVEX256	1
-#define SECTION(p)			p##.evex
-
-#define VEC					VEC_ymm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
deleted file mode 100644
index d1784d5368..0000000000
--- a/sysdeps/x86_64/multiarch/evex512-vecs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Common config for EVEX512 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _EVEX512_VECS_H
-#define _EVEX512_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			64
-#include "evex-vecs-common.h"
-
-#define USE_WITH_EVEX512	1
-#define SECTION(p)			p##.evex512
-
-#define VEC					VEC_zmm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h
deleted file mode 100644
index 2b77a59d56..0000000000
--- a/sysdeps/x86_64/multiarch/sse2-vecs.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Common config for SSE2 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _SSE2_VECS_H
-#define _SSE2_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			16
-#include "vec-macros.h"
-
-#define USE_WITH_SSE2		1
-#define SECTION(p)			p
-
-/* 3-byte mov instructions with SSE2.  */
-#define MOV_SIZE			3
-/* No vzeroupper needed.  */
-#define RET_SIZE			1
-#define VZEROUPPER
-
-#define VMOVU				movups
-#define VMOVA				movaps
-#define VMOVNT				movntdq
-
-#define VEC_xmm				VEC_any_xmm
-#define VEC					VEC_any_xmm
-
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h
deleted file mode 100644
index 9f3ffecede..0000000000
--- a/sysdeps/x86_64/multiarch/vec-macros.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Macro helpers for VEC_{type}({vec_num})
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _VEC_MACROS_H
-#define _VEC_MACROS_H			1
-
-#ifndef VEC_SIZE
-# error "Never include this file directly. Always include a vector config."
-#endif
-
-/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
-   VEC(N) values.  */
-#define VEC_hi_xmm0				xmm16
-#define VEC_hi_xmm1				xmm17
-#define VEC_hi_xmm2				xmm18
-#define VEC_hi_xmm3				xmm19
-#define VEC_hi_xmm4				xmm20
-#define VEC_hi_xmm5				xmm21
-#define VEC_hi_xmm6				xmm22
-#define VEC_hi_xmm7				xmm23
-#define VEC_hi_xmm8				xmm24
-#define VEC_hi_xmm9				xmm25
-#define VEC_hi_xmm10			xmm26
-#define VEC_hi_xmm11			xmm27
-#define VEC_hi_xmm12			xmm28
-#define VEC_hi_xmm13			xmm29
-#define VEC_hi_xmm14			xmm30
-#define VEC_hi_xmm15			xmm31
-
-#define VEC_hi_ymm0				ymm16
-#define VEC_hi_ymm1				ymm17
-#define VEC_hi_ymm2				ymm18
-#define VEC_hi_ymm3				ymm19
-#define VEC_hi_ymm4				ymm20
-#define VEC_hi_ymm5				ymm21
-#define VEC_hi_ymm6				ymm22
-#define VEC_hi_ymm7				ymm23
-#define VEC_hi_ymm8				ymm24
-#define VEC_hi_ymm9				ymm25
-#define VEC_hi_ymm10			ymm26
-#define VEC_hi_ymm11			ymm27
-#define VEC_hi_ymm12			ymm28
-#define VEC_hi_ymm13			ymm29
-#define VEC_hi_ymm14			ymm30
-#define VEC_hi_ymm15			ymm31
-
-#define VEC_hi_zmm0				zmm16
-#define VEC_hi_zmm1				zmm17
-#define VEC_hi_zmm2				zmm18
-#define VEC_hi_zmm3				zmm19
-#define VEC_hi_zmm4				zmm20
-#define VEC_hi_zmm5				zmm21
-#define VEC_hi_zmm6				zmm22
-#define VEC_hi_zmm7				zmm23
-#define VEC_hi_zmm8				zmm24
-#define VEC_hi_zmm9				zmm25
-#define VEC_hi_zmm10			zmm26
-#define VEC_hi_zmm11			zmm27
-#define VEC_hi_zmm12			zmm28
-#define VEC_hi_zmm13			zmm29
-#define VEC_hi_zmm14			zmm30
-#define VEC_hi_zmm15			zmm31
-
-#define PRIMITIVE_VEC(vec, num)		vec##num
-
-#define VEC_any_xmm(i)			PRIMITIVE_VEC(xmm, i)
-#define VEC_any_ymm(i)			PRIMITIVE_VEC(ymm, i)
-#define VEC_any_zmm(i)			PRIMITIVE_VEC(zmm, i)
-
-#define VEC_hi_xmm(i)			PRIMITIVE_VEC(VEC_hi_xmm, i)
-#define VEC_hi_ymm(i)			PRIMITIVE_VEC(VEC_hi_ymm, i)
-#define VEC_hi_zmm(i)			PRIMITIVE_VEC(VEC_hi_zmm, i)
-
-#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls
  2022-10-15  0:12   ` [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls H.J. Lu
@ 2022-10-15  0:20     ` Noah Goldstein
  0 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  0:20 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 7:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Oct 14, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > 1) Copy so that backport will be easier.
> > 2) Make section only define if there is not a previous definition
> > 3) Add `VEC_lo` definition for proper reg-width but in the
> >    ymm/zmm0-15 range.
> > 4) Add macros for accessing GPRs based on VEC_SIZE
> >         This is to make it easier to do think like:
> >         ```
> >             vpcmpb %VEC(0), %VEC(1), %k0
> >             kmov{d|q} %k0, %{eax|rax}
> >             test %{eax|rax}
> >         ```
> >         It adds macro s.t any GPR can get the proper width with:
> >             `V{upper_case_GPR_name}`
> >
> >         and any mask insn can get the proper width with:
> >             `{mask_insn_without_postfix}V`
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> This comment is incorrect.

Fixed in V9.
>
> >
> > This commit does not change libc.so
> >
> > Tested build on x86-64
> > ---
> >  sysdeps/x86_64/multiarch/reg-macros.h         | 168 ++++++++++++++++++
> >  .../multiarch/scripts/gen-reg-macros.py       | 125 +++++++++++++
> >  sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h   |  35 ++++
> >  sysdeps/x86_64/multiarch/x86-avx-vecs.h       |  47 +++++
> >  .../x86_64/multiarch/x86-evex-vecs-common.h   |  39 ++++
> >  sysdeps/x86_64/multiarch/x86-evex256-vecs.h   |  38 ++++
> >  sysdeps/x86_64/multiarch/x86-evex512-vecs.h   |  38 ++++
> >  sysdeps/x86_64/multiarch/x86-sse2-vecs.h      |  47 +++++
> >  sysdeps/x86_64/multiarch/x86-vec-macros.h     |  90 ++++++++++
> >  9 files changed, 627 insertions(+)
> >  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
> >  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> >  create mode 100644 sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
> >  create mode 100644 sysdeps/x86_64/multiarch/x86-avx-vecs.h
> >  create mode 100644 sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
> >  create mode 100644 sysdeps/x86_64/multiarch/x86-evex256-vecs.h
> >  create mode 100644 sysdeps/x86_64/multiarch/x86-evex512-vecs.h
> >  create mode 100644 sysdeps/x86_64/multiarch/x86-sse2-vecs.h
> >  create mode 100644 sysdeps/x86_64/multiarch/x86-vec-macros.h
> >
> > diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> > new file mode 100644
> > index 0000000000..c8ea330256
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> > @@ -0,0 +1,168 @@
> > +/* This file was generated by: gen-reg-macros.py.
> > +
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _REG_MACROS_H
> > +#define _REG_MACROS_H  1
> > +
> > +#define rax_8  al
> > +#define rax_16 ax
> > +#define rax_32 eax
> > +#define rax_64 rax
> > +#define rbx_8  bl
> > +#define rbx_16 bx
> > +#define rbx_32 ebx
> > +#define rbx_64 rbx
> > +#define rcx_8  cl
> > +#define rcx_16 cx
> > +#define rcx_32 ecx
> > +#define rcx_64 rcx
> > +#define rdx_8  dl
> > +#define rdx_16 dx
> > +#define rdx_32 edx
> > +#define rdx_64 rdx
> > +#define rbp_8  bpl
> > +#define rbp_16 bp
> > +#define rbp_32 ebp
> > +#define rbp_64 rbp
> > +#define rsp_8  spl
> > +#define rsp_16 sp
> > +#define rsp_32 esp
> > +#define rsp_64 rsp
> > +#define rsi_8  sil
> > +#define rsi_16 si
> > +#define rsi_32 esi
> > +#define rsi_64 rsi
> > +#define rdi_8  dil
> > +#define rdi_16 di
> > +#define rdi_32 edi
> > +#define rdi_64 rdi
> > +#define r8_8   r8b
> > +#define r8_16  r8w
> > +#define r8_32  r8d
> > +#define r8_64  r8
> > +#define r9_8   r9b
> > +#define r9_16  r9w
> > +#define r9_32  r9d
> > +#define r9_64  r9
> > +#define r10_8  r10b
> > +#define r10_16 r10w
> > +#define r10_32 r10d
> > +#define r10_64 r10
> > +#define r11_8  r11b
> > +#define r11_16 r11w
> > +#define r11_32 r11d
> > +#define r11_64 r11
> > +#define r12_8  r12b
> > +#define r12_16 r12w
> > +#define r12_32 r12d
> > +#define r12_64 r12
> > +#define r13_8  r13b
> > +#define r13_16 r13w
> > +#define r13_32 r13d
> > +#define r13_64 r13
> > +#define r14_8  r14b
> > +#define r14_16 r14w
> > +#define r14_32 r14d
> > +#define r14_64 r14
> > +#define r15_8  r15b
> > +#define r15_16 r15w
> > +#define r15_32 r15d
> > +#define r15_64 r15
> > +
> > +#define kmov_8 kmovb
> > +#define kmov_16        kmovw
> > +#define kmov_32        kmovd
> > +#define kmov_64        kmovq
> > +#define kortest_8      kortestb
> > +#define kortest_16     kortestw
> > +#define kortest_32     kortestd
> > +#define kortest_64     kortestq
> > +#define kor_8  korb
> > +#define kor_16 korw
> > +#define kor_32 kord
> > +#define kor_64 korq
> > +#define ktest_8        ktestb
> > +#define ktest_16       ktestw
> > +#define ktest_32       ktestd
> > +#define ktest_64       ktestq
> > +#define kand_8 kandb
> > +#define kand_16        kandw
> > +#define kand_32        kandd
> > +#define kand_64        kandq
> > +#define kxor_8 kxorb
> > +#define kxor_16        kxorw
> > +#define kxor_32        kxord
> > +#define kxor_64        kxorq
> > +#define knot_8 knotb
> > +#define knot_16        knotw
> > +#define knot_32        knotd
> > +#define knot_64        knotq
> > +#define kxnor_8        kxnorb
> > +#define kxnor_16       kxnorw
> > +#define kxnor_32       kxnord
> > +#define kxnor_64       kxnorq
> > +#define kunpack_8      kunpackbw
> > +#define kunpack_16     kunpackwd
> > +#define kunpack_32     kunpackdq
> > +
> > +/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
> > +#define VRAX   VGPR(rax)
> > +#define VRBX   VGPR(rbx)
> > +#define VRCX   VGPR(rcx)
> > +#define VRDX   VGPR(rdx)
> > +#define VRBP   VGPR(rbp)
> > +#define VRSP   VGPR(rsp)
> > +#define VRSI   VGPR(rsi)
> > +#define VRDI   VGPR(rdi)
> > +#define VR8    VGPR(r8)
> > +#define VR9    VGPR(r9)
> > +#define VR10   VGPR(r10)
> > +#define VR11   VGPR(r11)
> > +#define VR12   VGPR(r12)
> > +#define VR13   VGPR(r13)
> > +#define VR14   VGPR(r14)
> > +#define VR15   VGPR(r15)
> > +
> > +/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
> > +#define KMOV   VKINSN(kmov)
> > +#define KORTEST        VKINSN(kortest)
> > +#define KOR    VKINSN(kor)
> > +#define KTEST  VKINSN(ktest)
> > +#define KAND   VKINSN(kand)
> > +#define KXOR   VKINSN(kxor)
> > +#define KNOT   VKINSN(knot)
> > +#define KXNOR  VKINSN(kxnor)
> > +#define KUNPACK        VKINSN(kunpack)
>
> These aren't register macros.  Should reg-macros.h be renamed, like
> vec-macros.h?
>
> > +
> > +#ifdef USE_WIDE_CHAR
> > +# define REG_WIDTH 32
> > +#else
> > +# define REG_WIDTH VEC_SIZE
> > +#endif
> > +
> > +#define VPASTER(x, y)  x##_##y
> > +#define VEVALUATOR(x, y)       VPASTER(x, y)
> > +
> > +#define VGPR_SZ(reg_name, reg_size)    VEVALUATOR(reg_name, reg_size)
> > +#define VKINSN_SZ(insn, reg_size)      VEVALUATOR(insn, reg_size)
> > +
> > +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> > +#define VKINSN(mask_insn)      VKINSN_SZ(mask_insn, REG_WIDTH)
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > new file mode 100644
> > index 0000000000..6a05f27ff4
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> > @@ -0,0 +1,125 @@
> > +#!/usr/bin/python3
> > +# Copyright (C) 2022 Free Software Foundation, Inc.
> > +# This file is part of the GNU C Library.
> > +#
> > +# The GNU C Library is free software; you can redistribute it and/or
> > +# modify it under the terms of the GNU Lesser General Public
> > +# License as published by the Free Software Foundation; either
> > +# version 2.1 of the License, or (at your option) any later version.
> > +#
> > +# The GNU C Library is distributed in the hope that it will be useful,
> > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +# Lesser General Public License for more details.
> > +#
> > +# You should have received a copy of the GNU Lesser General Public
> > +# License along with the GNU C Library; if not, see
> > +# <https://www.gnu.org/licenses/>.
> > +"""Generate macros for getting GPR name of a certain size
> > +
> > +Inputs: None
> > +Output: Prints header fill to stdout
> > +
> > +API:
> > +    VGPR(reg_name)
> > +        - Get register name VEC_SIZE component of `reg_name`
> > +    VGPR_SZ(reg_name, reg_size)
> > +        - Get register name `reg_size` component of `reg_name`
> > +"""
> > +
> > +import sys
> > +import os
> > +from datetime import datetime
> > +
> > +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> > +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> > +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> > +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> > +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> > +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> > +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> > +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> > +
> > +mask_insns = [
> > +    "kmov",
> > +    "kortest",
> > +    "kor",
> > +    "ktest",
> > +    "kand",
> > +    "kxor",
> > +    "knot",
> > +    "kxnor",
> > +]
> > +mask_insns_ext = ["b", "w", "d", "q"]
> > +
> > +cr = """
> > +   Copyright (C) {} Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +"""
> > +
> > +print("/* This file was generated by: {}.".format(os.path.basename(
> > +    sys.argv[0])))
> > +print(cr.format(datetime.today().year))
> > +
> > +print("#ifndef _REG_MACROS_H")
> > +print("#define _REG_MACROS_H\t1")
> > +print("")
> > +for reg in registers:
> > +    for i in range(0, 4):
> > +        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
> > +
> > +print("")
> > +for mask_insn in mask_insns:
> > +    for i in range(0, 4):
> > +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> > +                                           mask_insns_ext[i]))
> > +for i in range(0, 3):
> > +    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
> > +                                                   mask_insns_ext[i + 1]))
> > +mask_insns.append("kunpack")
> > +
> > +print("")
> > +print(
> > +    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
> > +for reg in registers:
> > +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> > +
> > +print("")
> > +
> > +print(
> > +    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
> > +)
> > +for mask_insn in mask_insns:
> > +    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
> > +print("")
> > +
> > +print("#ifdef USE_WIDE_CHAR")
> > +print("# define REG_WIDTH 32")
> > +print("#else")
> > +print("# define REG_WIDTH VEC_SIZE")
> > +print("#endif")
> > +print("")
> > +print("#define VPASTER(x, y)\tx##_##y")
> > +print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
> > +print("")
> > +print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
> > +print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
> > +print("")
> > +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> > +print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
> > +
> > +print("\n#endif")
> > diff --git a/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
> > new file mode 100644
> > index 0000000000..0b326c8a70
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
> > @@ -0,0 +1,35 @@
> > +/* Common config for AVX-RTM VECs
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _X86_AVX_RTM_VECS_H
> > +#define _X86_AVX_RTM_VECS_H                    1
> > +
> > +#define COND_VZEROUPPER                        COND_VZEROUPPER_XTEST
> > +#define ZERO_UPPER_VEC_REGISTERS_RETURN        \
> > +       ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> > +
> > +#define VZEROUPPER_RETURN              jmp L(return_vzeroupper)
> > +
> > +#define USE_WITH_RTM                   1
> > +#include "x86-avx-vecs.h"
> > +
> > +#undef SECTION
> > +#define SECTION(p)                             p##.avx.rtm
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> > new file mode 100644
> > index 0000000000..dca1089060
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> > @@ -0,0 +1,47 @@
> > +/* Common config for AVX VECs
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _X86_AVX_VECS_H
> > +#define _X86_AVX_VECS_H                        1
> > +
> > +#ifdef VEC_SIZE
> > +# error "Multiple VEC configs included!"
> > +#endif
> > +
> > +#define VEC_SIZE                       32
> > +#include "x86-vec-macros.h"
> > +
> > +#define USE_WITH_AVX           1
> > +#define SECTION(p)                     p##.avx
> > +
> > +/* 4-byte mov instructions with AVX2.  */
> > +#define MOV_SIZE                       4
> > +/* 1 (ret) + 3 (vzeroupper).  */
> > +#define RET_SIZE                       4
> > +#define VZEROUPPER                     vzeroupper
> > +
> > +#define VMOVU                          vmovdqu
> > +#define VMOVA                          vmovdqa
> > +#define VMOVNT                         vmovntdq
> > +
> > +/* Often need to access xmm portion.  */
> > +#define VMM_128                                VMM_any_xmm
> > +#define VMM                                    VMM_any_ymm
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
> > new file mode 100644
> > index 0000000000..f331e9d8ec
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
> > @@ -0,0 +1,39 @@
> > +/* Common config for EVEX256 and EVEX512 VECs
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _X86_EVEX_VECS_COMMON_H
> > +#define _X86_EVEX_VECS_COMMON_H                        1
> > +
> > +#include "x86-vec-macros.h"
> > +
> > +/* 6-byte mov instructions with EVEX.  */
> > +#define MOV_SIZE                       6
> > +/* No vzeroupper needed.  */
> > +#define RET_SIZE                       1
> > +#define VZEROUPPER
> > +
> > +#define VMOVU                          vmovdqu64
> > +#define VMOVA                          vmovdqa64
> > +#define VMOVNT                         vmovntdq
> > +
> > +#define VMM_128                                VMM_hi_xmm
> > +#define VMM_256                                VMM_hi_ymm
> > +#define VMM_512                                VMM_hi_zmm
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/x86-evex256-vecs.h b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
> > new file mode 100644
> > index 0000000000..8337b95504
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
> > @@ -0,0 +1,38 @@
> > +/* Common config for EVEX256 VECs
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _EVEX256_VECS_H
> > +#define _EVEX256_VECS_H                        1
> > +
> > +#ifdef VEC_SIZE
> > +# error "Multiple VEC configs included!"
> > +#endif
> > +
> > +#define VEC_SIZE                       32
> > +#include "x86-evex-vecs-common.h"
> > +
> > +#define USE_WITH_EVEX256       1
> > +
> > +#ifndef SECTION
> > +# define SECTION(p)                    p##.evex
> > +#endif
> > +
> > +#define VMM                                    VMM_256
> > +#define VMM_lo                         VMM_any_ymm
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/x86-evex512-vecs.h b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
> > new file mode 100644
> > index 0000000000..7dc5c23ad0
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
> > @@ -0,0 +1,38 @@
> > +/* Common config for EVEX512 VECs
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _EVEX512_VECS_H
> > +#define _EVEX512_VECS_H                        1
> > +
> > +#ifdef VEC_SIZE
> > +# error "Multiple VEC configs included!"
> > +#endif
> > +
> > +#define VEC_SIZE                       64
> > +#include "x86-evex-vecs-common.h"
> > +
> > +#define USE_WITH_EVEX512       1
> > +
> > +#ifndef SECTION
> > +# define SECTION(p)                    p##.evex512
> > +#endif
> > +
> > +#define VMM                                    VMM_512
> > +#define VMM_lo                         VMM_any_zmm
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/x86-sse2-vecs.h b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
> > new file mode 100644
> > index 0000000000..b8bbd5dc29
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
> > @@ -0,0 +1,47 @@
> > +/* Common config for SSE2 VECs
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _X86_SSE2_VECS_H
> > +#define _X86_SSE2_VECS_H                       1
> > +
> > +#ifdef VEC_SIZE
> > +# error "Multiple VEC configs included!"
> > +#endif
> > +
> > +#define VEC_SIZE                       16
> > +#include "x86-vec-macros.h"
> > +
> > +#define USE_WITH_SSE2          1
> > +#define SECTION(p)                     p
> > +
> > +/* 3-byte mov instructions with SSE2.  */
> > +#define MOV_SIZE                       3
> > +/* No vzeroupper needed.  */
> > +#define RET_SIZE                       1
> > +#define VZEROUPPER
> > +
> > +#define VMOVU                          movups
> > +#define VMOVA                          movaps
> > +#define VMOVNT                         movntdq
> > +
> > +#define VMM_128                                VMM_any_xmm
> > +#define VMM                                    VMM_any_xmm
> > +
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/x86-vec-macros.h b/sysdeps/x86_64/multiarch/x86-vec-macros.h
> > new file mode 100644
> > index 0000000000..7d6bb31d55
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/x86-vec-macros.h
> > @@ -0,0 +1,90 @@
> > +/* Macro helpers for VEC_{type}({vec_num})
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _X86_VEC_MACROS_H
> > +#define _X86_VEC_MACROS_H                      1
> > +
> > +#ifndef VEC_SIZE
> > +# error "Never include this file directly. Always include a vector config."
> > +#endif
> > +
> > +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
> > +   VMM(N) values.  */
> > +#define VMM_hi_xmm0                            xmm16
> > +#define VMM_hi_xmm1                            xmm17
> > +#define VMM_hi_xmm2                            xmm18
> > +#define VMM_hi_xmm3                            xmm19
> > +#define VMM_hi_xmm4                            xmm20
> > +#define VMM_hi_xmm5                            xmm21
> > +#define VMM_hi_xmm6                            xmm22
> > +#define VMM_hi_xmm7                            xmm23
> > +#define VMM_hi_xmm8                            xmm24
> > +#define VMM_hi_xmm9                            xmm25
> > +#define VMM_hi_xmm10                   xmm26
> > +#define VMM_hi_xmm11                   xmm27
> > +#define VMM_hi_xmm12                   xmm28
> > +#define VMM_hi_xmm13                   xmm29
> > +#define VMM_hi_xmm14                   xmm30
> > +#define VMM_hi_xmm15                   xmm31
> > +
> > +#define VMM_hi_ymm0                            ymm16
> > +#define VMM_hi_ymm1                            ymm17
> > +#define VMM_hi_ymm2                            ymm18
> > +#define VMM_hi_ymm3                            ymm19
> > +#define VMM_hi_ymm4                            ymm20
> > +#define VMM_hi_ymm5                            ymm21
> > +#define VMM_hi_ymm6                            ymm22
> > +#define VMM_hi_ymm7                            ymm23
> > +#define VMM_hi_ymm8                            ymm24
> > +#define VMM_hi_ymm9                            ymm25
> > +#define VMM_hi_ymm10                   ymm26
> > +#define VMM_hi_ymm11                   ymm27
> > +#define VMM_hi_ymm12                   ymm28
> > +#define VMM_hi_ymm13                   ymm29
> > +#define VMM_hi_ymm14                   ymm30
> > +#define VMM_hi_ymm15                   ymm31
> > +
> > +#define VMM_hi_zmm0                            zmm16
> > +#define VMM_hi_zmm1                            zmm17
> > +#define VMM_hi_zmm2                            zmm18
> > +#define VMM_hi_zmm3                            zmm19
> > +#define VMM_hi_zmm4                            zmm20
> > +#define VMM_hi_zmm5                            zmm21
> > +#define VMM_hi_zmm6                            zmm22
> > +#define VMM_hi_zmm7                            zmm23
> > +#define VMM_hi_zmm8                            zmm24
> > +#define VMM_hi_zmm9                            zmm25
> > +#define VMM_hi_zmm10                   zmm26
> > +#define VMM_hi_zmm11                   zmm27
> > +#define VMM_hi_zmm12                   zmm28
> > +#define VMM_hi_zmm13                   zmm29
> > +#define VMM_hi_zmm14                   zmm30
> > +#define VMM_hi_zmm15                   zmm31
> > +
> > +#define PRIMITIVE_VMM(vec, num)                vec##num
> > +
> > +#define VMM_any_xmm(i)                 PRIMITIVE_VMM(xmm, i)
> > +#define VMM_any_ymm(i)                 PRIMITIVE_VMM(ymm, i)
> > +#define VMM_any_zmm(i)                 PRIMITIVE_VMM(zmm, i)
> > +
> > +#define VMM_hi_xmm(i)                  PRIMITIVE_VMM(VMM_hi_xmm, i)
> > +#define VMM_hi_ymm(i)                  PRIMITIVE_VMM(VMM_hi_ymm, i)
> > +#define VMM_hi_zmm(i)                  PRIMITIVE_VMM(VMM_hi_zmm, i)
> > +
> > +#endif
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v9 6/6] x86: Update strlen-evex-base to use new reg/vec macros.
  2022-10-15  0:20 ` [PATCH v9 " Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-10-15  0:20   ` [PATCH v9 5/6] x86: Remove now unused vec header macros Noah Goldstein
@ 2022-10-15  0:21   ` Noah Goldstein
  2022-10-15  2:58     ` H.J. Lu
  2022-10-15  2:45   ` [PATCH v9 1/6] x86: Update VEC macros to complete API for evex/evex512 impls H.J. Lu
  5 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  0:21 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

To avoid duplicating the VMM / GPR / mask insn macros in all incoming
evex512 files, use the macros defined in 'reg-macros.h' and
'{vec}-macros.h'.
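
As a minimal sketch (assuming the evex512 config, i.e. VEC_SIZE == 64,
so REG_WIDTH == 64 in the non-wide-char build), the null-check at the
entry of strlen written with the new macros expands roughly as:
```
	VPCMP	$0, (%rdi), %VMM(0), %k0	/* VMM(0) -> zmm16  */
	KMOV	%k0, %VRAX			/* KMOV   -> kmovq  */
	test	%VRAX, %VRAX			/* VRAX   -> rax    */
	jz	L(align_more)
```
The same source assembles as kmovd/eax when VEC_SIZE == 32 or
USE_WIDE_CHAR is defined, which is what removes the per-file
VEC_SIZE == 64 / == 32 macro blocks below.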

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 116 +++++++-------------
 sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
 2 files changed, 44 insertions(+), 76 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 418e9f8411..c832b15a48 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -36,42 +36,10 @@
 #  define CHAR_SIZE	1
 # endif
 
-# define XMM0		xmm16
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# if VEC_SIZE == 64
-#  define KMOV		kmovq
-#  define KORTEST	kortestq
-#  define RAX		rax
-#  define RCX		rcx
-#  define RDX		rdx
-#  define SHR		shrq
-#  define TEXTSUFFIX	evex512
-#  define VMM0		zmm16
-#  define VMM1		zmm17
-#  define VMM2		zmm18
-#  define VMM3		zmm19
-#  define VMM4		zmm20
-#  define VMOVA		vmovdqa64
-# elif VEC_SIZE == 32
-/* Currently Unused.  */
-#  define KMOV		kmovd
-#  define KORTEST	kortestd
-#  define RAX		eax
-#  define RCX		ecx
-#  define RDX		edx
-#  define SHR		shrl
-#  define TEXTSUFFIX	evex256
-#  define VMM0		ymm16
-#  define VMM1		ymm17
-#  define VMM2		ymm18
-#  define VMM3		ymm19
-#  define VMM4		ymm20
-#  define VMOVA		vmovdqa32
-# endif
-
-	.section .text.TEXTSUFFIX, "ax", @progbits
+	.section SECTION(.text),"ax",@progbits
 /* Aligning entry point to 64 byte, provides better performance for
    one vector length string.  */
 ENTRY_P2ALIGN (STRLEN, 6)
@@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
 # endif
 
 	movl	%edi, %eax
-	vpxorq	%XMM0, %XMM0, %XMM0
+	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(page_cross)
 
 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM0, %k0
-	KMOV	%k0, %RAX
-	test	%RAX, %RAX
+	VPCMP	$0, (%rdi), %VMM(0), %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
@@ -120,7 +88,7 @@ L(align_more):
 	movq	%rax, %rdx
 	subq	%rdi, %rdx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RDX
+	shr	$2, %VRDX
 #  endif
 	/* At this point rdx contains [w]chars already compared.  */
 	subq	%rsi, %rdx
@@ -131,9 +99,9 @@ L(align_more):
 # endif
 
 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
 # ifdef USE_AS_STRNLEN
@@ -141,9 +109,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, VEC_SIZE(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
 # ifdef USE_AS_STRNLEN
@@ -151,9 +119,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 
 # ifdef USE_AS_STRNLEN
@@ -161,9 +129,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x4)
 
 # ifdef USE_AS_STRNLEN
@@ -179,7 +147,7 @@ L(align_more):
 # ifdef USE_AS_STRNLEN
 	subq	%rax, %rcx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RCX
+	shr	$2, %VRCX
 #  endif
 	/* rcx contains number of [w]char will be recompared due to
 	   alignment fixes.  rdx must be incremented by rcx to offset
@@ -199,42 +167,42 @@ L(loop_entry):
 # endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
 
-	VPTESTN	%VMM2, %VMM2, %k0
-	VPTESTN	%VMM4, %VMM4, %k1
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k1
 
 	subq	$-(VEC_SIZE * 4), %rax
 	KORTEST	%k0, %k1
 	jz	L(loop)
 
-	VPTESTN	%VMM1, %VMM1, %k2
-	KMOV	%k2, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VMM(1), %VMM(1), %k2
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
-	KMOV	%k0, %RCX
+	KMOV	%k0, %VRCX
 	/* At this point, if k0 is non zero, null char must be in the
 	   second vector.  */
-	test	%RCX, %RCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
-	VPTESTN	%VMM3, %VMM3, %k3
-	KMOV	%k3, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VMM(3), %VMM(3), %k3
+	KMOV	%k3, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 	/* At this point null [w]char must be in the fourth vector so no
 	   need to check.  */
-	KMOV	%k1, %RCX
+	KMOV	%k1, %VRCX
 
 	/* Fourth, third, second vector terminating are pretty much
 	   same, implemented this way to avoid branching and reuse code
 	   from pre loop exit condition.  */
 L(ret_vec_x4):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 3), %rax
@@ -250,7 +218,7 @@ L(ret_vec_x4):
 	ret
 
 L(ret_vec_x3):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 2), %rax
@@ -268,7 +236,7 @@ L(ret_vec_x3):
 L(ret_vec_x2):
 	subq	$-VEC_SIZE, %rax
 L(ret_vec_x1):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
@@ -289,13 +257,13 @@ L(page_cross):
 	/* ecx contains number of w[char] to be skipped as a result
 	   of address alignment.  */
 	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
-	KMOV	%k0, %RAX
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRAX
 	/* Ignore number of character for alignment adjustment.  */
-	SHR	%cl, %RAX
+	shr	%cl, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
index 116f8981c8..10c3415c8a 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -2,6 +2,6 @@
 # define STRLEN		__strlen_evex512
 #endif
 
-#define VEC_SIZE	64
-
+#include "x86-evex512-vecs.h"
+#include "reg-macros.h"
 #include "strlen-evex-base.S"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v9 1/6] x86: Update VEC macros to complete API for evex/evex512 impls
  2022-10-15  0:20 ` [PATCH v9 " Noah Goldstein
                     ` (4 preceding siblings ...)
  2022-10-15  0:21   ` [PATCH v9 6/6] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
@ 2022-10-15  2:45   ` H.J. Lu
  5 siblings, 0 replies; 72+ messages in thread
From: H.J. Lu @ 2022-10-15  2:45 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Fri, Oct 14, 2022 at 5:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1) Copy so that backport will be easier.
> 2) Make section only define if there is not a previous definition
> 3) Add `VEC_lo` definition for proper reg-width but in the
>    ymm/zmm0-15 range.
> 4) Add macros for accessing GPRs based on VEC_SIZE
>         This is to make it easier to do things like:
>         ```
>             vpcmpb %VEC(0), %VEC(1), %k0
>             kmov{d|q} %k0, %{eax|rax}
>             test %{eax|rax}
>         ```
>         It adds macros s.t. any GPR can get the proper width with:
>             `V{upcase_GPR_name}`
>
>         and any mask insn can get the proper width with:
>             `{upcase_mask_insn_without_postfix}`
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/reg-macros.h         | 168 ++++++++++++++++++
>  .../multiarch/scripts/gen-reg-macros.py       | 133 ++++++++++++++
>  sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h   |  35 ++++
>  sysdeps/x86_64/multiarch/x86-avx-vecs.h       |  47 +++++
>  .../x86_64/multiarch/x86-evex-vecs-common.h   |  39 ++++
>  sysdeps/x86_64/multiarch/x86-evex256-vecs.h   |  38 ++++
>  sysdeps/x86_64/multiarch/x86-evex512-vecs.h   |  38 ++++
>  sysdeps/x86_64/multiarch/x86-sse2-vecs.h      |  47 +++++
>  sysdeps/x86_64/multiarch/x86-vec-macros.h     |  90 ++++++++++
>  9 files changed, 635 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
>  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
>  create mode 100644 sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-avx-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-evex256-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-evex512-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-sse2-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-vec-macros.h
>
> diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> new file mode 100644
> index 0000000000..c8ea330256
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> @@ -0,0 +1,168 @@
> +/* This file was generated by: gen-reg-macros.py.
> +
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _REG_MACROS_H
> +#define _REG_MACROS_H  1
> +
> +#define rax_8  al
> +#define rax_16 ax
> +#define rax_32 eax
> +#define rax_64 rax
> +#define rbx_8  bl
> +#define rbx_16 bx
> +#define rbx_32 ebx
> +#define rbx_64 rbx
> +#define rcx_8  cl
> +#define rcx_16 cx
> +#define rcx_32 ecx
> +#define rcx_64 rcx
> +#define rdx_8  dl
> +#define rdx_16 dx
> +#define rdx_32 edx
> +#define rdx_64 rdx
> +#define rbp_8  bpl
> +#define rbp_16 bp
> +#define rbp_32 ebp
> +#define rbp_64 rbp
> +#define rsp_8  spl
> +#define rsp_16 sp
> +#define rsp_32 esp
> +#define rsp_64 rsp
> +#define rsi_8  sil
> +#define rsi_16 si
> +#define rsi_32 esi
> +#define rsi_64 rsi
> +#define rdi_8  dil
> +#define rdi_16 di
> +#define rdi_32 edi
> +#define rdi_64 rdi
> +#define r8_8   r8b
> +#define r8_16  r8w
> +#define r8_32  r8d
> +#define r8_64  r8
> +#define r9_8   r9b
> +#define r9_16  r9w
> +#define r9_32  r9d
> +#define r9_64  r9
> +#define r10_8  r10b
> +#define r10_16 r10w
> +#define r10_32 r10d
> +#define r10_64 r10
> +#define r11_8  r11b
> +#define r11_16 r11w
> +#define r11_32 r11d
> +#define r11_64 r11
> +#define r12_8  r12b
> +#define r12_16 r12w
> +#define r12_32 r12d
> +#define r12_64 r12
> +#define r13_8  r13b
> +#define r13_16 r13w
> +#define r13_32 r13d
> +#define r13_64 r13
> +#define r14_8  r14b
> +#define r14_16 r14w
> +#define r14_32 r14d
> +#define r14_64 r14
> +#define r15_8  r15b
> +#define r15_16 r15w
> +#define r15_32 r15d
> +#define r15_64 r15
> +
> +#define kmov_8 kmovb
> +#define kmov_16        kmovw
> +#define kmov_32        kmovd
> +#define kmov_64        kmovq
> +#define kortest_8      kortestb
> +#define kortest_16     kortestw
> +#define kortest_32     kortestd
> +#define kortest_64     kortestq
> +#define kor_8  korb
> +#define kor_16 korw
> +#define kor_32 kord
> +#define kor_64 korq
> +#define ktest_8        ktestb
> +#define ktest_16       ktestw
> +#define ktest_32       ktestd
> +#define ktest_64       ktestq
> +#define kand_8 kandb
> +#define kand_16        kandw
> +#define kand_32        kandd
> +#define kand_64        kandq
> +#define kxor_8 kxorb
> +#define kxor_16        kxorw
> +#define kxor_32        kxord
> +#define kxor_64        kxorq
> +#define knot_8 knotb
> +#define knot_16        knotw
> +#define knot_32        knotd
> +#define knot_64        knotq
> +#define kxnor_8        kxnorb
> +#define kxnor_16       kxnorw
> +#define kxnor_32       kxnord
> +#define kxnor_64       kxnorq
> +#define kunpack_8      kunpackbw
> +#define kunpack_16     kunpackwd
> +#define kunpack_32     kunpackdq
> +
> +/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
> +#define VRAX   VGPR(rax)
> +#define VRBX   VGPR(rbx)
> +#define VRCX   VGPR(rcx)
> +#define VRDX   VGPR(rdx)
> +#define VRBP   VGPR(rbp)
> +#define VRSP   VGPR(rsp)
> +#define VRSI   VGPR(rsi)
> +#define VRDI   VGPR(rdi)
> +#define VR8    VGPR(r8)
> +#define VR9    VGPR(r9)
> +#define VR10   VGPR(r10)
> +#define VR11   VGPR(r11)
> +#define VR12   VGPR(r12)
> +#define VR13   VGPR(r13)
> +#define VR14   VGPR(r14)
> +#define VR15   VGPR(r15)
> +
> +/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
> +#define KMOV   VKINSN(kmov)
> +#define KORTEST        VKINSN(kortest)
> +#define KOR    VKINSN(kor)
> +#define KTEST  VKINSN(ktest)
> +#define KAND   VKINSN(kand)
> +#define KXOR   VKINSN(kxor)
> +#define KNOT   VKINSN(knot)
> +#define KXNOR  VKINSN(kxnor)
> +#define KUNPACK        VKINSN(kunpack)
> +
> +#ifdef USE_WIDE_CHAR
> +# define REG_WIDTH 32
> +#else
> +# define REG_WIDTH VEC_SIZE
> +#endif
> +
> +#define VPASTER(x, y)  x##_##y
> +#define VEVALUATOR(x, y)       VPASTER(x, y)
> +
> +#define VGPR_SZ(reg_name, reg_size)    VEVALUATOR(reg_name, reg_size)
> +#define VKINSN_SZ(insn, reg_size)      VEVALUATOR(insn, reg_size)
> +
> +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> +#define VKINSN(mask_insn)      VKINSN_SZ(mask_insn, REG_WIDTH)
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> new file mode 100644
> index 0000000000..9fb6903212
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> @@ -0,0 +1,133 @@
> +#!/usr/bin/python3
> +# Copyright (C) 2022 Free Software Foundation, Inc.
> +# This file is part of the GNU C Library.
> +#
> +# The GNU C Library is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU Lesser General Public
> +# License as published by the Free Software Foundation; either
> +# version 2.1 of the License, or (at your option) any later version.
> +#
> +# The GNU C Library is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +# Lesser General Public License for more details.
> +#
> +# You should have received a copy of the GNU Lesser General Public
> +# License along with the GNU C Library; if not, see
> +# <https://www.gnu.org/licenses/>.
> +"""Generate macros for getting GPR name of a certain size
> +
> +Inputs: None
> +Output: Prints header fill to stdout
> +
> +API:
> +    V{upcase_GPR_name}
> +        - Get register name REG_WIDTH component of `upcase_GPR_name`
> +    {upcase_mask_insn_without_postfix}
> +        - Get proper REG_WIDTH mask insn for `upcase_mask_insn_without_postfix`
> +    VGPR(reg_name)
> +        - Get register name REG_WIDTH component of `reg_name`
> +    VKINSN(mask_insn)
> +        - Get proper REG_WIDTH mask insn for `mask_insn`
> +    VGPR_SZ(reg_name, reg_size)
> +        - Get register name `reg_size` component of `reg_name`
> +    VKINSN_SZ(mask_insn, insn_size)
> +        - Get proper `insn_size` mask insn for `mask_insn`
> +"""
> +
> +import sys
> +import os
> +from datetime import datetime
> +
> +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> +
> +mask_insns = [
> +    "kmov",
> +    "kortest",
> +    "kor",
> +    "ktest",
> +    "kand",
> +    "kxor",
> +    "knot",
> +    "kxnor",
> +]
> +mask_insns_ext = ["b", "w", "d", "q"]
> +
> +cr = """
> +   Copyright (C) {} Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +"""
> +
> +print("/* This file was generated by: {}.".format(os.path.basename(
> +    sys.argv[0])))
> +print(cr.format(datetime.today().year))
> +
> +print("#ifndef _REG_MACROS_H")
> +print("#define _REG_MACROS_H\t1")
> +print("")
> +for reg in registers:
> +    for i in range(0, 4):
> +        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
> +
> +print("")
> +for mask_insn in mask_insns:
> +    for i in range(0, 4):
> +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> +                                           mask_insns_ext[i]))
> +for i in range(0, 3):
> +    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
> +                                                   mask_insns_ext[i + 1]))
> +mask_insns.append("kunpack")
> +
> +print("")
> +print(
> +    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
> +for reg in registers:
> +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> +
> +print("")
> +
> +print(
> +    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
> +)
> +for mask_insn in mask_insns:
> +    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
> +print("")
> +
> +print("#ifdef USE_WIDE_CHAR")
> +print("# define REG_WIDTH 32")
> +print("#else")
> +print("# define REG_WIDTH VEC_SIZE")
> +print("#endif")
> +print("")
> +print("#define VPASTER(x, y)\tx##_##y")
> +print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
> +print("")
> +print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
> +print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
> +print("")
> +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> +print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
> +
> +print("\n#endif")
> diff --git a/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
> new file mode 100644
> index 0000000000..0b326c8a70
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
> @@ -0,0 +1,35 @@
> +/* Common config for AVX-RTM VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_AVX_RTM_VECS_H
> +#define _X86_AVX_RTM_VECS_H                    1
> +
> +#define COND_VZEROUPPER                        COND_VZEROUPPER_XTEST
> +#define ZERO_UPPER_VEC_REGISTERS_RETURN        \
> +       ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +
> +#define VZEROUPPER_RETURN              jmp L(return_vzeroupper)
> +
> +#define USE_WITH_RTM                   1
> +#include "x86-avx-vecs.h"
> +
> +#undef SECTION
> +#define SECTION(p)                             p##.avx.rtm
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> new file mode 100644
> index 0000000000..dca1089060
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> @@ -0,0 +1,47 @@
> +/* Common config for AVX VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_AVX_VECS_H
> +#define _X86_AVX_VECS_H                        1
> +
> +#ifdef VEC_SIZE
> +# error "Multiple VEC configs included!"
> +#endif
> +
> +#define VEC_SIZE                       32
> +#include "x86-vec-macros.h"
> +
> +#define USE_WITH_AVX           1
> +#define SECTION(p)                     p##.avx
> +
> +/* 4-byte mov instructions with AVX2.  */
> +#define MOV_SIZE                       4
> +/* 1 (ret) + 3 (vzeroupper).  */
> +#define RET_SIZE                       4
> +#define VZEROUPPER                     vzeroupper
> +
> +#define VMOVU                          vmovdqu
> +#define VMOVA                          vmovdqa
> +#define VMOVNT                         vmovntdq
> +
> +/* Often need to access xmm portion.  */
> +#define VMM_128                                VMM_any_xmm
> +#define VMM                                    VMM_any_ymm
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
> new file mode 100644
> index 0000000000..f331e9d8ec
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
> @@ -0,0 +1,39 @@
> +/* Common config for EVEX256 and EVEX512 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_EVEX_VECS_COMMON_H
> +#define _X86_EVEX_VECS_COMMON_H                        1
> +
> +#include "x86-vec-macros.h"
> +
> +/* 6-byte mov instructions with EVEX.  */
> +#define MOV_SIZE                       6
> +/* No vzeroupper needed.  */
> +#define RET_SIZE                       1
> +#define VZEROUPPER
> +
> +#define VMOVU                          vmovdqu64
> +#define VMOVA                          vmovdqa64
> +#define VMOVNT                         vmovntdq
> +
> +#define VMM_128                                VMM_hi_xmm
> +#define VMM_256                                VMM_hi_ymm
> +#define VMM_512                                VMM_hi_zmm
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-evex256-vecs.h b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
> new file mode 100644
> index 0000000000..8337b95504
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
> @@ -0,0 +1,38 @@
> +/* Common config for EVEX256 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _EVEX256_VECS_H
> +#define _EVEX256_VECS_H                        1
> +
> +#ifdef VEC_SIZE
> +# error "Multiple VEC configs included!"
> +#endif
> +
> +#define VEC_SIZE                       32
> +#include "x86-evex-vecs-common.h"
> +
> +#define USE_WITH_EVEX256       1
> +
> +#ifndef SECTION
> +# define SECTION(p)                    p##.evex
> +#endif
> +
> +#define VMM                                    VMM_256
> +#define VMM_lo                         VMM_any_ymm
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-evex512-vecs.h b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
> new file mode 100644
> index 0000000000..7dc5c23ad0
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
> @@ -0,0 +1,38 @@
> +/* Common config for EVEX512 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _EVEX512_VECS_H
> +#define _EVEX512_VECS_H                        1
> +
> +#ifdef VEC_SIZE
> +# error "Multiple VEC configs included!"
> +#endif
> +
> +#define VEC_SIZE                       64
> +#include "x86-evex-vecs-common.h"
> +
> +#define USE_WITH_EVEX512       1
> +
> +#ifndef SECTION
> +# define SECTION(p)                    p##.evex512
> +#endif
> +
> +#define VMM                                    VMM_512
> +#define VMM_lo                         VMM_any_zmm
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-sse2-vecs.h b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
> new file mode 100644
> index 0000000000..b8bbd5dc29
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
> @@ -0,0 +1,47 @@
> +/* Common config for SSE2 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_SSE2_VECS_H
> +#define _X86_SSE2_VECS_H                       1
> +
> +#ifdef VEC_SIZE
> +# error "Multiple VEC configs included!"
> +#endif
> +
> +#define VEC_SIZE                       16
> +#include "x86-vec-macros.h"
> +
> +#define USE_WITH_SSE2          1
> +#define SECTION(p)                     p
> +
> +/* 3-byte mov instructions with SSE2.  */
> +#define MOV_SIZE                       3
> +/* No vzeroupper needed.  */
> +#define RET_SIZE                       1
> +#define VZEROUPPER
> +
> +#define VMOVU                          movups
> +#define VMOVA                          movaps
> +#define VMOVNT                         movntdq
> +
> +#define VMM_128                                VMM_any_xmm
> +#define VMM                                    VMM_any_xmm
> +
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-vec-macros.h b/sysdeps/x86_64/multiarch/x86-vec-macros.h
> new file mode 100644
> index 0000000000..7d6bb31d55
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-vec-macros.h
> @@ -0,0 +1,90 @@
> +/* Macro helpers for VEC_{type}({vec_num})
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_VEC_MACROS_H
> +#define _X86_VEC_MACROS_H                      1
> +
> +#ifndef VEC_SIZE
> +# error "Never include this file directly. Always include a vector config."
> +#endif
> +
> +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
> +   VMM(N) values.  */
> +#define VMM_hi_xmm0                            xmm16
> +#define VMM_hi_xmm1                            xmm17
> +#define VMM_hi_xmm2                            xmm18
> +#define VMM_hi_xmm3                            xmm19
> +#define VMM_hi_xmm4                            xmm20
> +#define VMM_hi_xmm5                            xmm21
> +#define VMM_hi_xmm6                            xmm22
> +#define VMM_hi_xmm7                            xmm23
> +#define VMM_hi_xmm8                            xmm24
> +#define VMM_hi_xmm9                            xmm25
> +#define VMM_hi_xmm10                   xmm26
> +#define VMM_hi_xmm11                   xmm27
> +#define VMM_hi_xmm12                   xmm28
> +#define VMM_hi_xmm13                   xmm29
> +#define VMM_hi_xmm14                   xmm30
> +#define VMM_hi_xmm15                   xmm31
> +
> +#define VMM_hi_ymm0                            ymm16
> +#define VMM_hi_ymm1                            ymm17
> +#define VMM_hi_ymm2                            ymm18
> +#define VMM_hi_ymm3                            ymm19
> +#define VMM_hi_ymm4                            ymm20
> +#define VMM_hi_ymm5                            ymm21
> +#define VMM_hi_ymm6                            ymm22
> +#define VMM_hi_ymm7                            ymm23
> +#define VMM_hi_ymm8                            ymm24
> +#define VMM_hi_ymm9                            ymm25
> +#define VMM_hi_ymm10                   ymm26
> +#define VMM_hi_ymm11                   ymm27
> +#define VMM_hi_ymm12                   ymm28
> +#define VMM_hi_ymm13                   ymm29
> +#define VMM_hi_ymm14                   ymm30
> +#define VMM_hi_ymm15                   ymm31
> +
> +#define VMM_hi_zmm0                            zmm16
> +#define VMM_hi_zmm1                            zmm17
> +#define VMM_hi_zmm2                            zmm18
> +#define VMM_hi_zmm3                            zmm19
> +#define VMM_hi_zmm4                            zmm20
> +#define VMM_hi_zmm5                            zmm21
> +#define VMM_hi_zmm6                            zmm22
> +#define VMM_hi_zmm7                            zmm23
> +#define VMM_hi_zmm8                            zmm24
> +#define VMM_hi_zmm9                            zmm25
> +#define VMM_hi_zmm10                   zmm26
> +#define VMM_hi_zmm11                   zmm27
> +#define VMM_hi_zmm12                   zmm28
> +#define VMM_hi_zmm13                   zmm29
> +#define VMM_hi_zmm14                   zmm30
> +#define VMM_hi_zmm15                   zmm31
> +
> +#define PRIMITIVE_VMM(vec, num)                vec##num
> +
> +#define VMM_any_xmm(i)                 PRIMITIVE_VMM(xmm, i)
> +#define VMM_any_ymm(i)                 PRIMITIVE_VMM(ymm, i)
> +#define VMM_any_zmm(i)                 PRIMITIVE_VMM(zmm, i)
> +
> +#define VMM_hi_xmm(i)                  PRIMITIVE_VMM(VMM_hi_xmm, i)
> +#define VMM_hi_ymm(i)                  PRIMITIVE_VMM(VMM_hi_ymm, i)
> +#define VMM_hi_zmm(i)                  PRIMITIVE_VMM(VMM_hi_zmm, i)
> +
> +#endif
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread
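
(Aside, not part of any patch in this thread: the header layers above are
designed so a single VMM(N) spelling picks the right architectural register
for each build.  A minimal expansion sketch, assuming x86-evex-vecs-common.h
-- not shown in this hunk -- maps VMM_256 to the VMM_hi_ymm family defined in
x86-vec-macros.h:

```
# include "x86-evex256-vecs.h"
	/* VMM(0) -> VMM_256(0) -> VMM_hi_ymm(0) -> VMM_hi_ymm0 -> ymm16,
	   i.e. the EVEX-only register range, which is why these configs
	   can leave VZEROUPPER empty.  */
	vpcmpb	$0, (%rdi), %VMM(0), %k0	/* assembles with %ymm16 */
```

With "x86-sse2-vecs.h" instead, VMM(0) resolves to plain xmm0 via
VMM_any_xmm, so the same source spelling can target every encoding just by
swapping the included config header.)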

* Re: [PATCH v9 2/6] x86: Update memrchr to use new VEC macros
  2022-10-15  0:20   ` [PATCH v9 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
@ 2022-10-15  2:48     ` H.J. Lu
  0 siblings, 0 replies; 72+ messages in thread
From: H.J. Lu @ 2022-10-15  2:48 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 5:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Replace %VEC(n) -> %VMM(n)
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/memrchr-evex.S | 42 ++++++++++++-------------
>  1 file changed, 21 insertions(+), 21 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
> index ea3a0a0a60..550b328c5a 100644
> --- a/sysdeps/x86_64/multiarch/memrchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
> @@ -21,7 +21,7 @@
>  #if ISA_SHOULD_BUILD (4)
>
>  # include <sysdep.h>
> -# include "evex256-vecs.h"
> +# include "x86-evex256-vecs.h"
>  # if VEC_SIZE != 32
>  #  error "VEC_SIZE != 32 unimplemented"
>  # endif
> @@ -31,7 +31,7 @@
>  # endif
>
>  # define PAGE_SIZE                     4096
> -# define VECMATCH                      VEC(0)
> +# define VMMMATCH                      VMM(0)
>
>         .section SECTION(.text), "ax", @progbits
>  ENTRY_P2ALIGN(MEMRCHR, 6)
> @@ -47,7 +47,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
>            correct page cross check and 2) it correctly sets up end ptr to be
>            subtract by lzcnt aligned.  */
>         leaq    -1(%rdi, %rdx), %rax
> -       vpbroadcastb %esi, %VECMATCH
> +       vpbroadcastb %esi, %VMMMATCH
>
>         /* Check if we can load 1x VEC without cross a page.  */
>         testl   $(PAGE_SIZE - VEC_SIZE), %eax
> @@ -55,7 +55,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
>
>         /* Don't use rax for pointer here because EVEX has better encoding with
>            offset % VEC_SIZE == 0.  */
> -       vpcmpb  $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
> @@ -96,7 +96,7 @@ L(more_1x_vec):
>         movq    %rax, %rdx
>
>         /* Need no matter what.  */
> -       vpcmpb  $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         subq    %rdi, %rdx
> @@ -115,7 +115,7 @@ L(last_2x_vec):
>
>         /* Don't use rax for pointer here because EVEX has better encoding with
>            offset % VEC_SIZE == 0.  */
> -       vpcmpb  $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>         /* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
>         lzcntq  %rcx, %rcx
> @@ -131,7 +131,7 @@ L(last_2x_vec):
>  L(page_cross):
>         movq    %rax, %rsi
>         andq    $-VEC_SIZE, %rsi
> -       vpcmpb  $0, (%rsi), %VECMATCH, %k0
> +       vpcmpb  $0, (%rsi), %VMMMATCH, %k0
>         kmovd   %k0, %r8d
>         /* Shift out negative alignment (because we are starting from endptr and
>            working backwards).  */
> @@ -165,13 +165,13 @@ L(more_2x_vec):
>         testl   %ecx, %ecx
>         jnz     L(ret_vec_x0_dec)
>
> -       vpcmpb  $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>         testl   %ecx, %ecx
>         jnz     L(ret_vec_x1)
>
>         /* Need no matter what.  */
> -       vpcmpb  $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         subq    $(VEC_SIZE * 4), %rdx
> @@ -185,7 +185,7 @@ L(last_vec):
>
>
>         /* Need no matter what.  */
> -       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>         lzcntl  %ecx, %ecx
>         subq    $(VEC_SIZE * 3 + 1), %rax
> @@ -220,7 +220,7 @@ L(more_4x_vec):
>         testl   %ecx, %ecx
>         jnz     L(ret_vec_x2)
>
> -       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         testl   %ecx, %ecx
> @@ -243,17 +243,17 @@ L(more_4x_vec):
>  L(loop_4x_vec):
>         /* Store 1 where not-equals and 0 where equals in k1 (used to mask later
>            on).  */
> -       vpcmpb  $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
> +       vpcmpb  $4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
>
>         /* VEC(2/3) will have zero-byte where we found a CHAR.  */
> -       vpxorq  (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
> -       vpxorq  (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
> -       vpcmpb  $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
> +       vpxorq  (VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
> +       vpxorq  (VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
> +       vpcmpb  $0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
>
>         /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
>            CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
> -       vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z}
> -       vptestnmb %VEC(3), %VEC(3), %k2
> +       vpminub %VMM(2), %VMM(3), %VMM(3){%k1}{z}
> +       vptestnmb %VMM(3), %VMM(3), %k2
>
>         /* Any 1s and we found CHAR.  */
>         kortestd %k2, %k4
> @@ -270,7 +270,7 @@ L(loop_4x_vec):
>  L(last_4x_vec):
>
>         /* Used no matter what.  */
> -       vpcmpb  $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         cmpl    $(VEC_SIZE * 2), %edx
> @@ -280,14 +280,14 @@ L(last_4x_vec):
>         jnz     L(ret_vec_x0_dec)
>
>
> -       vpcmpb  $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         testl   %ecx, %ecx
>         jnz     L(ret_vec_x1)
>
>         /* Used no matter what.  */
> -       vpcmpb  $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         cmpl    $(VEC_SIZE * 3), %edx
> @@ -309,7 +309,7 @@ L(loop_end):
>         testl   %ecx, %ecx
>         jnz     L(ret_vec_x0_end)
>
> -       vptestnmb %VEC(2), %VEC(2), %k0
> +       vptestnmb %VMM(2), %VMM(2), %k0
>         kmovd   %k0, %ecx
>         testl   %ecx, %ecx
>         jnz     L(ret_vec_x1_end)
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v9 3/6] x86: Update memmove to use new VEC macros
  2022-10-15  0:20   ` [PATCH v9 3/6] x86: Update memmove " Noah Goldstein
@ 2022-10-15  2:52     ` H.J. Lu
  2022-10-15  2:57       ` Noah Goldstein
  0 siblings, 1 reply; 72+ messages in thread
From: H.J. Lu @ 2022-10-15  2:52 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 5:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Replace %VEC(n) -> %VMM(n)
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  .../memmove-avx-unaligned-erms-rtm.S          |  15 +-
>  .../multiarch/memmove-avx-unaligned-erms.S    |   9 +-
>  .../multiarch/memmove-avx512-unaligned-erms.S |  30 +-
>  .../multiarch/memmove-evex-unaligned-erms.S   |  30 +-
>  .../multiarch/memmove-sse2-unaligned-erms.S   |  11 +-
>  .../multiarch/memmove-vec-unaligned-erms.S    | 262 +++++++++---------
>  6 files changed, 135 insertions(+), 222 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> index 67a55f0c85..c2a95dc247 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> @@ -1,16 +1,9 @@
> -#if IS_IN (libc)

Keep this.

> -# define VEC_SIZE      32
> -# define VEC(i)                ymm##i
> -# define VMOVNT                vmovntdq
> -# define VMOVU         vmovdqu
> -# define VMOVA         vmovdqa
> -# define MOV_SIZE      4
> -# define ZERO_UPPER_VEC_REGISTERS_RETURN \
> -  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +#include <isa-level.h>
>
> -# define VZEROUPPER_RETURN jmp  L(return)
> +#if ISA_SHOULD_BUILD (3)

This ISA_SHOULD_BUILD change isn't needed.

> +# include "x86-avx-rtm-vecs.h"
>
> -# define SECTION(p)            p##.avx.rtm
>  # define MEMMOVE_SYMBOL(p,s)   p##_avx_##s##_rtm
>
>  # include "memmove-vec-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> index a14b155667..4e4b4635f9 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> @@ -2,14 +2,7 @@
>
>  #if ISA_SHOULD_BUILD (3)
>
> -# define VEC_SIZE      32
> -# define VEC(i)                ymm##i
> -# define VMOVNT                vmovntdq
> -# define VMOVU         vmovdqu
> -# define VMOVA         vmovdqa
> -# define MOV_SIZE      4
> -
> -# define SECTION(p)            p##.avx
> +# include "x86-avx-vecs.h"
>
>  # ifndef MEMMOVE_SYMBOL
>  #  define MEMMOVE_SYMBOL(p,s)  p##_avx_##s
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> index 8d1568a7ba..cca97e38f8 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> @@ -2,35 +2,7 @@
>
>  #if ISA_SHOULD_BUILD (4)
>
> -# define VEC_SIZE      64
> -# define XMM0          xmm16
> -# define XMM1          xmm17
> -# define YMM0          ymm16
> -# define YMM1          ymm17
> -# define VEC0          zmm16
> -# define VEC1          zmm17
> -# define VEC2          zmm18
> -# define VEC3          zmm19
> -# define VEC4          zmm20
> -# define VEC5          zmm21
> -# define VEC6          zmm22
> -# define VEC7          zmm23
> -# define VEC8          zmm24
> -# define VEC9          zmm25
> -# define VEC10         zmm26
> -# define VEC11         zmm27
> -# define VEC12         zmm28
> -# define VEC13         zmm29
> -# define VEC14         zmm30
> -# define VEC15         zmm31
> -# define VEC(i)                VEC##i
> -# define VMOVNT                vmovntdq
> -# define VMOVU         vmovdqu64
> -# define VMOVA         vmovdqa64
> -# define VZEROUPPER
> -# define MOV_SIZE      6
> -
> -# define SECTION(p)            p##.evex512
> +# include "x86-evex512-vecs.h"
>
>  # ifndef MEMMOVE_SYMBOL
>  #  define MEMMOVE_SYMBOL(p,s)  p##_avx512_##s
> diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> index 2373017358..1f7b5715f7 100644
> --- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> @@ -2,35 +2,7 @@
>
>  #if ISA_SHOULD_BUILD (4)
>
> -# define VEC_SIZE      32
> -# define XMM0          xmm16
> -# define XMM1          xmm17
> -# define YMM0          ymm16
> -# define YMM1          ymm17
> -# define VEC0          ymm16
> -# define VEC1          ymm17
> -# define VEC2          ymm18
> -# define VEC3          ymm19
> -# define VEC4          ymm20
> -# define VEC5          ymm21
> -# define VEC6          ymm22
> -# define VEC7          ymm23
> -# define VEC8          ymm24
> -# define VEC9          ymm25
> -# define VEC10         ymm26
> -# define VEC11         ymm27
> -# define VEC12         ymm28
> -# define VEC13         ymm29
> -# define VEC14         ymm30
> -# define VEC15         ymm31
> -# define VEC(i)                VEC##i
> -# define VMOVNT                vmovntdq
> -# define VMOVU         vmovdqu64
> -# define VMOVA         vmovdqa64
> -# define VZEROUPPER
> -# define MOV_SIZE      6
> -
> -# define SECTION(p)            p##.evex
> +# include "x86-evex256-vecs.h"
>
>  # ifndef MEMMOVE_SYMBOL
>  #  define MEMMOVE_SYMBOL(p,s)  p##_evex_##s
> diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
> index 422a079902..8431bcd000 100644
> --- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
> @@ -22,18 +22,9 @@
>     so we need this to build for ISA V2 builds. */
>  #if ISA_SHOULD_BUILD (2)
>
> -# include <sysdep.h>
> +# include "x86-sse2-vecs.h"
>
> -# define VEC_SIZE      16
> -# define VEC(i)                xmm##i
>  # define PREFETCHNT    prefetchnta
> -# define VMOVNT                movntdq
> -/* Use movups and movaps for smaller code sizes.  */
> -# define VMOVU         movups
> -# define VMOVA         movaps
> -# define MOV_SIZE      3
> -
> -# define SECTION(p)            p
>
>  # ifndef MEMMOVE_SYMBOL
>  #  define MEMMOVE_SYMBOL(p,s)  p##_sse2_##s
> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index 04747133b7..5b758cae5e 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -60,14 +60,6 @@
>  # define MEMMOVE_CHK_SYMBOL(p,s)       MEMMOVE_SYMBOL(p, s)
>  #endif
>
> -#ifndef XMM0
> -# define XMM0                          xmm0
> -#endif
> -
> -#ifndef YMM0
> -# define YMM0                          ymm0
> -#endif
> -
>  #ifndef VZEROUPPER
>  # if VEC_SIZE > 16
>  #  define VZEROUPPER vzeroupper
> @@ -225,13 +217,13 @@ L(start):
>         cmp     $VEC_SIZE, %RDX_LP
>         jb      L(less_vec)
>         /* Load regardless.  */
> -       VMOVU   (%rsi), %VEC(0)
> +       VMOVU   (%rsi), %VMM(0)
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(more_2x_vec)
>         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> +       VMOVU   -VEC_SIZE(%rsi,%rdx), %VMM(1)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(1), -VEC_SIZE(%rdi,%rdx)
>  #if !(defined USE_MULTIARCH && IS_IN (libc))
>         ZERO_UPPER_VEC_REGISTERS_RETURN
>  #else
> @@ -270,15 +262,15 @@ L(start_erms):
>         cmp     $VEC_SIZE, %RDX_LP
>         jb      L(less_vec)
>         /* Load regardless.  */
> -       VMOVU   (%rsi), %VEC(0)
> +       VMOVU   (%rsi), %VMM(0)
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(movsb_more_2x_vec)
>         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
>          */
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(1)
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), -VEC_SIZE(%rdi, %rdx)
> -L(return):
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(1)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(1), -VEC_SIZE(%rdi, %rdx)
> +L(return_vzeroupper):
>  # if VEC_SIZE > 16
>         ZERO_UPPER_VEC_REGISTERS_RETURN
>  # else
> @@ -359,10 +351,10 @@ L(between_16_31):
>         .p2align 4,, 10
>  L(between_32_63):
>         /* From 32 to 63.  No branch when size == 32.  */
> -       VMOVU   (%rsi), %YMM0
> -       VMOVU   -32(%rsi, %rdx), %YMM1
> -       VMOVU   %YMM0, (%rdi)
> -       VMOVU   %YMM1, -32(%rdi, %rdx)
> +       VMOVU   (%rsi), %VMM_256(0)
> +       VMOVU   -32(%rsi, %rdx), %VMM_256(1)
> +       VMOVU   %VMM_256(0), (%rdi)
> +       VMOVU   %VMM_256(1), -32(%rdi, %rdx)
>         VZEROUPPER_RETURN
>  #endif
>
> @@ -380,12 +372,12 @@ L(last_4x_vec):
>         /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
>
>         /* VEC(0) and VEC(1) have already been loaded.  */
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(2)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVU   %VEC(2), -VEC_SIZE(%rdi, %rdx)
> -       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(2)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(1), VEC_SIZE(%rdi)
> +       VMOVU   %VMM(2), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   %VMM(3), -(VEC_SIZE * 2)(%rdi, %rdx)
>         VZEROUPPER_RETURN
>
>         .p2align 4
> @@ -400,24 +392,24 @@ L(more_2x_vec):
>         cmpq    $(VEC_SIZE * 8), %rdx
>         ja      L(more_8x_vec)
>         /* Load VEC(1) regardless. VEC(0) has already been loaded.  */
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
>         cmpq    $(VEC_SIZE * 4), %rdx
>         jbe     L(last_4x_vec)
>         /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(4)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
> -       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
> -       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
> -       VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> -       VMOVU   %VEC(4), -VEC_SIZE(%rdi, %rdx)
> -       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> -       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> -       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(4)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(5)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(6)
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(7)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(1), VEC_SIZE(%rdi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       VMOVU   %VMM(4), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   %VMM(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VMM(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> +       VMOVU   %VMM(7), -(VEC_SIZE * 4)(%rdi, %rdx)
>         VZEROUPPER_RETURN
>
>         .p2align 4,, 4
> @@ -466,14 +458,14 @@ L(more_8x_vec_forward):
>          */
>
>         /* First vec was already loaded into VEC(0).  */
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(5)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(5)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
>         /* Save beginning of dst.  */
>         movq    %rdi, %rcx
>         /* Align dst to VEC_SIZE - 1.  */
>         orq     $(VEC_SIZE - 1), %rdi
> -       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
> -       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(7)
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(8)
>
>         /* Subtract dst from src. Add back after dst aligned.  */
>         subq    %rcx, %rsi
> @@ -488,25 +480,25 @@ L(more_8x_vec_forward):
>         .p2align 4,, 11
>  L(loop_4x_vec_forward):
>         /* Copy 4 * VEC a time forward.  */
> -       VMOVU   (%rsi), %VEC(1)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(3)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(4)
> +       VMOVU   (%rsi), %VMM(1)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(2)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(3)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(4)
>         subq    $-(VEC_SIZE * 4), %rsi
> -       VMOVA   %VEC(1), (%rdi)
> -       VMOVA   %VEC(2), VEC_SIZE(%rdi)
> -       VMOVA   %VEC(3), (VEC_SIZE * 2)(%rdi)
> -       VMOVA   %VEC(4), (VEC_SIZE * 3)(%rdi)
> +       VMOVA   %VMM(1), (%rdi)
> +       VMOVA   %VMM(2), VEC_SIZE(%rdi)
> +       VMOVA   %VMM(3), (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VMM(4), (VEC_SIZE * 3)(%rdi)
>         subq    $-(VEC_SIZE * 4), %rdi
>         cmpq    %rdi, %rdx
>         ja      L(loop_4x_vec_forward)
>         /* Store the last 4 * VEC.  */
> -       VMOVU   %VEC(5), (VEC_SIZE * 3)(%rdx)
> -       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdx)
> -       VMOVU   %VEC(7), VEC_SIZE(%rdx)
> -       VMOVU   %VEC(8), (%rdx)
> +       VMOVU   %VMM(5), (VEC_SIZE * 3)(%rdx)
> +       VMOVU   %VMM(6), (VEC_SIZE * 2)(%rdx)
> +       VMOVU   %VMM(7), VEC_SIZE(%rdx)
> +       VMOVU   %VMM(8), (%rdx)
>         /* Store the first VEC.  */
> -       VMOVU   %VEC(0), (%rcx)
> +       VMOVU   %VMM(0), (%rcx)
>         /* Keep L(nop_backward) target close to jmp for 2-byte encoding.
>          */
>  L(nop_backward):
> @@ -523,12 +515,12 @@ L(more_8x_vec_backward):
>            addresses.  */
>
>         /* First vec was also loaded into VEC(0).  */
> -       VMOVU   VEC_SIZE(%rsi), %VEC(5)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(6)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(5)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(6)
>         /* Beginning of region for 4x backward copy stored in rcx.  */
>         leaq    (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(7)
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(8)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(7)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(8)
>         /* Subtract dst from src. Add back after dst aligned.  */
>         subq    %rdi, %rsi
>         /* Align dst.  */
> @@ -540,25 +532,25 @@ L(more_8x_vec_backward):
>         .p2align 4,, 11
>  L(loop_4x_vec_backward):
>         /* Copy 4 * VEC a time backward.  */
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(1)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 1)(%rsi), %VEC(3)
> -       VMOVU   (VEC_SIZE * 0)(%rsi), %VEC(4)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(1)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   (VEC_SIZE * 1)(%rsi), %VMM(3)
> +       VMOVU   (VEC_SIZE * 0)(%rsi), %VMM(4)
>         addq    $(VEC_SIZE * -4), %rsi
> -       VMOVA   %VEC(1), (VEC_SIZE * 3)(%rcx)
> -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rcx)
> -       VMOVA   %VEC(3), (VEC_SIZE * 1)(%rcx)
> -       VMOVA   %VEC(4), (VEC_SIZE * 0)(%rcx)
> +       VMOVA   %VMM(1), (VEC_SIZE * 3)(%rcx)
> +       VMOVA   %VMM(2), (VEC_SIZE * 2)(%rcx)
> +       VMOVA   %VMM(3), (VEC_SIZE * 1)(%rcx)
> +       VMOVA   %VMM(4), (VEC_SIZE * 0)(%rcx)
>         addq    $(VEC_SIZE * -4), %rcx
>         cmpq    %rcx, %rdi
>         jb      L(loop_4x_vec_backward)
>         /* Store the first 4 * VEC.  */
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(5), VEC_SIZE(%rdi)
> -       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
> -       VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(5), VEC_SIZE(%rdi)
> +       VMOVU   %VMM(6), (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VMM(7), (VEC_SIZE * 3)(%rdi)
>         /* Store the last VEC.  */
> -       VMOVU   %VEC(8), -VEC_SIZE(%rdx, %rdi)
> +       VMOVU   %VMM(8), -VEC_SIZE(%rdx, %rdi)
>         VZEROUPPER_RETURN
>
>  #if defined USE_MULTIARCH && IS_IN (libc)
> @@ -568,7 +560,7 @@ L(loop_4x_vec_backward):
>  # if ALIGN_MOVSB
>  L(skip_short_movsb_check):
>  #  if MOVSB_ALIGN_TO > VEC_SIZE
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
>  #  endif
>  #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
>  #   error Unsupported MOVSB_ALIGN_TO
> @@ -597,9 +589,9 @@ L(skip_short_movsb_check):
>
>         rep     movsb
>
> -       VMOVU   %VEC(0), (%r8)
> +       VMOVU   %VMM(0), (%r8)
>  #  if MOVSB_ALIGN_TO > VEC_SIZE
> -       VMOVU   %VEC(1), VEC_SIZE(%r8)
> +       VMOVU   %VMM(1), VEC_SIZE(%r8)
>  #  endif
>         VZEROUPPER_RETURN
>  # endif
> @@ -640,7 +632,7 @@ L(movsb):
>  # endif
>  # if ALIGN_MOVSB
>  #  if MOVSB_ALIGN_TO > VEC_SIZE
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
>  #  endif
>  #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
>  #   error Unsupported MOVSB_ALIGN_TO
> @@ -664,9 +656,9 @@ L(movsb_align_dst):
>         rep     movsb
>
>         /* Store VECs loaded for aligning.  */
> -       VMOVU   %VEC(0), (%r8)
> +       VMOVU   %VMM(0), (%r8)
>  #  if MOVSB_ALIGN_TO > VEC_SIZE
> -       VMOVU   %VEC(1), VEC_SIZE(%r8)
> +       VMOVU   %VMM(1), VEC_SIZE(%r8)
>  #  endif
>         VZEROUPPER_RETURN
>  # else /* !ALIGN_MOVSB.  */
> @@ -701,18 +693,18 @@ L(large_memcpy_2x):
>
>         /* First vec was also loaded into VEC(0).  */
>  # if VEC_SIZE < 64
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
>  #  if VEC_SIZE < 32
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
>  #  endif
>  # endif
> -       VMOVU   %VEC(0), (%rdi)
> +       VMOVU   %VMM(0), (%rdi)
>  # if VEC_SIZE < 64
> -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> +       VMOVU   %VMM(1), VEC_SIZE(%rdi)
>  #  if VEC_SIZE < 32
> -       VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
> -       VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
>  #  endif
>  # endif
>
> @@ -761,12 +753,12 @@ L(loop_large_memcpy_2x_inner):
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
>         /* Load vectors from rsi.  */
> -       LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> -       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +       LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
> +       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
>         subq    $-LARGE_LOAD_SIZE, %rsi
>         /* Non-temporal store vectors to rdi.  */
> -       STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> -       STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +       STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
> +       STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
>         subq    $-LARGE_LOAD_SIZE, %rdi
>         decl    %ecx
>         jnz     L(loop_large_memcpy_2x_inner)
> @@ -785,31 +777,31 @@ L(loop_large_memcpy_2x_tail):
>         /* Copy 4 * VEC a time forward with non-temporal stores.  */
>         PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> +       VMOVU   (%rsi), %VMM(0)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
>         subq    $-(VEC_SIZE * 4), %rsi
>         addl    $-(VEC_SIZE * 4), %edx
> -       VMOVA   %VEC(0), (%rdi)
> -       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> -       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +       VMOVA   %VMM(0), (%rdi)
> +       VMOVA   %VMM(1), VEC_SIZE(%rdi)
> +       VMOVA   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VMM(3), (VEC_SIZE * 3)(%rdi)
>         subq    $-(VEC_SIZE * 4), %rdi
>         cmpl    $(VEC_SIZE * 4), %edx
>         ja      L(loop_large_memcpy_2x_tail)
>
>  L(large_memcpy_2x_end):
>         /* Store the last 4 * VEC.  */
> -       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> -       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> -
> -       VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> -       VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> -       VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> -       VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(3)
> +
> +       VMOVU   %VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> +       VMOVU   %VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> +       VMOVU   %VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VMM(3), -VEC_SIZE(%rdi, %rdx)
>         VZEROUPPER_RETURN
>
>         .p2align 4
> @@ -831,16 +823,16 @@ L(loop_large_memcpy_4x_inner):
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
>         /* Load vectors from rsi.  */
> -       LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> -       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> -       LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> -       LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> +       LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
> +       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
> +       LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
> +       LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
>         subq    $-LARGE_LOAD_SIZE, %rsi
>         /* Non-temporal store vectors to rdi.  */
> -       STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> -       STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> -       STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> -       STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> +       STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
> +       STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
> +       STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
> +       STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
>         subq    $-LARGE_LOAD_SIZE, %rdi
>         decl    %ecx
>         jnz     L(loop_large_memcpy_4x_inner)
> @@ -858,31 +850,31 @@ L(loop_large_memcpy_4x_tail):
>         /* Copy 4 * VEC a time forward with non-temporal stores.  */
>         PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> +       VMOVU   (%rsi), %VMM(0)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
>         subq    $-(VEC_SIZE * 4), %rsi
>         addl    $-(VEC_SIZE * 4), %edx
> -       VMOVA   %VEC(0), (%rdi)
> -       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> -       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +       VMOVA   %VMM(0), (%rdi)
> +       VMOVA   %VMM(1), VEC_SIZE(%rdi)
> +       VMOVA   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VMM(3), (VEC_SIZE * 3)(%rdi)
>         subq    $-(VEC_SIZE * 4), %rdi
>         cmpl    $(VEC_SIZE * 4), %edx
>         ja      L(loop_large_memcpy_4x_tail)
>
>  L(large_memcpy_4x_end):
>         /* Store the last 4 * VEC.  */
> -       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> -       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> -
> -       VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> -       VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> -       VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> -       VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(3)
> +
> +       VMOVU   %VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> +       VMOVU   %VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> +       VMOVU   %VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VMM(3), -VEC_SIZE(%rdi, %rdx)
>         VZEROUPPER_RETURN
>  #endif
>  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v9 4/6] x86: Update memset to use new VEC macros
  2022-10-15  0:20   ` [PATCH v9 4/6] x86: Update memset " Noah Goldstein
@ 2022-10-15  2:53     ` H.J. Lu
  0 siblings, 0 replies; 72+ messages in thread
From: H.J. Lu @ 2022-10-15  2:53 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 5:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Replace %VEC(n) -> %VMM(n)
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  .../memset-avx2-unaligned-erms-rtm.S          |  8 +--
>  .../multiarch/memset-avx2-unaligned-erms.S    | 14 +---
>  .../multiarch/memset-avx512-unaligned-erms.S  | 20 +-----
>  .../multiarch/memset-evex-unaligned-erms.S    | 20 +-----
>  .../multiarch/memset-sse2-unaligned-erms.S    | 10 +--
>  .../multiarch/memset-vec-unaligned-erms.S     | 70 ++++++++-----------
>  6 files changed, 43 insertions(+), 99 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> index 8ac3e479bb..bc8605faf3 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> @@ -1,10 +1,6 @@
> -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> -  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +#include "x86-avx-rtm-vecs.h"
>
> -#define VZEROUPPER_RETURN jmp   L(return)
> -
> -#define SECTION(p) p##.avx.rtm
>  #define MEMSET_SYMBOL(p,s)     p##_avx2_##s##_rtm
>  #define WMEMSET_SYMBOL(p,s)    p##_avx2_##s##_rtm
>
> -#include "memset-avx2-unaligned-erms.S"
> +# include "memset-avx2-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index a9054a9122..47cf5072a4 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -4,14 +4,9 @@
>
>  # define USE_WITH_AVX2 1
>
> -# define VEC_SIZE      32
> -# define MOV_SIZE      4
> -# define RET_SIZE      4
> -
> -# define VEC(i)                ymm##i
> -
> -# define VMOVU     vmovdqu
> -# define VMOVA     vmovdqa
> +# ifndef VEC_SIZE
> +#  include "x86-avx-vecs.h"
> +# endif
>
>  # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    vmovd d, %xmm0; \
> @@ -26,9 +21,6 @@
>  # define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
>  # define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
>
> -# ifndef SECTION
> -#  define SECTION(p)           p##.avx
> -# endif
>  # ifndef MEMSET_SYMBOL
>  #  define MEMSET_SYMBOL(p,s)   p##_avx2_##s
>  # endif
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index 47623b8ee8..84145b6c27 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -4,26 +4,14 @@
>
>  # define USE_WITH_AVX512       1
>
> -# define VEC_SIZE      64
> -# define MOV_SIZE      6
> -# define RET_SIZE      1
> -
> -# define XMM0          xmm16
> -# define YMM0          ymm16
> -# define VEC0          zmm16
> -# define VEC(i)                VEC##i
> -
> -# define VMOVU     vmovdqu64
> -# define VMOVA     vmovdqa64
> -
> -# define VZEROUPPER
> +# include "x86-evex512-vecs.h"
>
>  # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> -  vpbroadcastb d, %VEC0; \
> +  vpbroadcastb d, %VMM(0); \
>    movq r, %rax
>
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> -  vpbroadcastd d, %VEC0; \
> +  vpbroadcastd d, %VMM(0); \
>    movq r, %rax
>
>  # define MEMSET_VDUP_TO_VEC0_HIGH()
> @@ -32,8 +20,6 @@
>  # define WMEMSET_VDUP_TO_VEC0_HIGH()
>  # define WMEMSET_VDUP_TO_VEC0_LOW()
>
> -# define SECTION(p)            p##.evex512
> -
>  #ifndef MEMSET_SYMBOL
>  # define MEMSET_SYMBOL(p,s)    p##_avx512_##s
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index ac4b2d2d50..1f03b26bf8 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -4,26 +4,14 @@
>
>  # define USE_WITH_EVEX 1
>
> -# define VEC_SIZE      32
> -# define MOV_SIZE      6
> -# define RET_SIZE      1
> -
> -# define XMM0          xmm16
> -# define YMM0          ymm16
> -# define VEC0          ymm16
> -# define VEC(i)                VEC##i
> -
> -# define VMOVU     vmovdqu64
> -# define VMOVA     vmovdqa64
> -
> -# define VZEROUPPER
> +# include "x86-evex256-vecs.h"
>
>  # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> -  vpbroadcastb d, %VEC0; \
> +  vpbroadcastb d, %VMM(0); \
>    movq r, %rax
>
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> -  vpbroadcastd d, %VEC0; \
> +  vpbroadcastd d, %VMM(0); \
>    movq r, %rax
>
>  # define MEMSET_VDUP_TO_VEC0_HIGH()
> @@ -32,8 +20,6 @@
>  # define WMEMSET_VDUP_TO_VEC0_HIGH()
>  # define WMEMSET_VDUP_TO_VEC0_LOW()
>
> -# define SECTION(p)            p##.evex
> -
>  #ifndef MEMSET_SYMBOL
>  # define MEMSET_SYMBOL(p,s)    p##_evex_##s
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> index 44f9b8888b..34b245d8ca 100644
> --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> @@ -26,13 +26,7 @@
>  # include <sysdep.h>
>  # define USE_WITH_SSE2 1
>
> -# define VEC_SIZE      16
> -# define MOV_SIZE      3
> -# define RET_SIZE      1
> -
> -# define VEC(i)                xmm##i
> -# define VMOVU     movups
> -# define VMOVA     movaps
> +# include "x86-sse2-vecs.h"
>
>  # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    movd d, %xmm0; \
> @@ -52,8 +46,6 @@
>  # define WMEMSET_VDUP_TO_VEC0_HIGH()
>  # define WMEMSET_VDUP_TO_VEC0_LOW()
>
> -# define SECTION(p)            p
> -
>  # ifndef MEMSET_SYMBOL
>  #  define MEMSET_SYMBOL(p,s)   p##_sse2_##s
>  # endif
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 905d0fa464..03de0ab907 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -34,14 +34,6 @@
>  # define WMEMSET_CHK_SYMBOL(p,s)       WMEMSET_SYMBOL(p, s)
>  #endif
>
> -#ifndef XMM0
> -# define XMM0                          xmm0
> -#endif
> -
> -#ifndef YMM0
> -# define YMM0                          ymm0
> -#endif
> -
>  #ifndef VZEROUPPER
>  # if VEC_SIZE > 16
>  #  define VZEROUPPER                   vzeroupper
> @@ -150,8 +142,8 @@ L(entry_from_wmemset):
>         cmpq    $(VEC_SIZE * 2), %rdx
>         ja      L(more_2x_vec)
>         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> -       VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
> -       VMOVU   %VEC(0), (%rdi)
> +       VMOVU   %VMM(0), -VEC_SIZE(%rdi,%rdx)
> +       VMOVU   %VMM(0), (%rdi)
>         VZEROUPPER_RETURN
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  END (MEMSET_SYMBOL (__memset, unaligned))
> @@ -175,19 +167,19 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(stosb_more_2x_vec)
>         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
>         VZEROUPPER_RETURN
>  #endif
>
>         .p2align 4,, 4
>  L(last_2x_vec):
>  #ifdef USE_LESS_VEC_MASK_STORE
> -       VMOVU   %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
> -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> +       VMOVU   %VMM(0), (VEC_SIZE * -2)(%rdi, %rdx)
> +       VMOVU   %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
>  #else
> -       VMOVU   %VEC(0), (VEC_SIZE * -2)(%rdi)
> -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi)
> +       VMOVU   %VMM(0), (VEC_SIZE * -2)(%rdi)
> +       VMOVU   %VMM(0), (VEC_SIZE * -1)(%rdi)
>  #endif
>         VZEROUPPER_RETURN
>
> @@ -221,7 +213,7 @@ L(less_vec_from_wmemset):
>         bzhil   %edx, %ecx, %ecx
>         kmovd   %ecx, %k1
>  # endif
> -       vmovdqu8 %VEC(0), (%rax){%k1}
> +       vmovdqu8 %VMM(0), (%rax){%k1}
>         VZEROUPPER_RETURN
>
>  # if defined USE_MULTIARCH && IS_IN (libc)
> @@ -249,8 +241,8 @@ L(stosb_more_2x_vec):
>            and (4x, 8x] jump to target.  */
>  L(more_2x_vec):
>         /* Store next 2x vec regardless.  */
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(0), (VEC_SIZE * 1)(%rdi)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(0), (VEC_SIZE * 1)(%rdi)
>
>
>         /* Two different methods of setting up pointers / compare. The two
> @@ -278,8 +270,8 @@ L(more_2x_vec):
>  #endif
>
>         /* Store next 2x vec regardless.  */
> -       VMOVU   %VEC(0), (VEC_SIZE * 2)(%rax)
> -       VMOVU   %VEC(0), (VEC_SIZE * 3)(%rax)
> +       VMOVU   %VMM(0), (VEC_SIZE * 2)(%rax)
> +       VMOVU   %VMM(0), (VEC_SIZE * 3)(%rax)
>
>
>  #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> @@ -304,20 +296,20 @@ L(more_2x_vec):
>         andq    $(VEC_SIZE * -2), %LOOP_REG
>         .p2align 4
>  L(loop):
> -       VMOVA   %VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
> -       VMOVA   %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
> -       VMOVA   %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
> -       VMOVA   %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
> +       VMOVA   %VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
> +       VMOVA   %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
> +       VMOVA   %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
> +       VMOVA   %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
>         subq    $-(VEC_SIZE * 4), %LOOP_REG
>         cmpq    %END_REG, %LOOP_REG
>         jb      L(loop)
>         .p2align 4,, MOV_SIZE
>  L(last_4x_vec):
> -       VMOVU   %VEC(0), LOOP_4X_OFFSET(%END_REG)
> -       VMOVU   %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
> -       VMOVU   %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
> -       VMOVU   %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
> -L(return):
> +       VMOVU   %VMM(0), LOOP_4X_OFFSET(%END_REG)
> +       VMOVU   %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
> +       VMOVU   %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
> +       VMOVU   %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
> +L(return_vzeroupper):
>  #if VEC_SIZE > 16
>         ZERO_UPPER_VEC_REGISTERS_RETURN
>  #else
> @@ -355,7 +347,7 @@ L(cross_page):
>         jge     L(between_16_31)
>  #endif
>  #ifndef USE_XMM_LESS_VEC
> -       MOVQ    %XMM0, %SET_REG64
> +       MOVQ    %VMM_128(0), %SET_REG64
>  #endif
>         cmpl    $8, %edx
>         jge     L(between_8_15)
> @@ -374,8 +366,8 @@ L(between_0_0):
>         .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
>         /* From 32 to 63.  No branch when size == 32.  */
>  L(between_32_63):
> -       VMOVU   %YMM0, (%LESS_VEC_REG)
> -       VMOVU   %YMM0, -32(%LESS_VEC_REG, %rdx)
> +       VMOVU   %VMM_256(0), (%LESS_VEC_REG)
> +       VMOVU   %VMM_256(0), -32(%LESS_VEC_REG, %rdx)
>         VZEROUPPER_RETURN
>  #endif
>
> @@ -383,8 +375,8 @@ L(between_32_63):
>         .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
>  L(between_16_31):
>         /* From 16 to 31.  No branch when size == 16.  */
> -       VMOVU   %XMM0, (%LESS_VEC_REG)
> -       VMOVU   %XMM0, -16(%LESS_VEC_REG, %rdx)
> +       VMOVU   %VMM_128(0), (%LESS_VEC_REG)
> +       VMOVU   %VMM_128(0), -16(%LESS_VEC_REG, %rdx)
>         ret
>  #endif
>
> @@ -394,8 +386,8 @@ L(between_16_31):
>  L(between_8_15):
>         /* From 8 to 15.  No branch when size == 8.  */
>  #ifdef USE_XMM_LESS_VEC
> -       MOVQ    %XMM0, (%rdi)
> -       MOVQ    %XMM0, -8(%rdi, %rdx)
> +       MOVQ    %VMM_128(0), (%rdi)
> +       MOVQ    %VMM_128(0), -8(%rdi, %rdx)
>  #else
>         movq    %SET_REG64, (%LESS_VEC_REG)
>         movq    %SET_REG64, -8(%LESS_VEC_REG, %rdx)
> @@ -408,8 +400,8 @@ L(between_8_15):
>  L(between_4_7):
>         /* From 4 to 7.  No branch when size == 4.  */
>  #ifdef USE_XMM_LESS_VEC
> -       MOVD    %XMM0, (%rdi)
> -       MOVD    %XMM0, -4(%rdi, %rdx)
> +       MOVD    %VMM_128(0), (%rdi)
> +       MOVD    %VMM_128(0), -4(%rdi, %rdx)
>  #else
>         movl    %SET_REG32, (%LESS_VEC_REG)
>         movl    %SET_REG32, -4(%LESS_VEC_REG, %rdx)
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread
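
(Aside: the width-suffixed forms %VMM_128(0) and %VMM_256(0) used in the
memset hunks above exist so the fixed-size small-length paths can name a
narrower view of the same architectural register regardless of the configured
VEC_SIZE.  A sketch, assuming x86-evex-vecs-common.h routes VMM_128/VMM_256/
VMM_512 through the VMM_hi_xmm/VMM_hi_ymm/VMM_hi_zmm families:

```
# include "x86-evex512-vecs.h"
	/* VEC_SIZE is 64 here, so VMM(0) resolves to zmm16 (assumed
	   VMM_512 -> VMM_hi_zmm mapping); the 16- and 32-byte cases can
	   still address the narrower views of that same register.  */
	VMOVU	%VMM_256(0), (%rdi)		/* 32-byte store via ymm16 */
	VMOVU	%VMM_128(0), -16(%rdi, %rdx)	/* 16-byte store via xmm16 */
```

Under "x86-sse2-vecs.h", VMM_128 is simply VMM_any_xmm, so the same spellings
fall back to xmm0 for the baseline build.)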

* Re: [PATCH v9 5/6] x86: Remove now unused vec header macros.
  2022-10-15  0:20   ` [PATCH v9 5/6] x86: Remove now unused vec header macros Noah Goldstein
@ 2022-10-15  2:56     ` H.J. Lu
  0 siblings, 0 replies; 72+ messages in thread
From: H.J. Lu @ 2022-10-15  2:56 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 5:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/avx-rtm-vecs.h     | 35 --------
>  sysdeps/x86_64/multiarch/avx-vecs.h         | 47 -----------
>  sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 ---------
>  sysdeps/x86_64/multiarch/evex256-vecs.h     | 35 --------
>  sysdeps/x86_64/multiarch/evex512-vecs.h     | 35 --------
>  sysdeps/x86_64/multiarch/sse2-vecs.h        | 47 -----------
>  sysdeps/x86_64/multiarch/vec-macros.h       | 90 ---------------------
>  7 files changed, 328 deletions(-)
>  delete mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h
>  delete mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h
>  delete mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h
>  delete mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h
>  delete mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h
>  delete mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h
>  delete mode 100644 sysdeps/x86_64/multiarch/vec-macros.h
>
> diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
> deleted file mode 100644
> index 6ca9f5e6ba..0000000000
> --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
> +++ /dev/null
> @@ -1,35 +0,0 @@
> -/* Common config for AVX-RTM VECs
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _AVX_RTM_VECS_H
> -#define _AVX_RTM_VECS_H                        1
> -
> -#define COND_VZEROUPPER                        COND_VZEROUPPER_XTEST
> -#define ZERO_UPPER_VEC_REGISTERS_RETURN        \
> -       ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> -
> -#define VZEROUPPER_RETURN              jmp L(return_vzeroupper)
> -
> -#define USE_WITH_RTM                   1
> -#include "avx-vecs.h"
> -
> -#undef SECTION
> -#define SECTION(p)                             p##.avx.rtm
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h
> deleted file mode 100644
> index 89680f5db8..0000000000
> --- a/sysdeps/x86_64/multiarch/avx-vecs.h
> +++ /dev/null
> @@ -1,47 +0,0 @@
> -/* Common config for AVX VECs
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _AVX_VECS_H
> -#define _AVX_VECS_H                    1
> -
> -#ifdef VEC_SIZE
> -# error "Multiple VEC configs included!"
> -#endif
> -
> -#define VEC_SIZE                       32
> -#include "vec-macros.h"
> -
> -#define USE_WITH_AVX           1
> -#define SECTION(p)                     p##.avx
> -
> -/* 4-byte mov instructions with AVX2.  */
> -#define MOV_SIZE                       4
> -/* 1 (ret) + 3 (vzeroupper).  */
> -#define RET_SIZE                       4
> -#define VZEROUPPER                     vzeroupper
> -
> -#define VMOVU                          vmovdqu
> -#define VMOVA                          vmovdqa
> -#define VMOVNT                         vmovntdq
> -
> -/* Often need to access xmm portion.  */
> -#define VEC_xmm                                VEC_any_xmm
> -#define VEC                                    VEC_any_ymm
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h
> deleted file mode 100644
> index 99806ebcd7..0000000000
> --- a/sysdeps/x86_64/multiarch/evex-vecs-common.h
> +++ /dev/null
> @@ -1,39 +0,0 @@
> -/* Common config for EVEX256 and EVEX512 VECs
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _EVEX_VECS_COMMON_H
> -#define _EVEX_VECS_COMMON_H                    1
> -
> -#include "vec-macros.h"
> -
> -/* 6-byte mov instructions with EVEX.  */
> -#define MOV_SIZE                       6
> -/* No vzeroupper needed.  */
> -#define RET_SIZE                       1
> -#define VZEROUPPER
> -
> -#define VMOVU                          vmovdqu64
> -#define VMOVA                          vmovdqa64
> -#define VMOVNT                         vmovntdq
> -
> -#define VEC_xmm                                VEC_hi_xmm
> -#define VEC_ymm                                VEC_hi_ymm
> -#define VEC_zmm                                VEC_hi_zmm
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
> deleted file mode 100644
> index 222ba46dc7..0000000000
> --- a/sysdeps/x86_64/multiarch/evex256-vecs.h
> +++ /dev/null
> @@ -1,35 +0,0 @@
> -/* Common config for EVEX256 VECs
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _EVEX256_VECS_H
> -#define _EVEX256_VECS_H                        1
> -
> -#ifdef VEC_SIZE
> -# error "Multiple VEC configs included!"
> -#endif
> -
> -#define VEC_SIZE                       32
> -#include "evex-vecs-common.h"
> -
> -#define USE_WITH_EVEX256       1
> -#define SECTION(p)                     p##.evex
> -
> -#define VEC                                    VEC_ymm
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
> deleted file mode 100644
> index d1784d5368..0000000000
> --- a/sysdeps/x86_64/multiarch/evex512-vecs.h
> +++ /dev/null
> @@ -1,35 +0,0 @@
> -/* Common config for EVEX512 VECs
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _EVEX512_VECS_H
> -#define _EVEX512_VECS_H                        1
> -
> -#ifdef VEC_SIZE
> -# error "Multiple VEC configs included!"
> -#endif
> -
> -#define VEC_SIZE                       64
> -#include "evex-vecs-common.h"
> -
> -#define USE_WITH_EVEX512       1
> -#define SECTION(p)                     p##.evex512
> -
> -#define VEC                                    VEC_zmm
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h
> deleted file mode 100644
> index 2b77a59d56..0000000000
> --- a/sysdeps/x86_64/multiarch/sse2-vecs.h
> +++ /dev/null
> @@ -1,47 +0,0 @@
> -/* Common config for SSE2 VECs
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _SSE2_VECS_H
> -#define _SSE2_VECS_H                   1
> -
> -#ifdef VEC_SIZE
> -# error "Multiple VEC configs included!"
> -#endif
> -
> -#define VEC_SIZE                       16
> -#include "vec-macros.h"
> -
> -#define USE_WITH_SSE2          1
> -#define SECTION(p)                     p
> -
> -/* 3-byte mov instructions with SSE2.  */
> -#define MOV_SIZE                       3
> -/* No vzeroupper needed.  */
> -#define RET_SIZE                       1
> -#define VZEROUPPER
> -
> -#define VMOVU                          movups
> -#define VMOVA                          movaps
> -#define VMOVNT                         movntdq
> -
> -#define VEC_xmm                                VEC_any_xmm
> -#define VEC                                    VEC_any_xmm
> -
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h
> deleted file mode 100644
> index 9f3ffecede..0000000000
> --- a/sysdeps/x86_64/multiarch/vec-macros.h
> +++ /dev/null
> @@ -1,90 +0,0 @@
> -/* Macro helpers for VEC_{type}({vec_num})
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _VEC_MACROS_H
> -#define _VEC_MACROS_H                  1
> -
> -#ifndef VEC_SIZE
> -# error "Never include this file directly. Always include a vector config."
> -#endif
> -
> -/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
> -   VEC(N) values.  */
> -#define VEC_hi_xmm0                            xmm16
> -#define VEC_hi_xmm1                            xmm17
> -#define VEC_hi_xmm2                            xmm18
> -#define VEC_hi_xmm3                            xmm19
> -#define VEC_hi_xmm4                            xmm20
> -#define VEC_hi_xmm5                            xmm21
> -#define VEC_hi_xmm6                            xmm22
> -#define VEC_hi_xmm7                            xmm23
> -#define VEC_hi_xmm8                            xmm24
> -#define VEC_hi_xmm9                            xmm25
> -#define VEC_hi_xmm10                   xmm26
> -#define VEC_hi_xmm11                   xmm27
> -#define VEC_hi_xmm12                   xmm28
> -#define VEC_hi_xmm13                   xmm29
> -#define VEC_hi_xmm14                   xmm30
> -#define VEC_hi_xmm15                   xmm31
> -
> -#define VEC_hi_ymm0                            ymm16
> -#define VEC_hi_ymm1                            ymm17
> -#define VEC_hi_ymm2                            ymm18
> -#define VEC_hi_ymm3                            ymm19
> -#define VEC_hi_ymm4                            ymm20
> -#define VEC_hi_ymm5                            ymm21
> -#define VEC_hi_ymm6                            ymm22
> -#define VEC_hi_ymm7                            ymm23
> -#define VEC_hi_ymm8                            ymm24
> -#define VEC_hi_ymm9                            ymm25
> -#define VEC_hi_ymm10                   ymm26
> -#define VEC_hi_ymm11                   ymm27
> -#define VEC_hi_ymm12                   ymm28
> -#define VEC_hi_ymm13                   ymm29
> -#define VEC_hi_ymm14                   ymm30
> -#define VEC_hi_ymm15                   ymm31
> -
> -#define VEC_hi_zmm0                            zmm16
> -#define VEC_hi_zmm1                            zmm17
> -#define VEC_hi_zmm2                            zmm18
> -#define VEC_hi_zmm3                            zmm19
> -#define VEC_hi_zmm4                            zmm20
> -#define VEC_hi_zmm5                            zmm21
> -#define VEC_hi_zmm6                            zmm22
> -#define VEC_hi_zmm7                            zmm23
> -#define VEC_hi_zmm8                            zmm24
> -#define VEC_hi_zmm9                            zmm25
> -#define VEC_hi_zmm10                   zmm26
> -#define VEC_hi_zmm11                   zmm27
> -#define VEC_hi_zmm12                   zmm28
> -#define VEC_hi_zmm13                   zmm29
> -#define VEC_hi_zmm14                   zmm30
> -#define VEC_hi_zmm15                   zmm31
> -
> -#define PRIMITIVE_VEC(vec, num)                vec##num
> -
> -#define VEC_any_xmm(i)                 PRIMITIVE_VEC(xmm, i)
> -#define VEC_any_ymm(i)                 PRIMITIVE_VEC(ymm, i)
> -#define VEC_any_zmm(i)                 PRIMITIVE_VEC(zmm, i)
> -
> -#define VEC_hi_xmm(i)                  PRIMITIVE_VEC(VEC_hi_xmm, i)
> -#define VEC_hi_ymm(i)                  PRIMITIVE_VEC(VEC_hi_ymm, i)
> -#define VEC_hi_zmm(i)                  PRIMITIVE_VEC(VEC_hi_zmm, i)
> -
> -#endif
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v9 3/6] x86: Update memmove to use new VEC macros
  2022-10-15  2:52     ` H.J. Lu
@ 2022-10-15  2:57       ` Noah Goldstein
  0 siblings, 0 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  2:57 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 9:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Oct 14, 2022 at 5:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Replace %VEC(n) -> %VMM(n)
> >
> > This commit does not change libc.so
> >
> > Tested build on x86-64
> > ---
> >  .../memmove-avx-unaligned-erms-rtm.S          |  15 +-
> >  .../multiarch/memmove-avx-unaligned-erms.S    |   9 +-
> >  .../multiarch/memmove-avx512-unaligned-erms.S |  30 +-
> >  .../multiarch/memmove-evex-unaligned-erms.S   |  30 +-
> >  .../multiarch/memmove-sse2-unaligned-erms.S   |  11 +-
> >  .../multiarch/memmove-vec-unaligned-erms.S    | 262 +++++++++---------
> >  6 files changed, 135 insertions(+), 222 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> > index 67a55f0c85..c2a95dc247 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> > @@ -1,16 +1,9 @@
> > -#if IS_IN (libc)
>
> Keep this.
>
> > -# define VEC_SIZE      32
> > -# define VEC(i)                ymm##i
> > -# define VMOVNT                vmovntdq
> > -# define VMOVU         vmovdqu
> > -# define VMOVA         vmovdqa
> > -# define MOV_SIZE      4
> > -# define ZERO_UPPER_VEC_REGISTERS_RETURN \
> > -  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> > +#include <isa-level.h>
> >
> > -# define VZEROUPPER_RETURN jmp  L(return)
> > +#if ISA_SHOULD_BUILD (3)
>
> This ISA_SHOULD_BUILD change isn't needed.

OK, will fix for the next version, although I think we should patch that too.
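For concreteness, the top of that file would then look roughly like this
(just a sketch; it assumes the x86-avx-rtm-vecs.h name from v10 and leaves
the rest of the file as-is):
```
#if IS_IN (libc)
# include "x86-avx-rtm-vecs.h"

# define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm

# include "memmove-vec-unaligned-erms.S"
#endif
```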
>
> > +# include "x86-avx-rtm-vecs.h"
> >
> > -# define SECTION(p)            p##.avx.rtm
> >  # define MEMMOVE_SYMBOL(p,s)   p##_avx_##s##_rtm
> >
> >  # include "memmove-vec-unaligned-erms.S"
> > diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> > index a14b155667..4e4b4635f9 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> > @@ -2,14 +2,7 @@
> >
> >  #if ISA_SHOULD_BUILD (3)
> >
> > -# define VEC_SIZE      32
> > -# define VEC(i)                ymm##i
> > -# define VMOVNT                vmovntdq
> > -# define VMOVU         vmovdqu
> > -# define VMOVA         vmovdqa
> > -# define MOV_SIZE      4
> > -
> > -# define SECTION(p)            p##.avx
> > +# include "x86-avx-vecs.h"
> >
> >  # ifndef MEMMOVE_SYMBOL
> >  #  define MEMMOVE_SYMBOL(p,s)  p##_avx_##s
> > diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> > index 8d1568a7ba..cca97e38f8 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> > @@ -2,35 +2,7 @@
> >
> >  #if ISA_SHOULD_BUILD (4)
> >
> > -# define VEC_SIZE      64
> > -# define XMM0          xmm16
> > -# define XMM1          xmm17
> > -# define YMM0          ymm16
> > -# define YMM1          ymm17
> > -# define VEC0          zmm16
> > -# define VEC1          zmm17
> > -# define VEC2          zmm18
> > -# define VEC3          zmm19
> > -# define VEC4          zmm20
> > -# define VEC5          zmm21
> > -# define VEC6          zmm22
> > -# define VEC7          zmm23
> > -# define VEC8          zmm24
> > -# define VEC9          zmm25
> > -# define VEC10         zmm26
> > -# define VEC11         zmm27
> > -# define VEC12         zmm28
> > -# define VEC13         zmm29
> > -# define VEC14         zmm30
> > -# define VEC15         zmm31
> > -# define VEC(i)                VEC##i
> > -# define VMOVNT                vmovntdq
> > -# define VMOVU         vmovdqu64
> > -# define VMOVA         vmovdqa64
> > -# define VZEROUPPER
> > -# define MOV_SIZE      6
> > -
> > -# define SECTION(p)            p##.evex512
> > +# include "x86-evex512-vecs.h"
> >
> >  # ifndef MEMMOVE_SYMBOL
> >  #  define MEMMOVE_SYMBOL(p,s)  p##_avx512_##s
> > diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> > index 2373017358..1f7b5715f7 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> > @@ -2,35 +2,7 @@
> >
> >  #if ISA_SHOULD_BUILD (4)
> >
> > -# define VEC_SIZE      32
> > -# define XMM0          xmm16
> > -# define XMM1          xmm17
> > -# define YMM0          ymm16
> > -# define YMM1          ymm17
> > -# define VEC0          ymm16
> > -# define VEC1          ymm17
> > -# define VEC2          ymm18
> > -# define VEC3          ymm19
> > -# define VEC4          ymm20
> > -# define VEC5          ymm21
> > -# define VEC6          ymm22
> > -# define VEC7          ymm23
> > -# define VEC8          ymm24
> > -# define VEC9          ymm25
> > -# define VEC10         ymm26
> > -# define VEC11         ymm27
> > -# define VEC12         ymm28
> > -# define VEC13         ymm29
> > -# define VEC14         ymm30
> > -# define VEC15         ymm31
> > -# define VEC(i)                VEC##i
> > -# define VMOVNT                vmovntdq
> > -# define VMOVU         vmovdqu64
> > -# define VMOVA         vmovdqa64
> > -# define VZEROUPPER
> > -# define MOV_SIZE      6
> > -
> > -# define SECTION(p)            p##.evex
> > +# include "x86-evex256-vecs.h"
> >
> >  # ifndef MEMMOVE_SYMBOL
> >  #  define MEMMOVE_SYMBOL(p,s)  p##_evex_##s
> > diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
> > index 422a079902..8431bcd000 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
> > @@ -22,18 +22,9 @@
> >     so we need this to build for ISA V2 builds. */
> >  #if ISA_SHOULD_BUILD (2)
> >
> > -# include <sysdep.h>
> > +# include "x86-sse2-vecs.h"
> >
> > -# define VEC_SIZE      16
> > -# define VEC(i)                xmm##i
> >  # define PREFETCHNT    prefetchnta
> > -# define VMOVNT                movntdq
> > -/* Use movups and movaps for smaller code sizes.  */
> > -# define VMOVU         movups
> > -# define VMOVA         movaps
> > -# define MOV_SIZE      3
> > -
> > -# define SECTION(p)            p
> >
> >  # ifndef MEMMOVE_SYMBOL
> >  #  define MEMMOVE_SYMBOL(p,s)  p##_sse2_##s
> > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > index 04747133b7..5b758cae5e 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > @@ -60,14 +60,6 @@
> >  # define MEMMOVE_CHK_SYMBOL(p,s)       MEMMOVE_SYMBOL(p, s)
> >  #endif
> >
> > -#ifndef XMM0
> > -# define XMM0                          xmm0
> > -#endif
> > -
> > -#ifndef YMM0
> > -# define YMM0                          ymm0
> > -#endif
> > -
> >  #ifndef VZEROUPPER
> >  # if VEC_SIZE > 16
> >  #  define VZEROUPPER vzeroupper
> > @@ -225,13 +217,13 @@ L(start):
> >         cmp     $VEC_SIZE, %RDX_LP
> >         jb      L(less_vec)
> >         /* Load regardless.  */
> > -       VMOVU   (%rsi), %VEC(0)
> > +       VMOVU   (%rsi), %VMM(0)
> >         cmp     $(VEC_SIZE * 2), %RDX_LP
> >         ja      L(more_2x_vec)
> >         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
> > -       VMOVU   %VEC(0), (%rdi)
> > -       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> > +       VMOVU   -VEC_SIZE(%rsi,%rdx), %VMM(1)
> > +       VMOVU   %VMM(0), (%rdi)
> > +       VMOVU   %VMM(1), -VEC_SIZE(%rdi,%rdx)
> >  #if !(defined USE_MULTIARCH && IS_IN (libc))
> >         ZERO_UPPER_VEC_REGISTERS_RETURN
> >  #else
> > @@ -270,15 +262,15 @@ L(start_erms):
> >         cmp     $VEC_SIZE, %RDX_LP
> >         jb      L(less_vec)
> >         /* Load regardless.  */
> > -       VMOVU   (%rsi), %VEC(0)
> > +       VMOVU   (%rsi), %VMM(0)
> >         cmp     $(VEC_SIZE * 2), %RDX_LP
> >         ja      L(movsb_more_2x_vec)
> >         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
> >          */
> > -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(1)
> > -       VMOVU   %VEC(0), (%rdi)
> > -       VMOVU   %VEC(1), -VEC_SIZE(%rdi, %rdx)
> > -L(return):
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(1)
> > +       VMOVU   %VMM(0), (%rdi)
> > +       VMOVU   %VMM(1), -VEC_SIZE(%rdi, %rdx)
> > +L(return_vzeroupper):
> >  # if VEC_SIZE > 16
> >         ZERO_UPPER_VEC_REGISTERS_RETURN
> >  # else
> > @@ -359,10 +351,10 @@ L(between_16_31):
> >         .p2align 4,, 10
> >  L(between_32_63):
> >         /* From 32 to 63.  No branch when size == 32.  */
> > -       VMOVU   (%rsi), %YMM0
> > -       VMOVU   -32(%rsi, %rdx), %YMM1
> > -       VMOVU   %YMM0, (%rdi)
> > -       VMOVU   %YMM1, -32(%rdi, %rdx)
> > +       VMOVU   (%rsi), %VMM_256(0)
> > +       VMOVU   -32(%rsi, %rdx), %VMM_256(1)
> > +       VMOVU   %VMM_256(0), (%rdi)
> > +       VMOVU   %VMM_256(1), -32(%rdi, %rdx)
> >         VZEROUPPER_RETURN
> >  #endif
> >
> > @@ -380,12 +372,12 @@ L(last_4x_vec):
> >         /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
> >
> >         /* VEC(0) and VEC(1) have already been loaded.  */
> > -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(2)
> > -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
> > -       VMOVU   %VEC(0), (%rdi)
> > -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> > -       VMOVU   %VEC(2), -VEC_SIZE(%rdi, %rdx)
> > -       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(2)
> > +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
> > +       VMOVU   %VMM(0), (%rdi)
> > +       VMOVU   %VMM(1), VEC_SIZE(%rdi)
> > +       VMOVU   %VMM(2), -VEC_SIZE(%rdi, %rdx)
> > +       VMOVU   %VMM(3), -(VEC_SIZE * 2)(%rdi, %rdx)
> >         VZEROUPPER_RETURN
> >
> >         .p2align 4
> > @@ -400,24 +392,24 @@ L(more_2x_vec):
> >         cmpq    $(VEC_SIZE * 8), %rdx
> >         ja      L(more_8x_vec)
> >         /* Load VEC(1) regardless. VEC(0) has already been loaded.  */
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
> >         cmpq    $(VEC_SIZE * 4), %rdx
> >         jbe     L(last_4x_vec)
> >         /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
> > -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(4)
> > -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
> > -       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
> > -       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
> > -       VMOVU   %VEC(0), (%rdi)
> > -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> > -       VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > -       VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > -       VMOVU   %VEC(4), -VEC_SIZE(%rdi, %rdx)
> > -       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> > -       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> > -       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(4)
> > +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(5)
> > +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(6)
> > +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(7)
> > +       VMOVU   %VMM(0), (%rdi)
> > +       VMOVU   %VMM(1), VEC_SIZE(%rdi)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> > +       VMOVU   %VMM(4), -VEC_SIZE(%rdi, %rdx)
> > +       VMOVU   %VMM(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +       VMOVU   %VMM(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> > +       VMOVU   %VMM(7), -(VEC_SIZE * 4)(%rdi, %rdx)
> >         VZEROUPPER_RETURN
> >
> >         .p2align 4,, 4
> > @@ -466,14 +458,14 @@ L(more_8x_vec_forward):
> >          */
> >
> >         /* First vec was already loaded into VEC(0).  */
> > -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(5)
> > -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(5)
> > +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
> >         /* Save begining of dst.  */
> >         movq    %rdi, %rcx
> >         /* Align dst to VEC_SIZE - 1.  */
> >         orq     $(VEC_SIZE - 1), %rdi
> > -       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
> > -       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
> > +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(7)
> > +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(8)
> >
> >         /* Subtract dst from src. Add back after dst aligned.  */
> >         subq    %rcx, %rsi
> > @@ -488,25 +480,25 @@ L(more_8x_vec_forward):
> >         .p2align 4,, 11
> >  L(loop_4x_vec_forward):
> >         /* Copy 4 * VEC a time forward.  */
> > -       VMOVU   (%rsi), %VEC(1)
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(2)
> > -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(3)
> > -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(4)
> > +       VMOVU   (%rsi), %VMM(1)
> > +       VMOVU   VEC_SIZE(%rsi), %VMM(2)
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(3)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(4)
> >         subq    $-(VEC_SIZE * 4), %rsi
> > -       VMOVA   %VEC(1), (%rdi)
> > -       VMOVA   %VEC(2), VEC_SIZE(%rdi)
> > -       VMOVA   %VEC(3), (VEC_SIZE * 2)(%rdi)
> > -       VMOVA   %VEC(4), (VEC_SIZE * 3)(%rdi)
> > +       VMOVA   %VMM(1), (%rdi)
> > +       VMOVA   %VMM(2), VEC_SIZE(%rdi)
> > +       VMOVA   %VMM(3), (VEC_SIZE * 2)(%rdi)
> > +       VMOVA   %VMM(4), (VEC_SIZE * 3)(%rdi)
> >         subq    $-(VEC_SIZE * 4), %rdi
> >         cmpq    %rdi, %rdx
> >         ja      L(loop_4x_vec_forward)
> >         /* Store the last 4 * VEC.  */
> > -       VMOVU   %VEC(5), (VEC_SIZE * 3)(%rdx)
> > -       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdx)
> > -       VMOVU   %VEC(7), VEC_SIZE(%rdx)
> > -       VMOVU   %VEC(8), (%rdx)
> > +       VMOVU   %VMM(5), (VEC_SIZE * 3)(%rdx)
> > +       VMOVU   %VMM(6), (VEC_SIZE * 2)(%rdx)
> > +       VMOVU   %VMM(7), VEC_SIZE(%rdx)
> > +       VMOVU   %VMM(8), (%rdx)
> >         /* Store the first VEC.  */
> > -       VMOVU   %VEC(0), (%rcx)
> > +       VMOVU   %VMM(0), (%rcx)
> >         /* Keep L(nop_backward) target close to jmp for 2-byte encoding.
> >          */
> >  L(nop_backward):
> > @@ -523,12 +515,12 @@ L(more_8x_vec_backward):
> >            addresses.  */
> >
> >         /* First vec was also loaded into VEC(0).  */
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(5)
> > -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(6)
> > +       VMOVU   VEC_SIZE(%rsi), %VMM(5)
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(6)
> >         /* Begining of region for 4x backward copy stored in rcx.  */
> >         leaq    (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
> > -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(7)
> > -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(8)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(7)
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(8)
> >         /* Subtract dst from src. Add back after dst aligned.  */
> >         subq    %rdi, %rsi
> >         /* Align dst.  */
> > @@ -540,25 +532,25 @@ L(more_8x_vec_backward):
> >         .p2align 4,, 11
> >  L(loop_4x_vec_backward):
> >         /* Copy 4 * VEC a time backward.  */
> > -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(1)
> > -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > -       VMOVU   (VEC_SIZE * 1)(%rsi), %VEC(3)
> > -       VMOVU   (VEC_SIZE * 0)(%rsi), %VEC(4)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(1)
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > +       VMOVU   (VEC_SIZE * 1)(%rsi), %VMM(3)
> > +       VMOVU   (VEC_SIZE * 0)(%rsi), %VMM(4)
> >         addq    $(VEC_SIZE * -4), %rsi
> > -       VMOVA   %VEC(1), (VEC_SIZE * 3)(%rcx)
> > -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rcx)
> > -       VMOVA   %VEC(3), (VEC_SIZE * 1)(%rcx)
> > -       VMOVA   %VEC(4), (VEC_SIZE * 0)(%rcx)
> > +       VMOVA   %VMM(1), (VEC_SIZE * 3)(%rcx)
> > +       VMOVA   %VMM(2), (VEC_SIZE * 2)(%rcx)
> > +       VMOVA   %VMM(3), (VEC_SIZE * 1)(%rcx)
> > +       VMOVA   %VMM(4), (VEC_SIZE * 0)(%rcx)
> >         addq    $(VEC_SIZE * -4), %rcx
> >         cmpq    %rcx, %rdi
> >         jb      L(loop_4x_vec_backward)
> >         /* Store the first 4 * VEC.  */
> > -       VMOVU   %VEC(0), (%rdi)
> > -       VMOVU   %VEC(5), VEC_SIZE(%rdi)
> > -       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
> > -       VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
> > +       VMOVU   %VMM(0), (%rdi)
> > +       VMOVU   %VMM(5), VEC_SIZE(%rdi)
> > +       VMOVU   %VMM(6), (VEC_SIZE * 2)(%rdi)
> > +       VMOVU   %VMM(7), (VEC_SIZE * 3)(%rdi)
> >         /* Store the last VEC.  */
> > -       VMOVU   %VEC(8), -VEC_SIZE(%rdx, %rdi)
> > +       VMOVU   %VMM(8), -VEC_SIZE(%rdx, %rdi)
> >         VZEROUPPER_RETURN
> >
> >  #if defined USE_MULTIARCH && IS_IN (libc)
> > @@ -568,7 +560,7 @@ L(loop_4x_vec_backward):
> >  # if ALIGN_MOVSB
> >  L(skip_short_movsb_check):
> >  #  if MOVSB_ALIGN_TO > VEC_SIZE
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
> >  #  endif
> >  #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> >  #   error Unsupported MOVSB_ALIGN_TO
> > @@ -597,9 +589,9 @@ L(skip_short_movsb_check):
> >
> >         rep     movsb
> >
> > -       VMOVU   %VEC(0), (%r8)
> > +       VMOVU   %VMM(0), (%r8)
> >  #  if MOVSB_ALIGN_TO > VEC_SIZE
> > -       VMOVU   %VEC(1), VEC_SIZE(%r8)
> > +       VMOVU   %VMM(1), VEC_SIZE(%r8)
> >  #  endif
> >         VZEROUPPER_RETURN
> >  # endif
> > @@ -640,7 +632,7 @@ L(movsb):
> >  # endif
> >  # if ALIGN_MOVSB
> >  #  if MOVSB_ALIGN_TO > VEC_SIZE
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
> >  #  endif
> >  #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> >  #   error Unsupported MOVSB_ALIGN_TO
> > @@ -664,9 +656,9 @@ L(movsb_align_dst):
> >         rep     movsb
> >
> >         /* Store VECs loaded for aligning.  */
> > -       VMOVU   %VEC(0), (%r8)
> > +       VMOVU   %VMM(0), (%r8)
> >  #  if MOVSB_ALIGN_TO > VEC_SIZE
> > -       VMOVU   %VEC(1), VEC_SIZE(%r8)
> > +       VMOVU   %VMM(1), VEC_SIZE(%r8)
> >  #  endif
> >         VZEROUPPER_RETURN
> >  # else /* !ALIGN_MOVSB.  */
> > @@ -701,18 +693,18 @@ L(large_memcpy_2x):
> >
> >         /* First vec was also loaded into VEC(0).  */
> >  # if VEC_SIZE < 64
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
> >  #  if VEC_SIZE < 32
> > -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
> >  #  endif
> >  # endif
> > -       VMOVU   %VEC(0), (%rdi)
> > +       VMOVU   %VMM(0), (%rdi)
> >  # if VEC_SIZE < 64
> > -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> > +       VMOVU   %VMM(1), VEC_SIZE(%rdi)
> >  #  if VEC_SIZE < 32
> > -       VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > -       VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> >  #  endif
> >  # endif
> >
> > @@ -761,12 +753,12 @@ L(loop_large_memcpy_2x_inner):
> >         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> >         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
> >         /* Load vectors from rsi.  */
> > -       LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > -       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +       LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
> > +       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
> >         subq    $-LARGE_LOAD_SIZE, %rsi
> >         /* Non-temporal store vectors to rdi.  */
> > -       STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > -       STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +       STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
> > +       STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
> >         subq    $-LARGE_LOAD_SIZE, %rdi
> >         decl    %ecx
> >         jnz     L(loop_large_memcpy_2x_inner)
> > @@ -785,31 +777,31 @@ L(loop_large_memcpy_2x_tail):
> >         /* Copy 4 * VEC a time forward with non-temporal stores.  */
> >         PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> >         PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> > -       VMOVU   (%rsi), %VEC(0)
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > +       VMOVU   (%rsi), %VMM(0)
> > +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
> >         subq    $-(VEC_SIZE * 4), %rsi
> >         addl    $-(VEC_SIZE * 4), %edx
> > -       VMOVA   %VEC(0), (%rdi)
> > -       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > -       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > +       VMOVA   %VMM(0), (%rdi)
> > +       VMOVA   %VMM(1), VEC_SIZE(%rdi)
> > +       VMOVA   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > +       VMOVA   %VMM(3), (VEC_SIZE * 3)(%rdi)
> >         subq    $-(VEC_SIZE * 4), %rdi
> >         cmpl    $(VEC_SIZE * 4), %edx
> >         ja      L(loop_large_memcpy_2x_tail)
> >
> >  L(large_memcpy_2x_end):
> >         /* Store the last 4 * VEC.  */
> > -       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > -       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > -
> > -       VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > -       VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > -       VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > -       VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> > +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
> > +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
> > +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(3)
> > +
> > +       VMOVU   %VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > +       VMOVU   %VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > +       VMOVU   %VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +       VMOVU   %VMM(3), -VEC_SIZE(%rdi, %rdx)
> >         VZEROUPPER_RETURN
> >
> >         .p2align 4
> > @@ -831,16 +823,16 @@ L(loop_large_memcpy_4x_inner):
> >         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> >         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
> >         /* Load vectors from rsi.  */
> > -       LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > -       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > -       LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > -       LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > +       LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
> > +       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
> > +       LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
> > +       LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
> >         subq    $-LARGE_LOAD_SIZE, %rsi
> >         /* Non-temporal store vectors to rdi.  */
> > -       STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > -       STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > -       STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > -       STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > +       STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
> > +       STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
> > +       STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
> > +       STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
> >         subq    $-LARGE_LOAD_SIZE, %rdi
> >         decl    %ecx
> >         jnz     L(loop_large_memcpy_4x_inner)
> > @@ -858,31 +850,31 @@ L(loop_large_memcpy_4x_tail):
> >         /* Copy 4 * VEC a time forward with non-temporal stores.  */
> >         PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> >         PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> > -       VMOVU   (%rsi), %VEC(0)
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > +       VMOVU   (%rsi), %VMM(0)
> > +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
> >         subq    $-(VEC_SIZE * 4), %rsi
> >         addl    $-(VEC_SIZE * 4), %edx
> > -       VMOVA   %VEC(0), (%rdi)
> > -       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > -       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > +       VMOVA   %VMM(0), (%rdi)
> > +       VMOVA   %VMM(1), VEC_SIZE(%rdi)
> > +       VMOVA   %VMM(2), (VEC_SIZE * 2)(%rdi)
> > +       VMOVA   %VMM(3), (VEC_SIZE * 3)(%rdi)
> >         subq    $-(VEC_SIZE * 4), %rdi
> >         cmpl    $(VEC_SIZE * 4), %edx
> >         ja      L(loop_large_memcpy_4x_tail)
> >
> >  L(large_memcpy_4x_end):
> >         /* Store the last 4 * VEC.  */
> > -       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > -       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > -
> > -       VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > -       VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > -       VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > -       VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> > +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
> > +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
> > +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(3)
> > +
> > +       VMOVU   %VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > +       VMOVU   %VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > +       VMOVU   %VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +       VMOVU   %VMM(3), -VEC_SIZE(%rdi, %rdx)
> >         VZEROUPPER_RETURN
> >  #endif
> >  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v9 6/6] x86: Update strlen-evex-base to use new reg/vec macros.
  2022-10-15  0:21   ` [PATCH v9 6/6] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
@ 2022-10-15  2:58     ` H.J. Lu
  0 siblings, 0 replies; 72+ messages in thread
From: H.J. Lu @ 2022-10-15  2:58 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Fri, Oct 14, 2022 at 5:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> To avoid duplicating the VMM / GPR / mask insn macros in all incoming
> evex512 files, use the macros defined in 'reg-macros.h' and
> '{vec}-macros.h'
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/strlen-evex-base.S | 116 +++++++-------------
>  sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
>  2 files changed, 44 insertions(+), 76 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> index 418e9f8411..c832b15a48 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> @@ -36,42 +36,10 @@
>  #  define CHAR_SIZE    1
>  # endif
>
> -# define XMM0          xmm16
>  # define PAGE_SIZE     4096
>  # define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
> -# if VEC_SIZE == 64
> -#  define KMOV         kmovq
> -#  define KORTEST      kortestq
> -#  define RAX          rax
> -#  define RCX          rcx
> -#  define RDX          rdx
> -#  define SHR          shrq
> -#  define TEXTSUFFIX   evex512
> -#  define VMM0         zmm16
> -#  define VMM1         zmm17
> -#  define VMM2         zmm18
> -#  define VMM3         zmm19
> -#  define VMM4         zmm20
> -#  define VMOVA                vmovdqa64
> -# elif VEC_SIZE == 32
> -/* Currently Unused.  */
> -#  define KMOV         kmovd
> -#  define KORTEST      kortestd
> -#  define RAX          eax
> -#  define RCX          ecx
> -#  define RDX          edx
> -#  define SHR          shrl
> -#  define TEXTSUFFIX   evex256
> -#  define VMM0         ymm16
> -#  define VMM1         ymm17
> -#  define VMM2         ymm18
> -#  define VMM3         ymm19
> -#  define VMM4         ymm20
> -#  define VMOVA                vmovdqa32
> -# endif
> -
> -       .section .text.TEXTSUFFIX, "ax", @progbits
> +       .section SECTION(.text),"ax",@progbits
>  /* Aligning entry point to 64 byte, provides better performance for
>     one vector length string.  */
>  ENTRY_P2ALIGN (STRLEN, 6)
> @@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
>  # endif
>
>         movl    %edi, %eax
> -       vpxorq  %XMM0, %XMM0, %XMM0
> +       vpxorq  %VMM_128(0), %VMM_128(0), %VMM_128(0)
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         ja      L(page_cross)
>
>         /* Compare [w]char for null, mask bit will be set for match.  */
> -       VPCMP   $0, (%rdi), %VMM0, %k0
> -       KMOV    %k0, %RAX
> -       test    %RAX, %RAX
> +       VPCMP   $0, (%rdi), %VMM(0), %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jz      L(align_more)
>
> -       bsf     %RAX, %RAX
> +       bsf     %VRAX, %VRAX
>  # ifdef USE_AS_STRNLEN
>         cmpq    %rsi, %rax
>         cmovnb  %rsi, %rax
> @@ -120,7 +88,7 @@ L(align_more):
>         movq    %rax, %rdx
>         subq    %rdi, %rdx
>  #  ifdef USE_AS_WCSLEN
> -       SHR     $2, %RDX
> +       shr     $2, %VRDX
>  #  endif
>         /* At this point rdx contains [w]chars already compared.  */
>         subq    %rsi, %rdx
> @@ -131,9 +99,9 @@ L(align_more):
>  # endif
>
>         /* Loop unroll 4 times for 4 vector loop.  */
> -       VPCMP   $0, (%rax), %VMM0, %k0
> -       KMOV    %k0, %RCX
> -       test    %RCX, %RCX
> +       VPCMP   $0, (%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x1)
>
>  # ifdef USE_AS_STRNLEN
> @@ -141,9 +109,9 @@ L(align_more):
>         jbe     L(ret_max)
>  # endif
>
> -       VPCMP   $0, VEC_SIZE(%rax), %VMM0, %k0
> -       KMOV    %k0, %RCX
> -       test    %RCX, %RCX
> +       VPCMP   $0, VEC_SIZE(%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x2)
>
>  # ifdef USE_AS_STRNLEN
> @@ -151,9 +119,9 @@ L(align_more):
>         jbe     L(ret_max)
>  # endif
>
> -       VPCMP   $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
> -       KMOV    %k0, %RCX
> -       test    %RCX, %RCX
> +       VPCMP   $0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x3)
>
>  # ifdef USE_AS_STRNLEN
> @@ -161,9 +129,9 @@ L(align_more):
>         jbe     L(ret_max)
>  # endif
>
> -       VPCMP   $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
> -       KMOV    %k0, %RCX
> -       test    %RCX, %RCX
> +       VPCMP   $0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x4)
>
>  # ifdef USE_AS_STRNLEN
> @@ -179,7 +147,7 @@ L(align_more):
>  # ifdef USE_AS_STRNLEN
>         subq    %rax, %rcx
>  #  ifdef USE_AS_WCSLEN
> -       SHR     $2, %RCX
> +       shr     $2, %VRCX
>  #  endif
>         /* rcx contains number of [w]char will be recompared due to
>            alignment fixes.  rdx must be incremented by rcx to offset
> @@ -199,42 +167,42 @@ L(loop_entry):
>  # endif
>         /* VPMINU and VPCMP combination provide better performance as
>            compared to alternative combinations.  */
> -       VMOVA   (VEC_SIZE * 4)(%rax), %VMM1
> -       VPMINU  (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
> -       VMOVA   (VEC_SIZE * 6)(%rax), %VMM3
> -       VPMINU  (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
> +       VMOVA   (VEC_SIZE * 4)(%rax), %VMM(1)
> +       VPMINU  (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rax), %VMM(3)
> +       VPMINU  (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
>
> -       VPTESTN %VMM2, %VMM2, %k0
> -       VPTESTN %VMM4, %VMM4, %k1
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       VPTESTN %VMM(4), %VMM(4), %k1
>
>         subq    $-(VEC_SIZE * 4), %rax
>         KORTEST %k0, %k1
>         jz      L(loop)
>
> -       VPTESTN %VMM1, %VMM1, %k2
> -       KMOV    %k2, %RCX
> -       test    %RCX, %RCX
> +       VPTESTN %VMM(1), %VMM(1), %k2
> +       KMOV    %k2, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x1)
>
> -       KMOV    %k0, %RCX
> +       KMOV    %k0, %VRCX
>         /* At this point, if k0 is non zero, null char must be in the
>            second vector.  */
> -       test    %RCX, %RCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x2)
>
> -       VPTESTN %VMM3, %VMM3, %k3
> -       KMOV    %k3, %RCX
> -       test    %RCX, %RCX
> +       VPTESTN %VMM(3), %VMM(3), %k3
> +       KMOV    %k3, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x3)
>         /* At this point null [w]char must be in the fourth vector so no
>            need to check.  */
> -       KMOV    %k1, %RCX
> +       KMOV    %k1, %VRCX
>
>         /* Fourth, third, second vector terminating are pretty much
>            same, implemented this way to avoid branching and reuse code
>            from pre loop exit condition.  */
>  L(ret_vec_x4):
> -       bsf     %RCX, %RCX
> +       bsf     %VRCX, %VRCX
>         subq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
>         subq    $-(VEC_SIZE * 3), %rax
> @@ -250,7 +218,7 @@ L(ret_vec_x4):
>         ret
>
>  L(ret_vec_x3):
> -       bsf     %RCX, %RCX
> +       bsf     %VRCX, %VRCX
>         subq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
>         subq    $-(VEC_SIZE * 2), %rax
> @@ -268,7 +236,7 @@ L(ret_vec_x3):
>  L(ret_vec_x2):
>         subq    $-VEC_SIZE, %rax
>  L(ret_vec_x1):
> -       bsf     %RCX, %RCX
> +       bsf     %VRCX, %VRCX
>         subq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> @@ -289,13 +257,13 @@ L(page_cross):
>         /* ecx contains number of w[char] to be skipped as a result
>            of address alignment.  */
>         xorq    %rdi, %rax
> -       VPCMP   $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
> -       KMOV    %k0, %RAX
> +       VPCMP   $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRAX
>         /* Ignore number of character for alignment adjustment.  */
> -       SHR     %cl, %RAX
> +       shr     %cl, %VRAX
>         jz      L(align_more)
>
> -       bsf     %RAX, %RAX
> +       bsf     %VRAX, %VRAX
>  # ifdef USE_AS_STRNLEN
>         cmpq    %rsi, %rax
>         cmovnb  %rsi, %rax
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> index 116f8981c8..10c3415c8a 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex512.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> @@ -2,6 +2,6 @@
>  # define STRLEN                __strlen_evex512
>  #endif
>
> -#define VEC_SIZE       64
> -
> +#include "x86-evex512-vecs.h"
> +#include "reg-macros.h"
>  #include "strlen-evex-base.S"
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v10 1/6] x86: Update VEC macros to complete API for evex/evex512 impls
  2022-10-14 16:40 [PATCH v1 1/3] x86: Update evex256/512 vec macros Noah Goldstein
                   ` (9 preceding siblings ...)
  2022-10-15  0:20 ` [PATCH v9 " Noah Goldstein
@ 2022-10-15  3:00 ` Noah Goldstein
  2022-10-15  3:00   ` [PATCH v10 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
                     ` (5 more replies)
  10 siblings, 6 replies; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  3:00 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

1) Copy so that backport will be easier.
2) Only define SECTION if there is not a previous definition
3) Add `VEC_lo` definition for proper reg-width but in the
   ymm/zmm0-15 range.
4) Add macros for accessing GPRs based on VEC_SIZE
        This is to make it easier to do things like:
        ```
            vpcmpb %VEC(0), %VEC(1), %k0
            kmov{d|q} %k0, %{eax|rax}
            test %{eax|rax}
        ```
        It adds macros such that any GPR can get the proper width with:
            `V{upcase_GPR_name}`

        and any mask insn can get the proper width with:
            `{upcase_mask_insn_without_postfix}`

This commit does not change libc.so

Tested build on x86-64
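
As a quick illustration of the intended expansion (illustrative only, not
part of the patch; it just follows the reg-macros.h definitions below):
```
	/* Assuming VEC_SIZE == 64 (x86-evex512-vecs.h) and USE_WIDE_CHAR
	   not defined, REG_WIDTH is 64 and:  */
	KMOV	%k0, %VRAX		/* expands to: kmovq %k0, %rax  */
	test	%VRAX, %VRAX		/* expands to: test  %rax, %rax  */

	/* Built against the evex256 header (VEC_SIZE == 32), or with
	   USE_WIDE_CHAR defined, the same two lines expand to:  */
	KMOV	%k0, %VRAX		/* kmovd %k0, %eax  */
	test	%VRAX, %VRAX		/* test  %eax, %eax  */

	/* An explicit width can still be requested where needed:  */
	shr	%cl, %VGPR_SZ(rax, 32)	/* always %eax  */
```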
---
 sysdeps/x86_64/multiarch/reg-macros.h         | 168 ++++++++++++++++++
 .../multiarch/scripts/gen-reg-macros.py       | 133 ++++++++++++++
 sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h   |  35 ++++
 sysdeps/x86_64/multiarch/x86-avx-vecs.h       |  47 +++++
 .../x86_64/multiarch/x86-evex-vecs-common.h   |  39 ++++
 sysdeps/x86_64/multiarch/x86-evex256-vecs.h   |  38 ++++
 sysdeps/x86_64/multiarch/x86-evex512-vecs.h   |  38 ++++
 sysdeps/x86_64/multiarch/x86-sse2-vecs.h      |  47 +++++
 sysdeps/x86_64/multiarch/x86-vec-macros.h     |  90 ++++++++++
 9 files changed, 635 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
 create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-avx-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-evex256-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-evex512-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-sse2-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/x86-vec-macros.h

diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
new file mode 100644
index 0000000000..c8ea330256
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/reg-macros.h
@@ -0,0 +1,168 @@
+/* This file was generated by: gen-reg-macros.py.
+
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _REG_MACROS_H
+#define _REG_MACROS_H	1
+
+#define rax_8	al
+#define rax_16	ax
+#define rax_32	eax
+#define rax_64	rax
+#define rbx_8	bl
+#define rbx_16	bx
+#define rbx_32	ebx
+#define rbx_64	rbx
+#define rcx_8	cl
+#define rcx_16	cx
+#define rcx_32	ecx
+#define rcx_64	rcx
+#define rdx_8	dl
+#define rdx_16	dx
+#define rdx_32	edx
+#define rdx_64	rdx
+#define rbp_8	bpl
+#define rbp_16	bp
+#define rbp_32	ebp
+#define rbp_64	rbp
+#define rsp_8	spl
+#define rsp_16	sp
+#define rsp_32	esp
+#define rsp_64	rsp
+#define rsi_8	sil
+#define rsi_16	si
+#define rsi_32	esi
+#define rsi_64	rsi
+#define rdi_8	dil
+#define rdi_16	di
+#define rdi_32	edi
+#define rdi_64	rdi
+#define r8_8	r8b
+#define r8_16	r8w
+#define r8_32	r8d
+#define r8_64	r8
+#define r9_8	r9b
+#define r9_16	r9w
+#define r9_32	r9d
+#define r9_64	r9
+#define r10_8	r10b
+#define r10_16	r10w
+#define r10_32	r10d
+#define r10_64	r10
+#define r11_8	r11b
+#define r11_16	r11w
+#define r11_32	r11d
+#define r11_64	r11
+#define r12_8	r12b
+#define r12_16	r12w
+#define r12_32	r12d
+#define r12_64	r12
+#define r13_8	r13b
+#define r13_16	r13w
+#define r13_32	r13d
+#define r13_64	r13
+#define r14_8	r14b
+#define r14_16	r14w
+#define r14_32	r14d
+#define r14_64	r14
+#define r15_8	r15b
+#define r15_16	r15w
+#define r15_32	r15d
+#define r15_64	r15
+
+#define kmov_8	kmovb
+#define kmov_16	kmovw
+#define kmov_32	kmovd
+#define kmov_64	kmovq
+#define kortest_8	kortestb
+#define kortest_16	kortestw
+#define kortest_32	kortestd
+#define kortest_64	kortestq
+#define kor_8	korb
+#define kor_16	korw
+#define kor_32	kord
+#define kor_64	korq
+#define ktest_8	ktestb
+#define ktest_16	ktestw
+#define ktest_32	ktestd
+#define ktest_64	ktestq
+#define kand_8	kandb
+#define kand_16	kandw
+#define kand_32	kandd
+#define kand_64	kandq
+#define kxor_8	kxorb
+#define kxor_16	kxorw
+#define kxor_32	kxord
+#define kxor_64	kxorq
+#define knot_8	knotb
+#define knot_16	knotw
+#define knot_32	knotd
+#define knot_64	knotq
+#define kxnor_8	kxnorb
+#define kxnor_16	kxnorw
+#define kxnor_32	kxnord
+#define kxnor_64	kxnorq
+#define kunpack_8	kunpackbw
+#define kunpack_16	kunpackwd
+#define kunpack_32	kunpackdq
+
+/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
+#define VRAX	VGPR(rax)
+#define VRBX	VGPR(rbx)
+#define VRCX	VGPR(rcx)
+#define VRDX	VGPR(rdx)
+#define VRBP	VGPR(rbp)
+#define VRSP	VGPR(rsp)
+#define VRSI	VGPR(rsi)
+#define VRDI	VGPR(rdi)
+#define VR8	VGPR(r8)
+#define VR9	VGPR(r9)
+#define VR10	VGPR(r10)
+#define VR11	VGPR(r11)
+#define VR12	VGPR(r12)
+#define VR13	VGPR(r13)
+#define VR14	VGPR(r14)
+#define VR15	VGPR(r15)
+
+/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
+#define KMOV 	VKINSN(kmov)
+#define KORTEST 	VKINSN(kortest)
+#define KOR 	VKINSN(kor)
+#define KTEST 	VKINSN(ktest)
+#define KAND 	VKINSN(kand)
+#define KXOR 	VKINSN(kxor)
+#define KNOT 	VKINSN(knot)
+#define KXNOR 	VKINSN(kxnor)
+#define KUNPACK 	VKINSN(kunpack)
+
+#ifdef USE_WIDE_CHAR
+# define REG_WIDTH 32
+#else
+# define REG_WIDTH VEC_SIZE
+#endif
+
+#define VPASTER(x, y)	x##_##y
+#define VEVALUATOR(x, y)	VPASTER(x, y)
+
+#define VGPR_SZ(reg_name, reg_size)	VEVALUATOR(reg_name, reg_size)
+#define VKINSN_SZ(insn, reg_size)	VEVALUATOR(insn, reg_size)
+
+#define VGPR(reg_name)	VGPR_SZ(reg_name, REG_WIDTH)
+#define VKINSN(mask_insn)	VKINSN_SZ(mask_insn, REG_WIDTH)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
new file mode 100644
index 0000000000..9fb6903212
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
@@ -0,0 +1,133 @@
+#!/usr/bin/python3
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+"""Generate macros for getting GPR name of a certain size
+
+Inputs: None
+Output: Prints header file to stdout
+
+API:
+    V{upcase_GPR_name}
+        - Get register name REG_WIDTH component of `upcase_GPR_name`
+    {upcase_mask_insn_without_postfix}
+        - Get proper REG_WIDTH mask insn for `upcase_mask_insn_without_postfix`
+    VGPR(reg_name)
+        - Get register name REG_WIDTH component of `reg_name`
+    VKINSN(mask_insn)
+        - Get proper REG_WIDTH mask insn for `mask_insn`
+    VGPR_SZ(reg_name, reg_size)
+        - Get register name `reg_size` component of `reg_name`
+    VKINSN_SZ(mask_insn, insn_size)
+        - Get proper `insn_size` mask insn for `mask_insn`
+"""
+
+import sys
+import os
+from datetime import datetime
+
+registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
+             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
+             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
+             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
+             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
+             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
+             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
+             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
+
+mask_insns = [
+    "kmov",
+    "kortest",
+    "kor",
+    "ktest",
+    "kand",
+    "kxor",
+    "knot",
+    "kxnor",
+]
+mask_insns_ext = ["b", "w", "d", "q"]
+
+cr = """
+   Copyright (C) {} Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+"""
+
+print("/* This file was generated by: {}.".format(os.path.basename(
+    sys.argv[0])))
+print(cr.format(datetime.today().year))
+
+print("#ifndef _REG_MACROS_H")
+print("#define _REG_MACROS_H\t1")
+print("")
+for reg in registers:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
+
+print("")
+for mask_insn in mask_insns:
+    for i in range(0, 4):
+        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
+                                           mask_insns_ext[i]))
+for i in range(0, 3):
+    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
+                                                   mask_insns_ext[i + 1]))
+mask_insns.append("kunpack")
+
+print("")
+print(
+    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
+for reg in registers:
+    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
+
+print("")
+
+print(
+    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
+)
+for mask_insn in mask_insns:
+    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
+print("")
+
+print("#ifdef USE_WIDE_CHAR")
+print("# define REG_WIDTH 32")
+print("#else")
+print("# define REG_WIDTH VEC_SIZE")
+print("#endif")
+print("")
+print("#define VPASTER(x, y)\tx##_##y")
+print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
+print("")
+print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
+print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
+print("")
+print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
+print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
+
+print("\n#endif")
diff --git a/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
new file mode 100644
index 0000000000..0b326c8a70
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
@@ -0,0 +1,35 @@
+/* Common config for AVX-RTM VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX_RTM_VECS_H
+#define _X86_AVX_RTM_VECS_H			1
+
+#define COND_VZEROUPPER			COND_VZEROUPPER_XTEST
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN		jmp L(return_vzeroupper)
+
+#define USE_WITH_RTM			1
+#include "x86-avx-vecs.h"
+
+#undef SECTION
+#define SECTION(p)				p##.avx.rtm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
new file mode 100644
index 0000000000..dca1089060
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for AVX VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_AVX_VECS_H
+#define _X86_AVX_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			32
+#include "x86-vec-macros.h"
+
+#define USE_WITH_AVX		1
+#define SECTION(p)			p##.avx
+
+/* 4-byte mov instructions with AVX2.  */
+#define MOV_SIZE			4
+/* 1 (ret) + 3 (vzeroupper).  */
+#define RET_SIZE			4
+#define VZEROUPPER			vzeroupper
+
+#define VMOVU				vmovdqu
+#define VMOVA				vmovdqa
+#define VMOVNT				vmovntdq
+
+/* Often need to access xmm portion.  */
+#define VMM_128				VMM_any_xmm
+#define VMM					VMM_any_ymm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
new file mode 100644
index 0000000000..f331e9d8ec
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
@@ -0,0 +1,39 @@
+/* Common config for EVEX256 and EVEX512 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_EVEX_VECS_COMMON_H
+#define _X86_EVEX_VECS_COMMON_H			1
+
+#include "x86-vec-macros.h"
+
+/* 6-byte mov instructions with EVEX.  */
+#define MOV_SIZE			6
+/* No vzeroupper needed.  */
+#define RET_SIZE			1
+#define VZEROUPPER
+
+#define VMOVU				vmovdqu64
+#define VMOVA				vmovdqa64
+#define VMOVNT				vmovntdq
+
+#define VMM_128				VMM_hi_xmm
+#define VMM_256				VMM_hi_ymm
+#define VMM_512				VMM_hi_zmm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-evex256-vecs.h b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
new file mode 100644
index 0000000000..8337b95504
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
@@ -0,0 +1,38 @@
+/* Common config for EVEX256 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _EVEX256_VECS_H
+#define _EVEX256_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			32
+#include "x86-evex-vecs-common.h"
+
+#define USE_WITH_EVEX256	1
+
+#ifndef SECTION
+# define SECTION(p)			p##.evex
+#endif
+
+#define VMM					VMM_256
+#define VMM_lo				VMM_any_ymm
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-evex512-vecs.h b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
new file mode 100644
index 0000000000..7dc5c23ad0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
@@ -0,0 +1,38 @@
+/* Common config for EVEX512 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _EVEX512_VECS_H
+#define _EVEX512_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			64
+#include "x86-evex-vecs-common.h"
+
+#define USE_WITH_EVEX512	1
+
+#ifndef SECTION
+# define SECTION(p)			p##.evex512
+#endif
+
+#define VMM					VMM_512
+#define VMM_lo				VMM_any_zmm
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-sse2-vecs.h b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
new file mode 100644
index 0000000000..b8bbd5dc29
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for SSE2 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_SSE2_VECS_H
+#define _X86_SSE2_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			16
+#include "x86-vec-macros.h"
+
+#define USE_WITH_SSE2		1
+#define SECTION(p)			p
+
+/* 3-byte mov instructions with SSE2.  */
+#define MOV_SIZE			3
+/* No vzeroupper needed.  */
+#define RET_SIZE			1
+#define VZEROUPPER
+
+#define VMOVU				movups
+#define VMOVA				movaps
+#define VMOVNT				movntdq
+
+#define VMM_128				VMM_any_xmm
+#define VMM					VMM_any_xmm
+
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/x86-vec-macros.h b/sysdeps/x86_64/multiarch/x86-vec-macros.h
new file mode 100644
index 0000000000..7d6bb31d55
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/x86-vec-macros.h
@@ -0,0 +1,90 @@
+/* Macro helpers for VMM_{type}({vec_num})
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_VEC_MACROS_H
+#define _X86_VEC_MACROS_H			1
+
+#ifndef VEC_SIZE
+# error "Never include this file directly. Always include a vector config."
+#endif
+
+/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
+   VMM(N) values.  */
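+
+/* For instance, with x86-evex256-vecs.h included, VMM(1) expands through
+   VMM_256(1) -> VMM_hi_ymm1 -> ymm17, while with x86-sse2-vecs.h the same
+   VMM(1) becomes xmm1 (illustrative expansions; the defines involved are
+   below and in the per-config headers).  */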
+#define VMM_hi_xmm0				xmm16
+#define VMM_hi_xmm1				xmm17
+#define VMM_hi_xmm2				xmm18
+#define VMM_hi_xmm3				xmm19
+#define VMM_hi_xmm4				xmm20
+#define VMM_hi_xmm5				xmm21
+#define VMM_hi_xmm6				xmm22
+#define VMM_hi_xmm7				xmm23
+#define VMM_hi_xmm8				xmm24
+#define VMM_hi_xmm9				xmm25
+#define VMM_hi_xmm10			xmm26
+#define VMM_hi_xmm11			xmm27
+#define VMM_hi_xmm12			xmm28
+#define VMM_hi_xmm13			xmm29
+#define VMM_hi_xmm14			xmm30
+#define VMM_hi_xmm15			xmm31
+
+#define VMM_hi_ymm0				ymm16
+#define VMM_hi_ymm1				ymm17
+#define VMM_hi_ymm2				ymm18
+#define VMM_hi_ymm3				ymm19
+#define VMM_hi_ymm4				ymm20
+#define VMM_hi_ymm5				ymm21
+#define VMM_hi_ymm6				ymm22
+#define VMM_hi_ymm7				ymm23
+#define VMM_hi_ymm8				ymm24
+#define VMM_hi_ymm9				ymm25
+#define VMM_hi_ymm10			ymm26
+#define VMM_hi_ymm11			ymm27
+#define VMM_hi_ymm12			ymm28
+#define VMM_hi_ymm13			ymm29
+#define VMM_hi_ymm14			ymm30
+#define VMM_hi_ymm15			ymm31
+
+#define VMM_hi_zmm0				zmm16
+#define VMM_hi_zmm1				zmm17
+#define VMM_hi_zmm2				zmm18
+#define VMM_hi_zmm3				zmm19
+#define VMM_hi_zmm4				zmm20
+#define VMM_hi_zmm5				zmm21
+#define VMM_hi_zmm6				zmm22
+#define VMM_hi_zmm7				zmm23
+#define VMM_hi_zmm8				zmm24
+#define VMM_hi_zmm9				zmm25
+#define VMM_hi_zmm10			zmm26
+#define VMM_hi_zmm11			zmm27
+#define VMM_hi_zmm12			zmm28
+#define VMM_hi_zmm13			zmm29
+#define VMM_hi_zmm14			zmm30
+#define VMM_hi_zmm15			zmm31
+
+#define PRIMITIVE_VMM(vec, num)		vec##num
+
+#define VMM_any_xmm(i)			PRIMITIVE_VMM(xmm, i)
+#define VMM_any_ymm(i)			PRIMITIVE_VMM(ymm, i)
+#define VMM_any_zmm(i)			PRIMITIVE_VMM(zmm, i)
+
+#define VMM_hi_xmm(i)			PRIMITIVE_VMM(VMM_hi_xmm, i)
+#define VMM_hi_ymm(i)			PRIMITIVE_VMM(VMM_hi_ymm, i)
+#define VMM_hi_zmm(i)			PRIMITIVE_VMM(VMM_hi_zmm, i)
+
+#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v10 2/6] x86: Update memrchr to use new VEC macros
  2022-10-15  3:00 ` [PATCH v10 " Noah Goldstein
@ 2022-10-15  3:00   ` Noah Goldstein
  2022-10-15  3:44     ` Sunil Pandey
  2022-10-15  3:00   ` [PATCH v10 3/6] x86: Update memmove " Noah Goldstein
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  3:00 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Replace %VEC(n) -> %VMM(n)
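
For this file the substitution is mechanical: with the x86-evex256-vecs.h
config it now includes, the new name should resolve to the same register the
old VEC(0) did (illustrative expansion, not code taken from the patch):
```
	/* VMM(0) -> VMM_256(0) -> VMM_hi_ymm0 -> ymm16.  */
	vpbroadcastb %esi, %VMM(0)
	vpcmpb	$0, (%rdi), %VMM(0), %k0
	kmovd	%k0, %ecx
```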

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/memrchr-evex.S | 42 ++++++++++++-------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index ea3a0a0a60..550b328c5a 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -21,7 +21,7 @@
 #if ISA_SHOULD_BUILD (4)
 
 # include <sysdep.h>
-# include "evex256-vecs.h"
+# include "x86-evex256-vecs.h"
 # if VEC_SIZE != 32
 #  error "VEC_SIZE != 32 unimplemented"
 # endif
@@ -31,7 +31,7 @@
 # endif
 
 # define PAGE_SIZE			4096
-# define VECMATCH			VEC(0)
+# define VMMMATCH			VMM(0)
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN(MEMRCHR, 6)
@@ -47,7 +47,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 	   correct page cross check and 2) it correctly sets up end ptr to be
 	   subtract by lzcnt aligned.  */
 	leaq	-1(%rdi, %rdx), %rax
-	vpbroadcastb %esi, %VECMATCH
+	vpbroadcastb %esi, %VMMMATCH
 
 	/* Check if we can load 1x VEC without cross a page.  */
 	testl	$(PAGE_SIZE - VEC_SIZE), %eax
@@ -55,7 +55,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 
 	/* Don't use rax for pointer here because EVEX has better encoding with
 	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
@@ -96,7 +96,7 @@ L(more_1x_vec):
 	movq	%rax, %rdx
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	subq	%rdi, %rdx
@@ -115,7 +115,7 @@ L(last_2x_vec):
 
 	/* Don't use rax for pointer here because EVEX has better encoding with
 	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
 	lzcntq	%rcx, %rcx
@@ -131,7 +131,7 @@ L(last_2x_vec):
 L(page_cross):
 	movq	%rax, %rsi
 	andq	$-VEC_SIZE, %rsi
-	vpcmpb	$0, (%rsi), %VECMATCH, %k0
+	vpcmpb	$0, (%rsi), %VMMMATCH, %k0
 	kmovd	%k0, %r8d
 	/* Shift out negative alignment (because we are starting from endptr and
 	   working backwards).  */
@@ -165,13 +165,13 @@ L(more_2x_vec):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x0_dec)
 
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1)
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	subq	$(VEC_SIZE * 4), %rdx
@@ -185,7 +185,7 @@ L(last_vec):
 
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	lzcntl	%ecx, %ecx
 	subq	$(VEC_SIZE * 3 + 1), %rax
@@ -220,7 +220,7 @@ L(more_4x_vec):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x2)
 
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	testl	%ecx, %ecx
@@ -243,17 +243,17 @@ L(more_4x_vec):
 L(loop_4x_vec):
 	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
 	   on).  */
-	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
 
 	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
-	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
-	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
-	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+	vpxorq	(VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
+	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
 
 	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
 	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
-	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
-	vptestnmb %VEC(3), %VEC(3), %k2
+	vpminub	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	vptestnmb %VMM(3), %VMM(3), %k2
 
 	/* Any 1s and we found CHAR.  */
 	kortestd %k2, %k4
@@ -270,7 +270,7 @@ L(loop_4x_vec):
 L(last_4x_vec):
 
 	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	cmpl	$(VEC_SIZE * 2), %edx
@@ -280,14 +280,14 @@ L(last_4x_vec):
 	jnz	L(ret_vec_x0_dec)
 
 
-	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1)
 
 	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	cmpl	$(VEC_SIZE * 3), %edx
@@ -309,7 +309,7 @@ L(loop_end):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x0_end)
 
-	vptestnmb %VEC(2), %VEC(2), %k0
+	vptestnmb %VMM(2), %VMM(2), %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1_end)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v10 3/6] x86: Update memmove to use new VEC macros
  2022-10-15  3:00 ` [PATCH v10 " Noah Goldstein
  2022-10-15  3:00   ` [PATCH v10 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
@ 2022-10-15  3:00   ` Noah Goldstein
  2022-10-15  3:43     ` Sunil Pandey
  2022-10-15  3:00   ` [PATCH v10 4/6] x86: Update memset " Noah Goldstein
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  3:00 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Replace %VEC(n) -> %VMM(n)
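
Note the L(return) -> L(return_vzeroupper) label rename that rides along with
it: the AVX-RTM variant now takes its return macro from the shared
x86-avx-rtm-vecs.h header, so the common file has to expose a label of that
name.  A sketch of how the pieces line up (assuming VEC_SIZE > 16; the
conditional around the epilogue is dropped here):
```
/* x86-avx-rtm-vecs.h, pulled in by memmove-avx-unaligned-erms-rtm.S:  */
#define VZEROUPPER_RETURN		jmp L(return_vzeroupper)

/* memmove-vec-unaligned-erms.S therefore names its shared epilogue:  */
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN
```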

This commit does not change libc.so

Tested build on x86-64
---
 .../memmove-avx-unaligned-erms-rtm.S          |  11 +-
 .../multiarch/memmove-avx-unaligned-erms.S    |   9 +-
 .../multiarch/memmove-avx512-unaligned-erms.S |  30 +-
 .../multiarch/memmove-evex-unaligned-erms.S   |  30 +-
 .../multiarch/memmove-sse2-unaligned-erms.S   |  11 +-
 .../multiarch/memmove-vec-unaligned-erms.S    | 262 +++++++++---------
 6 files changed, 132 insertions(+), 221 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 67a55f0c85..20746e6713 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -1,16 +1,7 @@
 #if IS_IN (libc)
-# define VEC_SIZE	32
-# define VEC(i)		ymm##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
-# define MOV_SIZE	4
-# define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
-# define VZEROUPPER_RETURN jmp	 L(return)
+# include "x86-avx-rtm-vecs.h"
 
-# define SECTION(p)		p##.avx.rtm
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm
 
 # include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index a14b155667..4e4b4635f9 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -2,14 +2,7 @@
 
 #if ISA_SHOULD_BUILD (3)
 
-# define VEC_SIZE	32
-# define VEC(i)		ymm##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
-# define MOV_SIZE	4
-
-# define SECTION(p)		p##.avx
+# include "x86-avx-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 8d1568a7ba..cca97e38f8 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -2,35 +2,7 @@
 
 #if ISA_SHOULD_BUILD (4)
 
-# define VEC_SIZE	64
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define VEC0		zmm16
-# define VEC1		zmm17
-# define VEC2		zmm18
-# define VEC3		zmm19
-# define VEC4		zmm20
-# define VEC5		zmm21
-# define VEC6		zmm22
-# define VEC7		zmm23
-# define VEC8		zmm24
-# define VEC9		zmm25
-# define VEC10		zmm26
-# define VEC11		zmm27
-# define VEC12		zmm28
-# define VEC13		zmm29
-# define VEC14		zmm30
-# define VEC15		zmm31
-# define VEC(i)		VEC##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE	6
-
-# define SECTION(p)		p##.evex512
+# include "x86-evex512-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
index 2373017358..1f7b5715f7 100644
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -2,35 +2,7 @@
 
 #if ISA_SHOULD_BUILD (4)
 
-# define VEC_SIZE	32
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define VEC0		ymm16
-# define VEC1		ymm17
-# define VEC2		ymm18
-# define VEC3		ymm19
-# define VEC4		ymm20
-# define VEC5		ymm21
-# define VEC6		ymm22
-# define VEC7		ymm23
-# define VEC8		ymm24
-# define VEC9		ymm25
-# define VEC10		ymm26
-# define VEC11		ymm27
-# define VEC12		ymm28
-# define VEC13		ymm29
-# define VEC14		ymm30
-# define VEC15		ymm31
-# define VEC(i)		VEC##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE	6
-
-# define SECTION(p)		p##.evex
+# include "x86-evex256-vecs.h"
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
index 422a079902..8431bcd000 100644
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -22,18 +22,9 @@
    so we need this to build for ISA V2 builds. */
 #if ISA_SHOULD_BUILD (2)
 
-# include <sysdep.h>
+# include "x86-sse2-vecs.h"
 
-# define VEC_SIZE	16
-# define VEC(i)		xmm##i
 # define PREFETCHNT	prefetchnta
-# define VMOVNT		movntdq
-/* Use movups and movaps for smaller code sizes.  */
-# define VMOVU		movups
-# define VMOVA		movaps
-# define MOV_SIZE	3
-
-# define SECTION(p)		p
 
 # ifndef MEMMOVE_SYMBOL
 #  define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 04747133b7..5b758cae5e 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -60,14 +60,6 @@
 # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
 #endif
 
-#ifndef XMM0
-# define XMM0				xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0				ymm0
-#endif
-
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER vzeroupper
@@ -225,13 +217,13 @@ L(start):
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 	/* Load regardless.  */
-	VMOVU	(%rsi), %VEC(0)
+	VMOVU	(%rsi), %VMM(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VMM(1)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi,%rdx)
 #if !(defined USE_MULTIARCH && IS_IN (libc))
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
@@ -270,15 +262,15 @@ L(start_erms):
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 	/* Load regardless.  */
-	VMOVU	(%rsi), %VEC(0)
+	VMOVU	(%rsi), %VMM(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
 	 */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
-L(return):
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rdx)
+L(return_vzeroupper):
 # if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 # else
@@ -359,10 +351,10 @@ L(between_16_31):
 	.p2align 4,, 10
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
-	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi, %rdx), %YMM1
-	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi, %rdx)
+	VMOVU	(%rsi), %VMM_256(0)
+	VMOVU	-32(%rsi, %rdx), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -32(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
@@ -380,12 +372,12 @@ L(last_4x_vec):
 	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
 
 	/* VEC(0) and VEC(1) have already been loaded.  */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VMM(3), -(VEC_SIZE * 2)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4
@@ -400,24 +392,24 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
 	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
 	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(7)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VMM(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4,, 4
@@ -466,14 +458,14 @@ L(more_8x_vec_forward):
 	 */
 
 	/* First vec was already loaded into VEC(0).  */
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(5)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
 	/* Save begining of dst.  */
 	movq	%rdi, %rcx
 	/* Align dst to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(7)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(8)
 
 	/* Subtract dst from src. Add back after dst aligned.  */
 	subq	%rcx, %rsi
@@ -488,25 +480,25 @@ L(more_8x_vec_forward):
 	.p2align 4,, 11
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
-	VMOVU	(%rsi), %VEC(1)
-	VMOVU	VEC_SIZE(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
+	VMOVU	(%rsi), %VMM(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
 	subq	$-(VEC_SIZE * 4), %rsi
-	VMOVA	%VEC(1), (%rdi)
-	VMOVA	%VEC(2), VEC_SIZE(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(1), (%rdi)
+	VMOVA	%VMM(2), VEC_SIZE(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(4), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
-	VMOVU	%VEC(7), VEC_SIZE(%rdx)
-	VMOVU	%VEC(8), (%rdx)
+	VMOVU	%VMM(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VMM(7), VEC_SIZE(%rdx)
+	VMOVU	%VMM(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(0), (%rcx)
+	VMOVU	%VMM(0), (%rcx)
 	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
 	 */
 L(nop_backward):
@@ -523,12 +515,12 @@ L(more_8x_vec_backward):
 	   addresses.  */
 
 	/* First vec was also loaded into VEC(0).  */
-	VMOVU	VEC_SIZE(%rsi), %VEC(5)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	VMOVU	VEC_SIZE(%rsi), %VMM(5)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(6)
 	/* Begining of region for 4x backward copy stored in rcx.  */
 	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(8)
 	/* Subtract dst from src. Add back after dst aligned.  */
 	subq	%rdi, %rsi
 	/* Align dst.  */
@@ -540,25 +532,25 @@ L(more_8x_vec_backward):
 	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
-	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VMM(3)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VMM(4)
 	addq	$(VEC_SIZE * -4), %rsi
-	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
-	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
-	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
+	VMOVA	%VMM(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VMM(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA	%VMM(4), (VEC_SIZE * 0)(%rcx)
 	addq	$(VEC_SIZE * -4), %rcx
 	cmpq	%rcx, %rdi
 	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(5), VEC_SIZE(%rdi)
+	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
+	VMOVU	%VMM(8), -VEC_SIZE(%rdx, %rdi)
 	VZEROUPPER_RETURN
 
 #if defined USE_MULTIARCH && IS_IN (libc)
@@ -568,7 +560,7 @@ L(loop_4x_vec_backward):
 # if ALIGN_MOVSB
 L(skip_short_movsb_check):
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  endif
 #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 #   error Unsupported MOVSB_ALIGN_TO
@@ -597,9 +589,9 @@ L(skip_short_movsb_check):
 
 	rep	movsb
 
-	VMOVU	%VEC(0), (%r8)
+	VMOVU	%VMM(0), (%r8)
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	%VEC(1), VEC_SIZE(%r8)
+	VMOVU	%VMM(1), VEC_SIZE(%r8)
 #  endif
 	VZEROUPPER_RETURN
 # endif
@@ -640,7 +632,7 @@ L(movsb):
 # endif
 # if ALIGN_MOVSB
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  endif
 #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 #   error Unsupported MOVSB_ALIGN_TO
@@ -664,9 +656,9 @@ L(movsb_align_dst):
 	rep	movsb
 
 	/* Store VECs loaded for aligning.  */
-	VMOVU	%VEC(0), (%r8)
+	VMOVU	%VMM(0), (%r8)
 #  if MOVSB_ALIGN_TO > VEC_SIZE
-	VMOVU	%VEC(1), VEC_SIZE(%r8)
+	VMOVU	%VMM(1), VEC_SIZE(%r8)
 #  endif
 	VZEROUPPER_RETURN
 # else	/* !ALIGN_MOVSB.  */
@@ -701,18 +693,18 @@ L(large_memcpy_2x):
 
 	/* First vec was also loaded into VEC(0).  */
 # if VEC_SIZE < 64
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
 #  if VEC_SIZE < 32
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 #  endif
 # endif
-	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VMM(0), (%rdi)
 # if VEC_SIZE < 64
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VMM(1), VEC_SIZE(%rdi)
 #  if VEC_SIZE < 32
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
 #  endif
 # endif
 
@@ -761,12 +753,12 @@ L(loop_large_memcpy_2x_inner):
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
 	/* Load vectors from rsi.  */
-	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 	subq	$-LARGE_LOAD_SIZE, %rsi
 	/* Non-temporal store vectors to rdi.  */
-	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
 	subq	$-LARGE_LOAD_SIZE, %rdi
 	decl	%ecx
 	jnz	L(loop_large_memcpy_2x_inner)
@@ -785,31 +777,31 @@ L(loop_large_memcpy_2x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VMM(0)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 	subq	$-(VEC_SIZE * 4), %rsi
 	addl	$-(VEC_SIZE * 4), %edx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(0), (%rdi)
+	VMOVA	%VMM(1), VEC_SIZE(%rdi)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpl	$(VEC_SIZE * 4), %edx
 	ja	L(loop_large_memcpy_2x_tail)
 
 L(large_memcpy_2x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
-
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
-	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)
+
+	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
 	.p2align 4
@@ -831,16 +823,16 @@ L(loop_large_memcpy_4x_inner):
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
 	/* Load vectors from rsi.  */
-	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
-	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
 	subq	$-LARGE_LOAD_SIZE, %rsi
 	/* Non-temporal store vectors to rdi.  */
-	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
-	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
-	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
-	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
 	subq	$-LARGE_LOAD_SIZE, %rdi
 	decl	%ecx
 	jnz	L(loop_large_memcpy_4x_inner)
@@ -858,31 +850,31 @@ L(loop_large_memcpy_4x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VMM(0)
+	VMOVU	VEC_SIZE(%rsi), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
 	subq	$-(VEC_SIZE * 4), %rsi
 	addl	$-(VEC_SIZE * 4), %edx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VMM(0), (%rdi)
+	VMOVA	%VMM(1), VEC_SIZE(%rdi)
+	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 	cmpl	$(VEC_SIZE * 4), %edx
 	ja	L(loop_large_memcpy_4x_tail)
 
 L(large_memcpy_4x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
-
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
-	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
-	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
-	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)
+
+	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v10 4/6] x86: Update memset to use new VEC macros
  2022-10-15  3:00 ` [PATCH v10 " Noah Goldstein
  2022-10-15  3:00   ` [PATCH v10 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
  2022-10-15  3:00   ` [PATCH v10 3/6] x86: Update memmove " Noah Goldstein
@ 2022-10-15  3:00   ` Noah Goldstein
  2022-10-15  3:42     ` Sunil Pandey
  2022-10-15  3:00   ` [PATCH v10 5/6] x86: Remove now unused vec header macros Noah Goldstein
                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  3:00 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Replace %VEC(n) -> %VMM(n)
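
With the shared headers the EVEX and AVX512 variants now define their
broadcast setup identically and differ only in which config they include.
Roughly (taken from the hunks below; %VMM(0) is ymm16 under
x86-evex256-vecs.h and zmm16 under x86-evex512-vecs.h):
```
# include "x86-evex256-vecs.h"	/* memset-avx512 includes x86-evex512-vecs.h */

# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
  vpbroadcastb d, %VMM(0); \
  movq r, %rax
```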

This commit does not change libc.so

Tested build on x86-64
---
 .../memset-avx2-unaligned-erms-rtm.S          |  8 +--
 .../multiarch/memset-avx2-unaligned-erms.S    | 14 +---
 .../multiarch/memset-avx512-unaligned-erms.S  | 20 +-----
 .../multiarch/memset-evex-unaligned-erms.S    | 20 +-----
 .../multiarch/memset-sse2-unaligned-erms.S    | 10 +--
 .../multiarch/memset-vec-unaligned-erms.S     | 70 ++++++++-----------
 6 files changed, 43 insertions(+), 99 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
index 8ac3e479bb..bc8605faf3 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -1,10 +1,6 @@
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+#include "x86-avx-rtm-vecs.h"
 
-#define VZEROUPPER_RETURN jmp	 L(return)
-
-#define SECTION(p) p##.avx.rtm
 #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 
-#include "memset-avx2-unaligned-erms.S"
+# include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index a9054a9122..47cf5072a4 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -4,14 +4,9 @@
 
 # define USE_WITH_AVX2	1
 
-# define VEC_SIZE	32
-# define MOV_SIZE	4
-# define RET_SIZE	4
-
-# define VEC(i)		ymm##i
-
-# define VMOVU     vmovdqu
-# define VMOVA     vmovdqa
+# ifndef VEC_SIZE
+#  include "x86-avx-vecs.h"
+# endif
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
@@ -26,9 +21,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
 # define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
-# ifndef SECTION
-#  define SECTION(p)		p##.avx
-# endif
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 47623b8ee8..84145b6c27 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -4,26 +4,14 @@
 
 # define USE_WITH_AVX512	1
 
-# define VEC_SIZE	64
-# define MOV_SIZE	6
-# define RET_SIZE	1
-
-# define XMM0		xmm16
-# define YMM0		ymm16
-# define VEC0		zmm16
-# define VEC(i)		VEC##i
-
-# define VMOVU     vmovdqu64
-# define VMOVA     vmovdqa64
-
-# define VZEROUPPER
+# include "x86-evex512-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastb d, %VEC0; \
+  vpbroadcastb d, %VMM(0); \
   movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastd d, %VEC0; \
+  vpbroadcastd d, %VMM(0); \
   movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -32,8 +20,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p##.evex512
-
 #ifndef MEMSET_SYMBOL
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index ac4b2d2d50..1f03b26bf8 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -4,26 +4,14 @@
 
 # define USE_WITH_EVEX	1
 
-# define VEC_SIZE	32
-# define MOV_SIZE	6
-# define RET_SIZE	1
-
-# define XMM0		xmm16
-# define YMM0		ymm16
-# define VEC0		ymm16
-# define VEC(i)		VEC##i
-
-# define VMOVU     vmovdqu64
-# define VMOVA     vmovdqa64
-
-# define VZEROUPPER
+# include "x86-evex256-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastb d, %VEC0; \
+  vpbroadcastb d, %VMM(0); \
   movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastd d, %VEC0; \
+  vpbroadcastd d, %VMM(0); \
   movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -32,8 +20,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p##.evex
-
 #ifndef MEMSET_SYMBOL
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 44f9b8888b..34b245d8ca 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -26,13 +26,7 @@
 # include <sysdep.h>
 # define USE_WITH_SSE2	1
 
-# define VEC_SIZE	16
-# define MOV_SIZE	3
-# define RET_SIZE	1
-
-# define VEC(i)		xmm##i
-# define VMOVU     movups
-# define VMOVA     movaps
+# include "x86-sse2-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
@@ -52,8 +46,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p
-
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_sse2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 905d0fa464..03de0ab907 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,14 +34,6 @@
 # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
 #endif
 
-#ifndef XMM0
-# define XMM0				xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0				ymm0
-#endif
-
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
@@ -150,8 +142,8 @@ L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VMM(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VMM(0), (%rdi)
 	VZEROUPPER_RETURN
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
@@ -175,19 +167,19 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
-	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -2)(%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi)
 #endif
 	VZEROUPPER_RETURN
 
@@ -221,7 +213,7 @@ L(less_vec_from_wmemset):
 	bzhil	%edx, %ecx, %ecx
 	kmovd	%ecx, %k1
 # endif
-	vmovdqu8 %VEC(0), (%rax){%k1}
+	vmovdqu8 %VMM(0), (%rax){%k1}
 	VZEROUPPER_RETURN
 
 # if defined USE_MULTIARCH && IS_IN (libc)
@@ -249,8 +241,8 @@ L(stosb_more_2x_vec):
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
 	/* Store next 2x vec regardless.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
+	VMOVU	%VMM(0), (%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * 1)(%rdi)
 
 
 	/* Two different methods of setting up pointers / compare. The two
@@ -278,8 +270,8 @@ L(more_2x_vec):
 #endif
 
 	/* Store next 2x vec regardless.  */
-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rax)
+	VMOVU	%VMM(0), (VEC_SIZE * 3)(%rax)
 
 
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
@@ -304,20 +296,20 @@ L(more_2x_vec):
 	andq	$(VEC_SIZE * -2), %LOOP_REG
 	.p2align 4
 L(loop):
-	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
-	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
 	subq	$-(VEC_SIZE * 4), %LOOP_REG
 	cmpq	%END_REG, %LOOP_REG
 	jb	L(loop)
 	.p2align 4,, MOV_SIZE
 L(last_4x_vec):
-	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
-L(return):
+	VMOVU	%VMM(0), LOOP_4X_OFFSET(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return_vzeroupper):
 #if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
@@ -355,7 +347,7 @@ L(cross_page):
 	jge	L(between_16_31)
 #endif
 #ifndef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, %SET_REG64
+	MOVQ	%VMM_128(0), %SET_REG64
 #endif
 	cmpl	$8, %edx
 	jge	L(between_8_15)
@@ -374,8 +366,8 @@ L(between_0_0):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%LESS_VEC_REG)
-	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
+	VMOVU	%VMM_256(0), (%LESS_VEC_REG)
+	VMOVU	%VMM_256(0), -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
@@ -383,8 +375,8 @@ L(between_32_63):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%LESS_VEC_REG)
-	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	VMOVU	%VMM_128(0), (%LESS_VEC_REG)
+	VMOVU	%VMM_128(0), -16(%LESS_VEC_REG, %rdx)
 	ret
 #endif
 
@@ -394,8 +386,8 @@ L(between_16_31):
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, (%rdi)
-	MOVQ	%XMM0, -8(%rdi, %rdx)
+	MOVQ	%VMM_128(0), (%rdi)
+	MOVQ	%VMM_128(0), -8(%rdi, %rdx)
 #else
 	movq	%SET_REG64, (%LESS_VEC_REG)
 	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
@@ -408,8 +400,8 @@ L(between_8_15):
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVD	%XMM0, (%rdi)
-	MOVD	%XMM0, -4(%rdi, %rdx)
+	MOVD	%VMM_128(0), (%rdi)
+	MOVD	%VMM_128(0), -4(%rdi, %rdx)
 #else
 	movl	%SET_REG32, (%LESS_VEC_REG)
 	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v10 5/6] x86: Remove now unused vec header macros.
  2022-10-15  3:00 ` [PATCH v10 " Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-10-15  3:00   ` [PATCH v10 4/6] x86: Update memset " Noah Goldstein
@ 2022-10-15  3:00   ` Noah Goldstein
  2022-10-15  3:39     ` Sunil Pandey
  2022-10-15  3:00   ` [PATCH v10 6/6] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
  2022-10-15  3:37   ` [PATCH v10 1/6] x86: Update VEC macros to complete API for evex/evex512 impls Sunil Pandey
  5 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  3:00 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/avx-rtm-vecs.h     | 35 --------
 sysdeps/x86_64/multiarch/avx-vecs.h         | 47 -----------
 sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 ---------
 sysdeps/x86_64/multiarch/evex256-vecs.h     | 35 --------
 sysdeps/x86_64/multiarch/evex512-vecs.h     | 35 --------
 sysdeps/x86_64/multiarch/sse2-vecs.h        | 47 -----------
 sysdeps/x86_64/multiarch/vec-macros.h       | 90 ---------------------
 7 files changed, 328 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h
 delete mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/vec-macros.h

diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
deleted file mode 100644
index 6ca9f5e6ba..0000000000
--- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Common config for AVX-RTM VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _AVX_RTM_VECS_H
-#define _AVX_RTM_VECS_H			1
-
-#define COND_VZEROUPPER			COND_VZEROUPPER_XTEST
-#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
-	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN		jmp L(return_vzeroupper)
-
-#define USE_WITH_RTM			1
-#include "avx-vecs.h"
-
-#undef SECTION
-#define SECTION(p)				p##.avx.rtm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h
deleted file mode 100644
index 89680f5db8..0000000000
--- a/sysdeps/x86_64/multiarch/avx-vecs.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Common config for AVX VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _AVX_VECS_H
-#define _AVX_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			32
-#include "vec-macros.h"
-
-#define USE_WITH_AVX		1
-#define SECTION(p)			p##.avx
-
-/* 4-byte mov instructions with AVX2.  */
-#define MOV_SIZE			4
-/* 1 (ret) + 3 (vzeroupper).  */
-#define RET_SIZE			4
-#define VZEROUPPER			vzeroupper
-
-#define VMOVU				vmovdqu
-#define VMOVA				vmovdqa
-#define VMOVNT				vmovntdq
-
-/* Often need to access xmm portion.  */
-#define VEC_xmm				VEC_any_xmm
-#define VEC					VEC_any_ymm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h
deleted file mode 100644
index 99806ebcd7..0000000000
--- a/sysdeps/x86_64/multiarch/evex-vecs-common.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Common config for EVEX256 and EVEX512 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _EVEX_VECS_COMMON_H
-#define _EVEX_VECS_COMMON_H			1
-
-#include "vec-macros.h"
-
-/* 6-byte mov instructions with EVEX.  */
-#define MOV_SIZE			6
-/* No vzeroupper needed.  */
-#define RET_SIZE			1
-#define VZEROUPPER
-
-#define VMOVU				vmovdqu64
-#define VMOVA				vmovdqa64
-#define VMOVNT				vmovntdq
-
-#define VEC_xmm				VEC_hi_xmm
-#define VEC_ymm				VEC_hi_ymm
-#define VEC_zmm				VEC_hi_zmm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
deleted file mode 100644
index 222ba46dc7..0000000000
--- a/sysdeps/x86_64/multiarch/evex256-vecs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Common config for EVEX256 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _EVEX256_VECS_H
-#define _EVEX256_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			32
-#include "evex-vecs-common.h"
-
-#define USE_WITH_EVEX256	1
-#define SECTION(p)			p##.evex
-
-#define VEC					VEC_ymm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
deleted file mode 100644
index d1784d5368..0000000000
--- a/sysdeps/x86_64/multiarch/evex512-vecs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Common config for EVEX512 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _EVEX512_VECS_H
-#define _EVEX512_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			64
-#include "evex-vecs-common.h"
-
-#define USE_WITH_EVEX512	1
-#define SECTION(p)			p##.evex512
-
-#define VEC					VEC_zmm
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h
deleted file mode 100644
index 2b77a59d56..0000000000
--- a/sysdeps/x86_64/multiarch/sse2-vecs.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Common config for SSE2 VECs
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _SSE2_VECS_H
-#define _SSE2_VECS_H			1
-
-#ifdef VEC_SIZE
-# error "Multiple VEC configs included!"
-#endif
-
-#define VEC_SIZE			16
-#include "vec-macros.h"
-
-#define USE_WITH_SSE2		1
-#define SECTION(p)			p
-
-/* 3-byte mov instructions with SSE2.  */
-#define MOV_SIZE			3
-/* No vzeroupper needed.  */
-#define RET_SIZE			1
-#define VZEROUPPER
-
-#define VMOVU				movups
-#define VMOVA				movaps
-#define VMOVNT				movntdq
-
-#define VEC_xmm				VEC_any_xmm
-#define VEC					VEC_any_xmm
-
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h
deleted file mode 100644
index 9f3ffecede..0000000000
--- a/sysdeps/x86_64/multiarch/vec-macros.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Macro helpers for VEC_{type}({vec_num})
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef _VEC_MACROS_H
-#define _VEC_MACROS_H			1
-
-#ifndef VEC_SIZE
-# error "Never include this file directly. Always include a vector config."
-#endif
-
-/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
-   VEC(N) values.  */
-#define VEC_hi_xmm0				xmm16
-#define VEC_hi_xmm1				xmm17
-#define VEC_hi_xmm2				xmm18
-#define VEC_hi_xmm3				xmm19
-#define VEC_hi_xmm4				xmm20
-#define VEC_hi_xmm5				xmm21
-#define VEC_hi_xmm6				xmm22
-#define VEC_hi_xmm7				xmm23
-#define VEC_hi_xmm8				xmm24
-#define VEC_hi_xmm9				xmm25
-#define VEC_hi_xmm10			xmm26
-#define VEC_hi_xmm11			xmm27
-#define VEC_hi_xmm12			xmm28
-#define VEC_hi_xmm13			xmm29
-#define VEC_hi_xmm14			xmm30
-#define VEC_hi_xmm15			xmm31
-
-#define VEC_hi_ymm0				ymm16
-#define VEC_hi_ymm1				ymm17
-#define VEC_hi_ymm2				ymm18
-#define VEC_hi_ymm3				ymm19
-#define VEC_hi_ymm4				ymm20
-#define VEC_hi_ymm5				ymm21
-#define VEC_hi_ymm6				ymm22
-#define VEC_hi_ymm7				ymm23
-#define VEC_hi_ymm8				ymm24
-#define VEC_hi_ymm9				ymm25
-#define VEC_hi_ymm10			ymm26
-#define VEC_hi_ymm11			ymm27
-#define VEC_hi_ymm12			ymm28
-#define VEC_hi_ymm13			ymm29
-#define VEC_hi_ymm14			ymm30
-#define VEC_hi_ymm15			ymm31
-
-#define VEC_hi_zmm0				zmm16
-#define VEC_hi_zmm1				zmm17
-#define VEC_hi_zmm2				zmm18
-#define VEC_hi_zmm3				zmm19
-#define VEC_hi_zmm4				zmm20
-#define VEC_hi_zmm5				zmm21
-#define VEC_hi_zmm6				zmm22
-#define VEC_hi_zmm7				zmm23
-#define VEC_hi_zmm8				zmm24
-#define VEC_hi_zmm9				zmm25
-#define VEC_hi_zmm10			zmm26
-#define VEC_hi_zmm11			zmm27
-#define VEC_hi_zmm12			zmm28
-#define VEC_hi_zmm13			zmm29
-#define VEC_hi_zmm14			zmm30
-#define VEC_hi_zmm15			zmm31
-
-#define PRIMITIVE_VEC(vec, num)		vec##num
-
-#define VEC_any_xmm(i)			PRIMITIVE_VEC(xmm, i)
-#define VEC_any_ymm(i)			PRIMITIVE_VEC(ymm, i)
-#define VEC_any_zmm(i)			PRIMITIVE_VEC(zmm, i)
-
-#define VEC_hi_xmm(i)			PRIMITIVE_VEC(VEC_hi_xmm, i)
-#define VEC_hi_ymm(i)			PRIMITIVE_VEC(VEC_hi_ymm, i)
-#define VEC_hi_zmm(i)			PRIMITIVE_VEC(VEC_hi_zmm, i)
-
-#endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH v10 6/6] x86: Update strlen-evex-base to use new reg/vec macros.
  2022-10-15  3:00 ` [PATCH v10 " Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-10-15  3:00   ` [PATCH v10 5/6] x86: Remove now unused vec header macros Noah Goldstein
@ 2022-10-15  3:00   ` Noah Goldstein
  2022-10-15  3:48     ` Sunil Pandey
  2022-10-15  3:37   ` [PATCH v10 1/6] x86: Update VEC macros to complete API for evex/evex512 impls Sunil Pandey
  5 siblings, 1 reply; 72+ messages in thread
From: Noah Goldstein @ 2022-10-15  3:00 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

To avoid duplicating the VMM / GPR / mask insn macros in all incoming
evex512 files, use the macros defined in 'reg-macros.h' and
'{vec}-macros.h'.
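
For example, a line written once with the generic macros expands
differently per build.  A minimal sketch of the expansion (assuming
USE_WIDE_CHAR is not defined, so REG_WIDTH == VEC_SIZE):
```
	VPCMP	$0, (%rdi), %VMM(0), %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
```
Including "x86-evex512-vecs.h" (VEC_SIZE == 64) this resolves to
zmm16 / kmovq / rax; including "x86-evex256-vecs.h" (VEC_SIZE == 32)
the same source resolves to ymm16 / kmovd / eax.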

This commit does not change libc.so

Tested build on x86-64
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 116 +++++++-------------
 sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
 2 files changed, 44 insertions(+), 76 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 418e9f8411..c832b15a48 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -36,42 +36,10 @@
 #  define CHAR_SIZE	1
 # endif
 
-# define XMM0		xmm16
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# if VEC_SIZE == 64
-#  define KMOV		kmovq
-#  define KORTEST	kortestq
-#  define RAX		rax
-#  define RCX		rcx
-#  define RDX		rdx
-#  define SHR		shrq
-#  define TEXTSUFFIX	evex512
-#  define VMM0		zmm16
-#  define VMM1		zmm17
-#  define VMM2		zmm18
-#  define VMM3		zmm19
-#  define VMM4		zmm20
-#  define VMOVA		vmovdqa64
-# elif VEC_SIZE == 32
-/* Currently Unused.  */
-#  define KMOV		kmovd
-#  define KORTEST	kortestd
-#  define RAX		eax
-#  define RCX		ecx
-#  define RDX		edx
-#  define SHR		shrl
-#  define TEXTSUFFIX	evex256
-#  define VMM0		ymm16
-#  define VMM1		ymm17
-#  define VMM2		ymm18
-#  define VMM3		ymm19
-#  define VMM4		ymm20
-#  define VMOVA		vmovdqa32
-# endif
-
-	.section .text.TEXTSUFFIX, "ax", @progbits
+	.section SECTION(.text),"ax",@progbits
 /* Aligning entry point to 64 byte, provides better performance for
    one vector length string.  */
 ENTRY_P2ALIGN (STRLEN, 6)
@@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
 # endif
 
 	movl	%edi, %eax
-	vpxorq	%XMM0, %XMM0, %XMM0
+	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(page_cross)
 
 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM0, %k0
-	KMOV	%k0, %RAX
-	test	%RAX, %RAX
+	VPCMP	$0, (%rdi), %VMM(0), %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
@@ -120,7 +88,7 @@ L(align_more):
 	movq	%rax, %rdx
 	subq	%rdi, %rdx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RDX
+	shr	$2, %VRDX
 #  endif
 	/* At this point rdx contains [w]chars already compared.  */
 	subq	%rsi, %rdx
@@ -131,9 +99,9 @@ L(align_more):
 # endif
 
 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
 # ifdef USE_AS_STRNLEN
@@ -141,9 +109,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, VEC_SIZE(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
 # ifdef USE_AS_STRNLEN
@@ -151,9 +119,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 
 # ifdef USE_AS_STRNLEN
@@ -161,9 +129,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x4)
 
 # ifdef USE_AS_STRNLEN
@@ -179,7 +147,7 @@ L(align_more):
 # ifdef USE_AS_STRNLEN
 	subq	%rax, %rcx
 #  ifdef USE_AS_WCSLEN
-	SHR	$2, %RCX
+	shr	$2, %VRCX
 #  endif
 	/* rcx contains number of [w]char will be recompared due to
 	   alignment fixes.  rdx must be incremented by rcx to offset
@@ -199,42 +167,42 @@ L(loop_entry):
 # endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
 
-	VPTESTN	%VMM2, %VMM2, %k0
-	VPTESTN	%VMM4, %VMM4, %k1
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k1
 
 	subq	$-(VEC_SIZE * 4), %rax
 	KORTEST	%k0, %k1
 	jz	L(loop)
 
-	VPTESTN	%VMM1, %VMM1, %k2
-	KMOV	%k2, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VMM(1), %VMM(1), %k2
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
 
-	KMOV	%k0, %RCX
+	KMOV	%k0, %VRCX
 	/* At this point, if k0 is non zero, null char must be in the
 	   second vector.  */
-	test	%RCX, %RCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
 
-	VPTESTN	%VMM3, %VMM3, %k3
-	KMOV	%k3, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VMM(3), %VMM(3), %k3
+	KMOV	%k3, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
 	/* At this point null [w]char must be in the fourth vector so no
 	   need to check.  */
-	KMOV	%k1, %RCX
+	KMOV	%k1, %VRCX
 
 	/* Fourth, third, second vector terminating are pretty much
 	   same, implemented this way to avoid branching and reuse code
 	   from pre loop exit condition.  */
 L(ret_vec_x4):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 3), %rax
@@ -250,7 +218,7 @@ L(ret_vec_x4):
 	ret
 
 L(ret_vec_x3):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 2), %rax
@@ -268,7 +236,7 @@ L(ret_vec_x3):
 L(ret_vec_x2):
 	subq	$-VEC_SIZE, %rax
 L(ret_vec_x1):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
@@ -289,13 +257,13 @@ L(page_cross):
 	/* ecx contains number of w[char] to be skipped as a result
 	   of address alignment.  */
 	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
-	KMOV	%k0, %RAX
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRAX
 	/* Ignore number of character for alignment adjustment.  */
-	SHR	%cl, %RAX
+	shr	%cl, %VRAX
 	jz	L(align_more)
 
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
index 116f8981c8..10c3415c8a 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -2,6 +2,6 @@
 # define STRLEN		__strlen_evex512
 #endif
 
-#define VEC_SIZE	64
-
+#include "x86-evex512-vecs.h"
+#include "reg-macros.h"
 #include "strlen-evex-base.S"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v10 1/6] x86: Update VEC macros to complete API for evex/evex512 impls
  2022-10-15  3:00 ` [PATCH v10 " Noah Goldstein
                     ` (4 preceding siblings ...)
  2022-10-15  3:00   ` [PATCH v10 6/6] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
@ 2022-10-15  3:37   ` Sunil Pandey
  5 siblings, 0 replies; 72+ messages in thread
From: Sunil Pandey @ 2022-10-15  3:37 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha

On Fri, Oct 14, 2022 at 8:01 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> 1) Copy so that backport will be easier.
> 2) Make section only define if there is not a previous definition
> 3) Add `VEC_lo` definition for proper reg-width but in the
>    ymm/zmm0-15 range.
> 4) Add macros for accessing GPRs based on VEC_SIZE
>         This is to make it easier to do things like:
>         ```
>             vpcmpb %VEC(0), %VEC(1), %k0
>             kmov{d|q} %k0, %{eax|rax}
>             test %{eax|rax}
>         ```
>         It adds macro s.t any GPR can get the proper width with:
>             `V{upcase_GPR_name}`
>
>         and any mask insn can get the proper width with:
>             `{upcase_mask_insn_without_postfix}`
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/reg-macros.h         | 168 ++++++++++++++++++
>  .../multiarch/scripts/gen-reg-macros.py       | 133 ++++++++++++++
>  sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h   |  35 ++++
>  sysdeps/x86_64/multiarch/x86-avx-vecs.h       |  47 +++++
>  .../x86_64/multiarch/x86-evex-vecs-common.h   |  39 ++++
>  sysdeps/x86_64/multiarch/x86-evex256-vecs.h   |  38 ++++
>  sysdeps/x86_64/multiarch/x86-evex512-vecs.h   |  38 ++++
>  sysdeps/x86_64/multiarch/x86-sse2-vecs.h      |  47 +++++
>  sysdeps/x86_64/multiarch/x86-vec-macros.h     |  90 ++++++++++
>  9 files changed, 635 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/reg-macros.h
>  create mode 100644 sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
>  create mode 100644 sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-avx-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-evex256-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-evex512-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-sse2-vecs.h
>  create mode 100644 sysdeps/x86_64/multiarch/x86-vec-macros.h
>
> diff --git a/sysdeps/x86_64/multiarch/reg-macros.h b/sysdeps/x86_64/multiarch/reg-macros.h
> new file mode 100644
> index 0000000000..c8ea330256
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/reg-macros.h
> @@ -0,0 +1,168 @@
> +/* This file was generated by: gen-reg-macros.py.
> +
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _REG_MACROS_H
> +#define _REG_MACROS_H  1
> +
> +#define rax_8  al
> +#define rax_16 ax
> +#define rax_32 eax
> +#define rax_64 rax
> +#define rbx_8  bl
> +#define rbx_16 bx
> +#define rbx_32 ebx
> +#define rbx_64 rbx
> +#define rcx_8  cl
> +#define rcx_16 cx
> +#define rcx_32 ecx
> +#define rcx_64 rcx
> +#define rdx_8  dl
> +#define rdx_16 dx
> +#define rdx_32 edx
> +#define rdx_64 rdx
> +#define rbp_8  bpl
> +#define rbp_16 bp
> +#define rbp_32 ebp
> +#define rbp_64 rbp
> +#define rsp_8  spl
> +#define rsp_16 sp
> +#define rsp_32 esp
> +#define rsp_64 rsp
> +#define rsi_8  sil
> +#define rsi_16 si
> +#define rsi_32 esi
> +#define rsi_64 rsi
> +#define rdi_8  dil
> +#define rdi_16 di
> +#define rdi_32 edi
> +#define rdi_64 rdi
> +#define r8_8   r8b
> +#define r8_16  r8w
> +#define r8_32  r8d
> +#define r8_64  r8
> +#define r9_8   r9b
> +#define r9_16  r9w
> +#define r9_32  r9d
> +#define r9_64  r9
> +#define r10_8  r10b
> +#define r10_16 r10w
> +#define r10_32 r10d
> +#define r10_64 r10
> +#define r11_8  r11b
> +#define r11_16 r11w
> +#define r11_32 r11d
> +#define r11_64 r11
> +#define r12_8  r12b
> +#define r12_16 r12w
> +#define r12_32 r12d
> +#define r12_64 r12
> +#define r13_8  r13b
> +#define r13_16 r13w
> +#define r13_32 r13d
> +#define r13_64 r13
> +#define r14_8  r14b
> +#define r14_16 r14w
> +#define r14_32 r14d
> +#define r14_64 r14
> +#define r15_8  r15b
> +#define r15_16 r15w
> +#define r15_32 r15d
> +#define r15_64 r15
> +
> +#define kmov_8 kmovb
> +#define kmov_16        kmovw
> +#define kmov_32        kmovd
> +#define kmov_64        kmovq
> +#define kortest_8      kortestb
> +#define kortest_16     kortestw
> +#define kortest_32     kortestd
> +#define kortest_64     kortestq
> +#define kor_8  korb
> +#define kor_16 korw
> +#define kor_32 kord
> +#define kor_64 korq
> +#define ktest_8        ktestb
> +#define ktest_16       ktestw
> +#define ktest_32       ktestd
> +#define ktest_64       ktestq
> +#define kand_8 kandb
> +#define kand_16        kandw
> +#define kand_32        kandd
> +#define kand_64        kandq
> +#define kxor_8 kxorb
> +#define kxor_16        kxorw
> +#define kxor_32        kxord
> +#define kxor_64        kxorq
> +#define knot_8 knotb
> +#define knot_16        knotw
> +#define knot_32        knotd
> +#define knot_64        knotq
> +#define kxnor_8        kxnorb
> +#define kxnor_16       kxnorw
> +#define kxnor_32       kxnord
> +#define kxnor_64       kxnorq
> +#define kunpack_8      kunpackbw
> +#define kunpack_16     kunpackwd
> +#define kunpack_32     kunpackdq
> +
> +/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */
> +#define VRAX   VGPR(rax)
> +#define VRBX   VGPR(rbx)
> +#define VRCX   VGPR(rcx)
> +#define VRDX   VGPR(rdx)
> +#define VRBP   VGPR(rbp)
> +#define VRSP   VGPR(rsp)
> +#define VRSI   VGPR(rsi)
> +#define VRDI   VGPR(rdi)
> +#define VR8    VGPR(r8)
> +#define VR9    VGPR(r9)
> +#define VR10   VGPR(r10)
> +#define VR11   VGPR(r11)
> +#define VR12   VGPR(r12)
> +#define VR13   VGPR(r13)
> +#define VR14   VGPR(r14)
> +#define VR15   VGPR(r15)
> +
> +/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */
> +#define KMOV   VKINSN(kmov)
> +#define KORTEST        VKINSN(kortest)
> +#define KOR    VKINSN(kor)
> +#define KTEST  VKINSN(ktest)
> +#define KAND   VKINSN(kand)
> +#define KXOR   VKINSN(kxor)
> +#define KNOT   VKINSN(knot)
> +#define KXNOR  VKINSN(kxnor)
> +#define KUNPACK        VKINSN(kunpack)
> +
> +#ifdef USE_WIDE_CHAR
> +# define REG_WIDTH 32
> +#else
> +# define REG_WIDTH VEC_SIZE
> +#endif
> +
> +#define VPASTER(x, y)  x##_##y
> +#define VEVALUATOR(x, y)       VPASTER(x, y)
> +
> +#define VGPR_SZ(reg_name, reg_size)    VEVALUATOR(reg_name, reg_size)
> +#define VKINSN_SZ(insn, reg_size)      VEVALUATOR(insn, reg_size)
> +
> +#define VGPR(reg_name) VGPR_SZ(reg_name, REG_WIDTH)
> +#define VKINSN(mask_insn)      VKINSN_SZ(mask_insn, REG_WIDTH)
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> new file mode 100644
> index 0000000000..9fb6903212
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/scripts/gen-reg-macros.py
> @@ -0,0 +1,133 @@
> +#!/usr/bin/python3
> +# Copyright (C) 2022 Free Software Foundation, Inc.
> +# This file is part of the GNU C Library.
> +#
> +# The GNU C Library is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU Lesser General Public
> +# License as published by the Free Software Foundation; either
> +# version 2.1 of the License, or (at your option) any later version.
> +#
> +# The GNU C Library is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +# Lesser General Public License for more details.
> +#
> +# You should have received a copy of the GNU Lesser General Public
> +# License along with the GNU C Library; if not, see
> +# <https://www.gnu.org/licenses/>.
> +"""Generate macros for getting GPR name of a certain size
> +
> +Inputs: None
> +Output: Prints header fill to stdout
> +
> +API:
> +    V{upcase_GPR_name}
> +        - Get register name REG_WIDTH component of `upcase_GPR_name`
> +    {upcase_mask_insn_without_postfix}
> +        - Get proper REG_WIDTH mask insn for `upcase_mask_insn_without_postfix`
> +    VGPR(reg_name)
> +        - Get register name REG_WIDTH component of `reg_name`
> +    VKINSN(mask_insn)
> +        - Get proper REG_WIDTH mask insn for `mask_insn`
> +    VGPR_SZ(reg_name, reg_size)
> +        - Get register name `reg_size` component of `reg_name`
> +    VKINSN_SZ(mask_insn, insn_size)
> +        - Get proper `insn_size` mask insn for `mask_insn`
> +"""
> +
> +import sys
> +import os
> +from datetime import datetime
> +
> +registers = [["rax", "eax", "ax", "al"], ["rbx", "ebx", "bx", "bl"],
> +             ["rcx", "ecx", "cx", "cl"], ["rdx", "edx", "dx", "dl"],
> +             ["rbp", "ebp", "bp", "bpl"], ["rsp", "esp", "sp", "spl"],
> +             ["rsi", "esi", "si", "sil"], ["rdi", "edi", "di", "dil"],
> +             ["r8", "r8d", "r8w", "r8b"], ["r9", "r9d", "r9w", "r9b"],
> +             ["r10", "r10d", "r10w", "r10b"], ["r11", "r11d", "r11w", "r11b"],
> +             ["r12", "r12d", "r12w", "r12b"], ["r13", "r13d", "r13w", "r13b"],
> +             ["r14", "r14d", "r14w", "r14b"], ["r15", "r15d", "r15w", "r15b"]]
> +
> +mask_insns = [
> +    "kmov",
> +    "kortest",
> +    "kor",
> +    "ktest",
> +    "kand",
> +    "kxor",
> +    "knot",
> +    "kxnor",
> +]
> +mask_insns_ext = ["b", "w", "d", "q"]
> +
> +cr = """
> +   Copyright (C) {} Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +"""
> +
> +print("/* This file was generated by: {}.".format(os.path.basename(
> +    sys.argv[0])))
> +print(cr.format(datetime.today().year))
> +
> +print("#ifndef _REG_MACROS_H")
> +print("#define _REG_MACROS_H\t1")
> +print("")
> +for reg in registers:
> +    for i in range(0, 4):
> +        print("#define {}_{}\t{}".format(reg[0], 8 << i, reg[3 - i]))
> +
> +print("")
> +for mask_insn in mask_insns:
> +    for i in range(0, 4):
> +        print("#define {}_{}\t{}{}".format(mask_insn, 8 << i, mask_insn,
> +                                           mask_insns_ext[i]))
> +for i in range(0, 3):
> +    print("#define kunpack_{}\tkunpack{}{}".format(8 << i, mask_insns_ext[i],
> +                                                   mask_insns_ext[i + 1]))
> +mask_insns.append("kunpack")
> +
> +print("")
> +print(
> +    "/* Common API for accessing proper width GPR is V{upcase_GPR_name}.  */")
> +for reg in registers:
> +    print("#define V{}\tVGPR({})".format(reg[0].upper(), reg[0]))
> +
> +print("")
> +
> +print(
> +    "/* Common API for accessing proper width mask insn is {upcase_mask_insn}.  */"
> +)
> +for mask_insn in mask_insns:
> +    print("#define {} \tVKINSN({})".format(mask_insn.upper(), mask_insn))
> +print("")
> +
> +print("#ifdef USE_WIDE_CHAR")
> +print("# define REG_WIDTH 32")
> +print("#else")
> +print("# define REG_WIDTH VEC_SIZE")
> +print("#endif")
> +print("")
> +print("#define VPASTER(x, y)\tx##_##y")
> +print("#define VEVALUATOR(x, y)\tVPASTER(x, y)")
> +print("")
> +print("#define VGPR_SZ(reg_name, reg_size)\tVEVALUATOR(reg_name, reg_size)")
> +print("#define VKINSN_SZ(insn, reg_size)\tVEVALUATOR(insn, reg_size)")
> +print("")
> +print("#define VGPR(reg_name)\tVGPR_SZ(reg_name, REG_WIDTH)")
> +print("#define VKINSN(mask_insn)\tVKINSN_SZ(mask_insn, REG_WIDTH)")
> +
> +print("\n#endif")
> diff --git a/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
> new file mode 100644
> index 0000000000..0b326c8a70
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-avx-rtm-vecs.h
> @@ -0,0 +1,35 @@
> +/* Common config for AVX-RTM VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_AVX_RTM_VECS_H
> +#define _X86_AVX_RTM_VECS_H                    1
> +
> +#define COND_VZEROUPPER                        COND_VZEROUPPER_XTEST
> +#define ZERO_UPPER_VEC_REGISTERS_RETURN        \
> +       ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +
> +#define VZEROUPPER_RETURN              jmp L(return_vzeroupper)
> +
> +#define USE_WITH_RTM                   1
> +#include "x86-avx-vecs.h"
> +
> +#undef SECTION
> +#define SECTION(p)                             p##.avx.rtm
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> new file mode 100644
> index 0000000000..dca1089060
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> @@ -0,0 +1,47 @@
> +/* Common config for AVX VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_AVX_VECS_H
> +#define _X86_AVX_VECS_H                        1
> +
> +#ifdef VEC_SIZE
> +# error "Multiple VEC configs included!"
> +#endif
> +
> +#define VEC_SIZE                       32
> +#include "x86-vec-macros.h"
> +
> +#define USE_WITH_AVX           1
> +#define SECTION(p)                     p##.avx
> +
> +/* 4-byte mov instructions with AVX2.  */
> +#define MOV_SIZE                       4
> +/* 1 (ret) + 3 (vzeroupper).  */
> +#define RET_SIZE                       4
> +#define VZEROUPPER                     vzeroupper
> +
> +#define VMOVU                          vmovdqu
> +#define VMOVA                          vmovdqa
> +#define VMOVNT                         vmovntdq
> +
> +/* Often need to access xmm portion.  */
> +#define VMM_128                                VMM_any_xmm
> +#define VMM                                    VMM_any_ymm
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
> new file mode 100644
> index 0000000000..f331e9d8ec
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-evex-vecs-common.h
> @@ -0,0 +1,39 @@
> +/* Common config for EVEX256 and EVEX512 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_EVEX_VECS_COMMON_H
> +#define _X86_EVEX_VECS_COMMON_H                        1
> +
> +#include "x86-vec-macros.h"
> +
> +/* 6-byte mov instructions with EVEX.  */
> +#define MOV_SIZE                       6
> +/* No vzeroupper needed.  */
> +#define RET_SIZE                       1
> +#define VZEROUPPER
> +
> +#define VMOVU                          vmovdqu64
> +#define VMOVA                          vmovdqa64
> +#define VMOVNT                         vmovntdq
> +
> +#define VMM_128                                VMM_hi_xmm
> +#define VMM_256                                VMM_hi_ymm
> +#define VMM_512                                VMM_hi_zmm
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-evex256-vecs.h b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
> new file mode 100644
> index 0000000000..8337b95504
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-evex256-vecs.h
> @@ -0,0 +1,38 @@
> +/* Common config for EVEX256 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _EVEX256_VECS_H
> +#define _EVEX256_VECS_H                        1
> +
> +#ifdef VEC_SIZE
> +# error "Multiple VEC configs included!"
> +#endif
> +
> +#define VEC_SIZE                       32
> +#include "x86-evex-vecs-common.h"
> +
> +#define USE_WITH_EVEX256       1
> +
> +#ifndef SECTION
> +# define SECTION(p)                    p##.evex
> +#endif
> +
> +#define VMM                                    VMM_256
> +#define VMM_lo                         VMM_any_ymm
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-evex512-vecs.h b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
> new file mode 100644
> index 0000000000..7dc5c23ad0
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-evex512-vecs.h
> @@ -0,0 +1,38 @@
> +/* Common config for EVEX512 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _EVEX512_VECS_H
> +#define _EVEX512_VECS_H                        1
> +
> +#ifdef VEC_SIZE
> +# error "Multiple VEC configs included!"
> +#endif
> +
> +#define VEC_SIZE                       64
> +#include "x86-evex-vecs-common.h"
> +
> +#define USE_WITH_EVEX512       1
> +
> +#ifndef SECTION
> +# define SECTION(p)                    p##.evex512
> +#endif
> +
> +#define VMM                                    VMM_512
> +#define VMM_lo                         VMM_any_zmm
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-sse2-vecs.h b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
> new file mode 100644
> index 0000000000..b8bbd5dc29
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-sse2-vecs.h
> @@ -0,0 +1,47 @@
> +/* Common config for SSE2 VECs
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_SSE2_VECS_H
> +#define _X86_SSE2_VECS_H                       1
> +
> +#ifdef VEC_SIZE
> +# error "Multiple VEC configs included!"
> +#endif
> +
> +#define VEC_SIZE                       16
> +#include "x86-vec-macros.h"
> +
> +#define USE_WITH_SSE2          1
> +#define SECTION(p)                     p
> +
> +/* 3-byte mov instructions with SSE2.  */
> +#define MOV_SIZE                       3
> +/* No vzeroupper needed.  */
> +#define RET_SIZE                       1
> +#define VZEROUPPER
> +
> +#define VMOVU                          movups
> +#define VMOVA                          movaps
> +#define VMOVNT                         movntdq
> +
> +#define VMM_128                                VMM_any_xmm
> +#define VMM                                    VMM_any_xmm
> +
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-vec-macros.h b/sysdeps/x86_64/multiarch/x86-vec-macros.h
> new file mode 100644
> index 0000000000..7d6bb31d55
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/x86-vec-macros.h
> @@ -0,0 +1,90 @@
> +/* Macro helpers for VEC_{type}({vec_num})
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _X86_VEC_MACROS_H
> +#define _X86_VEC_MACROS_H                      1
> +
> +#ifndef VEC_SIZE
> +# error "Never include this file directly. Always include a vector config."
> +#endif
> +
> +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
> +   VMM(N) values.  */
> +#define VMM_hi_xmm0                            xmm16
> +#define VMM_hi_xmm1                            xmm17
> +#define VMM_hi_xmm2                            xmm18
> +#define VMM_hi_xmm3                            xmm19
> +#define VMM_hi_xmm4                            xmm20
> +#define VMM_hi_xmm5                            xmm21
> +#define VMM_hi_xmm6                            xmm22
> +#define VMM_hi_xmm7                            xmm23
> +#define VMM_hi_xmm8                            xmm24
> +#define VMM_hi_xmm9                            xmm25
> +#define VMM_hi_xmm10                   xmm26
> +#define VMM_hi_xmm11                   xmm27
> +#define VMM_hi_xmm12                   xmm28
> +#define VMM_hi_xmm13                   xmm29
> +#define VMM_hi_xmm14                   xmm30
> +#define VMM_hi_xmm15                   xmm31
> +
> +#define VMM_hi_ymm0                            ymm16
> +#define VMM_hi_ymm1                            ymm17
> +#define VMM_hi_ymm2                            ymm18
> +#define VMM_hi_ymm3                            ymm19
> +#define VMM_hi_ymm4                            ymm20
> +#define VMM_hi_ymm5                            ymm21
> +#define VMM_hi_ymm6                            ymm22
> +#define VMM_hi_ymm7                            ymm23
> +#define VMM_hi_ymm8                            ymm24
> +#define VMM_hi_ymm9                            ymm25
> +#define VMM_hi_ymm10                   ymm26
> +#define VMM_hi_ymm11                   ymm27
> +#define VMM_hi_ymm12                   ymm28
> +#define VMM_hi_ymm13                   ymm29
> +#define VMM_hi_ymm14                   ymm30
> +#define VMM_hi_ymm15                   ymm31
> +
> +#define VMM_hi_zmm0                            zmm16
> +#define VMM_hi_zmm1                            zmm17
> +#define VMM_hi_zmm2                            zmm18
> +#define VMM_hi_zmm3                            zmm19
> +#define VMM_hi_zmm4                            zmm20
> +#define VMM_hi_zmm5                            zmm21
> +#define VMM_hi_zmm6                            zmm22
> +#define VMM_hi_zmm7                            zmm23
> +#define VMM_hi_zmm8                            zmm24
> +#define VMM_hi_zmm9                            zmm25
> +#define VMM_hi_zmm10                   zmm26
> +#define VMM_hi_zmm11                   zmm27
> +#define VMM_hi_zmm12                   zmm28
> +#define VMM_hi_zmm13                   zmm29
> +#define VMM_hi_zmm14                   zmm30
> +#define VMM_hi_zmm15                   zmm31
> +
> +#define PRIMITIVE_VMM(vec, num)                vec##num
> +
> +#define VMM_any_xmm(i)                 PRIMITIVE_VMM(xmm, i)
> +#define VMM_any_ymm(i)                 PRIMITIVE_VMM(ymm, i)
> +#define VMM_any_zmm(i)                 PRIMITIVE_VMM(zmm, i)
> +
> +#define VMM_hi_xmm(i)                  PRIMITIVE_VMM(VMM_hi_xmm, i)
> +#define VMM_hi_ymm(i)                  PRIMITIVE_VMM(VMM_hi_ymm, i)
> +#define VMM_hi_zmm(i)                  PRIMITIVE_VMM(VMM_hi_zmm, i)
> +
> +#endif
> --
> 2.34.1
>

LGTM

--Sunil

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v10 5/6] x86: Remove now unused vec header macros.
  2022-10-15  3:00   ` [PATCH v10 5/6] x86: Remove now unused vec header macros Noah Goldstein
@ 2022-10-15  3:39     ` Sunil Pandey
  0 siblings, 0 replies; 72+ messages in thread
From: Sunil Pandey @ 2022-10-15  3:39 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha

On Fri, Oct 14, 2022 at 8:03 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/avx-rtm-vecs.h     | 35 --------
>  sysdeps/x86_64/multiarch/avx-vecs.h         | 47 -----------
>  sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 ---------
>  sysdeps/x86_64/multiarch/evex256-vecs.h     | 35 --------
>  sysdeps/x86_64/multiarch/evex512-vecs.h     | 35 --------
>  sysdeps/x86_64/multiarch/sse2-vecs.h        | 47 -----------
>  sysdeps/x86_64/multiarch/vec-macros.h       | 90 ---------------------
>  7 files changed, 328 deletions(-)
>  delete mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h
>  delete mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h
>  delete mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h
>  delete mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h
>  delete mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h
>  delete mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h
>  delete mode 100644 sysdeps/x86_64/multiarch/vec-macros.h
>
> diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
> deleted file mode 100644
> index 6ca9f5e6ba..0000000000
> --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
> +++ /dev/null
> @@ -1,35 +0,0 @@
> -/* Common config for AVX-RTM VECs
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _AVX_RTM_VECS_H
> -#define _AVX_RTM_VECS_H                        1
> -
> -#define COND_VZEROUPPER                        COND_VZEROUPPER_XTEST
> -#define ZERO_UPPER_VEC_REGISTERS_RETURN        \
> -       ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> -
> -#define VZEROUPPER_RETURN              jmp L(return_vzeroupper)
> -
> -#define USE_WITH_RTM                   1
> -#include "avx-vecs.h"
> -
> -#undef SECTION
> -#define SECTION(p)                             p##.avx.rtm
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h
> deleted file mode 100644
> index 89680f5db8..0000000000
> --- a/sysdeps/x86_64/multiarch/avx-vecs.h
> +++ /dev/null
> @@ -1,47 +0,0 @@
> -/* Common config for AVX VECs
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _AVX_VECS_H
> -#define _AVX_VECS_H                    1
> -
> -#ifdef VEC_SIZE
> -# error "Multiple VEC configs included!"
> -#endif
> -
> -#define VEC_SIZE                       32
> -#include "vec-macros.h"
> -
> -#define USE_WITH_AVX           1
> -#define SECTION(p)                     p##.avx
> -
> -/* 4-byte mov instructions with AVX2.  */
> -#define MOV_SIZE                       4
> -/* 1 (ret) + 3 (vzeroupper).  */
> -#define RET_SIZE                       4
> -#define VZEROUPPER                     vzeroupper
> -
> -#define VMOVU                          vmovdqu
> -#define VMOVA                          vmovdqa
> -#define VMOVNT                         vmovntdq
> -
> -/* Often need to access xmm portion.  */
> -#define VEC_xmm                                VEC_any_xmm
> -#define VEC                                    VEC_any_ymm
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h
> deleted file mode 100644
> index 99806ebcd7..0000000000
> --- a/sysdeps/x86_64/multiarch/evex-vecs-common.h
> +++ /dev/null
> @@ -1,39 +0,0 @@
> -/* Common config for EVEX256 and EVEX512 VECs
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _EVEX_VECS_COMMON_H
> -#define _EVEX_VECS_COMMON_H                    1
> -
> -#include "vec-macros.h"
> -
> -/* 6-byte mov instructions with EVEX.  */
> -#define MOV_SIZE                       6
> -/* No vzeroupper needed.  */
> -#define RET_SIZE                       1
> -#define VZEROUPPER
> -
> -#define VMOVU                          vmovdqu64
> -#define VMOVA                          vmovdqa64
> -#define VMOVNT                         vmovntdq
> -
> -#define VEC_xmm                                VEC_hi_xmm
> -#define VEC_ymm                                VEC_hi_ymm
> -#define VEC_zmm                                VEC_hi_zmm
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
> deleted file mode 100644
> index 222ba46dc7..0000000000
> --- a/sysdeps/x86_64/multiarch/evex256-vecs.h
> +++ /dev/null
> @@ -1,35 +0,0 @@
> -/* Common config for EVEX256 VECs
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _EVEX256_VECS_H
> -#define _EVEX256_VECS_H                        1
> -
> -#ifdef VEC_SIZE
> -# error "Multiple VEC configs included!"
> -#endif
> -
> -#define VEC_SIZE                       32
> -#include "evex-vecs-common.h"
> -
> -#define USE_WITH_EVEX256       1
> -#define SECTION(p)                     p##.evex
> -
> -#define VEC                                    VEC_ymm
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
> deleted file mode 100644
> index d1784d5368..0000000000
> --- a/sysdeps/x86_64/multiarch/evex512-vecs.h
> +++ /dev/null
> @@ -1,35 +0,0 @@
> -/* Common config for EVEX512 VECs
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _EVEX512_VECS_H
> -#define _EVEX512_VECS_H                        1
> -
> -#ifdef VEC_SIZE
> -# error "Multiple VEC configs included!"
> -#endif
> -
> -#define VEC_SIZE                       64
> -#include "evex-vecs-common.h"
> -
> -#define USE_WITH_EVEX512       1
> -#define SECTION(p)                     p##.evex512
> -
> -#define VEC                                    VEC_zmm
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h
> deleted file mode 100644
> index 2b77a59d56..0000000000
> --- a/sysdeps/x86_64/multiarch/sse2-vecs.h
> +++ /dev/null
> @@ -1,47 +0,0 @@
> -/* Common config for SSE2 VECs
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _SSE2_VECS_H
> -#define _SSE2_VECS_H                   1
> -
> -#ifdef VEC_SIZE
> -# error "Multiple VEC configs included!"
> -#endif
> -
> -#define VEC_SIZE                       16
> -#include "vec-macros.h"
> -
> -#define USE_WITH_SSE2          1
> -#define SECTION(p)                     p
> -
> -/* 3-byte mov instructions with SSE2.  */
> -#define MOV_SIZE                       3
> -/* No vzeroupper needed.  */
> -#define RET_SIZE                       1
> -#define VZEROUPPER
> -
> -#define VMOVU                          movups
> -#define VMOVA                          movaps
> -#define VMOVNT                         movntdq
> -
> -#define VEC_xmm                                VEC_any_xmm
> -#define VEC                                    VEC_any_xmm
> -
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h
> deleted file mode 100644
> index 9f3ffecede..0000000000
> --- a/sysdeps/x86_64/multiarch/vec-macros.h
> +++ /dev/null
> @@ -1,90 +0,0 @@
> -/* Macro helpers for VEC_{type}({vec_num})
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifndef _VEC_MACROS_H
> -#define _VEC_MACROS_H                  1
> -
> -#ifndef VEC_SIZE
> -# error "Never include this file directly. Always include a vector config."
> -#endif
> -
> -/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
> -   VEC(N) values.  */
> -#define VEC_hi_xmm0                            xmm16
> -#define VEC_hi_xmm1                            xmm17
> -#define VEC_hi_xmm2                            xmm18
> -#define VEC_hi_xmm3                            xmm19
> -#define VEC_hi_xmm4                            xmm20
> -#define VEC_hi_xmm5                            xmm21
> -#define VEC_hi_xmm6                            xmm22
> -#define VEC_hi_xmm7                            xmm23
> -#define VEC_hi_xmm8                            xmm24
> -#define VEC_hi_xmm9                            xmm25
> -#define VEC_hi_xmm10                   xmm26
> -#define VEC_hi_xmm11                   xmm27
> -#define VEC_hi_xmm12                   xmm28
> -#define VEC_hi_xmm13                   xmm29
> -#define VEC_hi_xmm14                   xmm30
> -#define VEC_hi_xmm15                   xmm31
> -
> -#define VEC_hi_ymm0                            ymm16
> -#define VEC_hi_ymm1                            ymm17
> -#define VEC_hi_ymm2                            ymm18
> -#define VEC_hi_ymm3                            ymm19
> -#define VEC_hi_ymm4                            ymm20
> -#define VEC_hi_ymm5                            ymm21
> -#define VEC_hi_ymm6                            ymm22
> -#define VEC_hi_ymm7                            ymm23
> -#define VEC_hi_ymm8                            ymm24
> -#define VEC_hi_ymm9                            ymm25
> -#define VEC_hi_ymm10                   ymm26
> -#define VEC_hi_ymm11                   ymm27
> -#define VEC_hi_ymm12                   ymm28
> -#define VEC_hi_ymm13                   ymm29
> -#define VEC_hi_ymm14                   ymm30
> -#define VEC_hi_ymm15                   ymm31
> -
> -#define VEC_hi_zmm0                            zmm16
> -#define VEC_hi_zmm1                            zmm17
> -#define VEC_hi_zmm2                            zmm18
> -#define VEC_hi_zmm3                            zmm19
> -#define VEC_hi_zmm4                            zmm20
> -#define VEC_hi_zmm5                            zmm21
> -#define VEC_hi_zmm6                            zmm22
> -#define VEC_hi_zmm7                            zmm23
> -#define VEC_hi_zmm8                            zmm24
> -#define VEC_hi_zmm9                            zmm25
> -#define VEC_hi_zmm10                   zmm26
> -#define VEC_hi_zmm11                   zmm27
> -#define VEC_hi_zmm12                   zmm28
> -#define VEC_hi_zmm13                   zmm29
> -#define VEC_hi_zmm14                   zmm30
> -#define VEC_hi_zmm15                   zmm31
> -
> -#define PRIMITIVE_VEC(vec, num)                vec##num
> -
> -#define VEC_any_xmm(i)                 PRIMITIVE_VEC(xmm, i)
> -#define VEC_any_ymm(i)                 PRIMITIVE_VEC(ymm, i)
> -#define VEC_any_zmm(i)                 PRIMITIVE_VEC(zmm, i)
> -
> -#define VEC_hi_xmm(i)                  PRIMITIVE_VEC(VEC_hi_xmm, i)
> -#define VEC_hi_ymm(i)                  PRIMITIVE_VEC(VEC_hi_ymm, i)
> -#define VEC_hi_zmm(i)                  PRIMITIVE_VEC(VEC_hi_zmm, i)
> -
> -#endif
> --
> 2.34.1
>

LGTM

--Sunil

^ permalink raw reply	[flat|nested] 72+ messages in thread
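
A quick orientation on the scheme being consolidated: the deleted vec-macros.h provides two
register-naming families, and the replacement x86-*-vecs.h headers presumably carry them
forward.  VEC_any_* keeps the architectural register number as written, while VEC_hi_* maps
the same number into the EVEX-only 16-31 range, which is what lets the EVEX configs define
VZEROUPPER to nothing.  Illustrative expansions, read directly off the definitions removed
above:

```
VEC_any_ymm (2)		/* -> ymm2  */
VEC_hi_ymm (2)		/* -> ymm18 */
VEC_any_zmm (2)		/* -> zmm2  */
VEC_hi_zmm (2)		/* -> zmm18 */
```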

* Re: [PATCH v10 4/6] x86: Update memset to use new VEC macros
  2022-10-15  3:00   ` [PATCH v10 4/6] x86: Update memset " Noah Goldstein
@ 2022-10-15  3:42     ` Sunil Pandey
  0 siblings, 0 replies; 72+ messages in thread
From: Sunil Pandey @ 2022-10-15  3:42 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha

On Fri, Oct 14, 2022 at 8:03 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Replace %VEC(n) -> %VMM(n)
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  .../memset-avx2-unaligned-erms-rtm.S          |  8 +--
>  .../multiarch/memset-avx2-unaligned-erms.S    | 14 +---
>  .../multiarch/memset-avx512-unaligned-erms.S  | 20 +-----
>  .../multiarch/memset-evex-unaligned-erms.S    | 20 +-----
>  .../multiarch/memset-sse2-unaligned-erms.S    | 10 +--
>  .../multiarch/memset-vec-unaligned-erms.S     | 70 ++++++++-----------
>  6 files changed, 43 insertions(+), 99 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> index 8ac3e479bb..bc8605faf3 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> @@ -1,10 +1,6 @@
> -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> -  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +#include "x86-avx-rtm-vecs.h"
>
> -#define VZEROUPPER_RETURN jmp   L(return)
> -
> -#define SECTION(p) p##.avx.rtm
>  #define MEMSET_SYMBOL(p,s)     p##_avx2_##s##_rtm
>  #define WMEMSET_SYMBOL(p,s)    p##_avx2_##s##_rtm
>
> -#include "memset-avx2-unaligned-erms.S"
> +# include "memset-avx2-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index a9054a9122..47cf5072a4 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -4,14 +4,9 @@
>
>  # define USE_WITH_AVX2 1
>
> -# define VEC_SIZE      32
> -# define MOV_SIZE      4
> -# define RET_SIZE      4
> -
> -# define VEC(i)                ymm##i
> -
> -# define VMOVU     vmovdqu
> -# define VMOVA     vmovdqa
> +# ifndef VEC_SIZE
> +#  include "x86-avx-vecs.h"
> +# endif
>
>  # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    vmovd d, %xmm0; \
> @@ -26,9 +21,6 @@
>  # define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
>  # define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
>
> -# ifndef SECTION
> -#  define SECTION(p)           p##.avx
> -# endif
>  # ifndef MEMSET_SYMBOL
>  #  define MEMSET_SYMBOL(p,s)   p##_avx2_##s
>  # endif
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index 47623b8ee8..84145b6c27 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -4,26 +4,14 @@
>
>  # define USE_WITH_AVX512       1
>
> -# define VEC_SIZE      64
> -# define MOV_SIZE      6
> -# define RET_SIZE      1
> -
> -# define XMM0          xmm16
> -# define YMM0          ymm16
> -# define VEC0          zmm16
> -# define VEC(i)                VEC##i
> -
> -# define VMOVU     vmovdqu64
> -# define VMOVA     vmovdqa64
> -
> -# define VZEROUPPER
> +# include "x86-evex512-vecs.h"
>
>  # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> -  vpbroadcastb d, %VEC0; \
> +  vpbroadcastb d, %VMM(0); \
>    movq r, %rax
>
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> -  vpbroadcastd d, %VEC0; \
> +  vpbroadcastd d, %VMM(0); \
>    movq r, %rax
>
>  # define MEMSET_VDUP_TO_VEC0_HIGH()
> @@ -32,8 +20,6 @@
>  # define WMEMSET_VDUP_TO_VEC0_HIGH()
>  # define WMEMSET_VDUP_TO_VEC0_LOW()
>
> -# define SECTION(p)            p##.evex512
> -
>  #ifndef MEMSET_SYMBOL
>  # define MEMSET_SYMBOL(p,s)    p##_avx512_##s
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index ac4b2d2d50..1f03b26bf8 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -4,26 +4,14 @@
>
>  # define USE_WITH_EVEX 1
>
> -# define VEC_SIZE      32
> -# define MOV_SIZE      6
> -# define RET_SIZE      1
> -
> -# define XMM0          xmm16
> -# define YMM0          ymm16
> -# define VEC0          ymm16
> -# define VEC(i)                VEC##i
> -
> -# define VMOVU     vmovdqu64
> -# define VMOVA     vmovdqa64
> -
> -# define VZEROUPPER
> +# include "x86-evex256-vecs.h"
>
>  # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> -  vpbroadcastb d, %VEC0; \
> +  vpbroadcastb d, %VMM(0); \
>    movq r, %rax
>
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> -  vpbroadcastd d, %VEC0; \
> +  vpbroadcastd d, %VMM(0); \
>    movq r, %rax
>
>  # define MEMSET_VDUP_TO_VEC0_HIGH()
> @@ -32,8 +20,6 @@
>  # define WMEMSET_VDUP_TO_VEC0_HIGH()
>  # define WMEMSET_VDUP_TO_VEC0_LOW()
>
> -# define SECTION(p)            p##.evex
> -
>  #ifndef MEMSET_SYMBOL
>  # define MEMSET_SYMBOL(p,s)    p##_evex_##s
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> index 44f9b8888b..34b245d8ca 100644
> --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> @@ -26,13 +26,7 @@
>  # include <sysdep.h>
>  # define USE_WITH_SSE2 1
>
> -# define VEC_SIZE      16
> -# define MOV_SIZE      3
> -# define RET_SIZE      1
> -
> -# define VEC(i)                xmm##i
> -# define VMOVU     movups
> -# define VMOVA     movaps
> +# include "x86-sse2-vecs.h"
>
>  # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    movd d, %xmm0; \
> @@ -52,8 +46,6 @@
>  # define WMEMSET_VDUP_TO_VEC0_HIGH()
>  # define WMEMSET_VDUP_TO_VEC0_LOW()
>
> -# define SECTION(p)            p
> -
>  # ifndef MEMSET_SYMBOL
>  #  define MEMSET_SYMBOL(p,s)   p##_sse2_##s
>  # endif
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 905d0fa464..03de0ab907 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -34,14 +34,6 @@
>  # define WMEMSET_CHK_SYMBOL(p,s)       WMEMSET_SYMBOL(p, s)
>  #endif
>
> -#ifndef XMM0
> -# define XMM0                          xmm0
> -#endif
> -
> -#ifndef YMM0
> -# define YMM0                          ymm0
> -#endif
> -
>  #ifndef VZEROUPPER
>  # if VEC_SIZE > 16
>  #  define VZEROUPPER                   vzeroupper
> @@ -150,8 +142,8 @@ L(entry_from_wmemset):
>         cmpq    $(VEC_SIZE * 2), %rdx
>         ja      L(more_2x_vec)
>         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> -       VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
> -       VMOVU   %VEC(0), (%rdi)
> +       VMOVU   %VMM(0), -VEC_SIZE(%rdi,%rdx)
> +       VMOVU   %VMM(0), (%rdi)
>         VZEROUPPER_RETURN
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  END (MEMSET_SYMBOL (__memset, unaligned))
> @@ -175,19 +167,19 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(stosb_more_2x_vec)
>         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
>         VZEROUPPER_RETURN
>  #endif
>
>         .p2align 4,, 4
>  L(last_2x_vec):
>  #ifdef USE_LESS_VEC_MASK_STORE
> -       VMOVU   %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
> -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> +       VMOVU   %VMM(0), (VEC_SIZE * -2)(%rdi, %rdx)
> +       VMOVU   %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
>  #else
> -       VMOVU   %VEC(0), (VEC_SIZE * -2)(%rdi)
> -       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi)
> +       VMOVU   %VMM(0), (VEC_SIZE * -2)(%rdi)
> +       VMOVU   %VMM(0), (VEC_SIZE * -1)(%rdi)
>  #endif
>         VZEROUPPER_RETURN
>
> @@ -221,7 +213,7 @@ L(less_vec_from_wmemset):
>         bzhil   %edx, %ecx, %ecx
>         kmovd   %ecx, %k1
>  # endif
> -       vmovdqu8 %VEC(0), (%rax){%k1}
> +       vmovdqu8 %VMM(0), (%rax){%k1}
>         VZEROUPPER_RETURN
>
>  # if defined USE_MULTIARCH && IS_IN (libc)
> @@ -249,8 +241,8 @@ L(stosb_more_2x_vec):
>            and (4x, 8x] jump to target.  */
>  L(more_2x_vec):
>         /* Store next 2x vec regardless.  */
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(0), (VEC_SIZE * 1)(%rdi)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(0), (VEC_SIZE * 1)(%rdi)
>
>
>         /* Two different methods of setting up pointers / compare. The two
> @@ -278,8 +270,8 @@ L(more_2x_vec):
>  #endif
>
>         /* Store next 2x vec regardless.  */
> -       VMOVU   %VEC(0), (VEC_SIZE * 2)(%rax)
> -       VMOVU   %VEC(0), (VEC_SIZE * 3)(%rax)
> +       VMOVU   %VMM(0), (VEC_SIZE * 2)(%rax)
> +       VMOVU   %VMM(0), (VEC_SIZE * 3)(%rax)
>
>
>  #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> @@ -304,20 +296,20 @@ L(more_2x_vec):
>         andq    $(VEC_SIZE * -2), %LOOP_REG
>         .p2align 4
>  L(loop):
> -       VMOVA   %VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
> -       VMOVA   %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
> -       VMOVA   %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
> -       VMOVA   %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
> +       VMOVA   %VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
> +       VMOVA   %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
> +       VMOVA   %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
> +       VMOVA   %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
>         subq    $-(VEC_SIZE * 4), %LOOP_REG
>         cmpq    %END_REG, %LOOP_REG
>         jb      L(loop)
>         .p2align 4,, MOV_SIZE
>  L(last_4x_vec):
> -       VMOVU   %VEC(0), LOOP_4X_OFFSET(%END_REG)
> -       VMOVU   %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
> -       VMOVU   %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
> -       VMOVU   %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
> -L(return):
> +       VMOVU   %VMM(0), LOOP_4X_OFFSET(%END_REG)
> +       VMOVU   %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
> +       VMOVU   %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
> +       VMOVU   %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
> +L(return_vzeroupper):
>  #if VEC_SIZE > 16
>         ZERO_UPPER_VEC_REGISTERS_RETURN
>  #else
> @@ -355,7 +347,7 @@ L(cross_page):
>         jge     L(between_16_31)
>  #endif
>  #ifndef USE_XMM_LESS_VEC
> -       MOVQ    %XMM0, %SET_REG64
> +       MOVQ    %VMM_128(0), %SET_REG64
>  #endif
>         cmpl    $8, %edx
>         jge     L(between_8_15)
> @@ -374,8 +366,8 @@ L(between_0_0):
>         .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
>         /* From 32 to 63.  No branch when size == 32.  */
>  L(between_32_63):
> -       VMOVU   %YMM0, (%LESS_VEC_REG)
> -       VMOVU   %YMM0, -32(%LESS_VEC_REG, %rdx)
> +       VMOVU   %VMM_256(0), (%LESS_VEC_REG)
> +       VMOVU   %VMM_256(0), -32(%LESS_VEC_REG, %rdx)
>         VZEROUPPER_RETURN
>  #endif
>
> @@ -383,8 +375,8 @@ L(between_32_63):
>         .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
>  L(between_16_31):
>         /* From 16 to 31.  No branch when size == 16.  */
> -       VMOVU   %XMM0, (%LESS_VEC_REG)
> -       VMOVU   %XMM0, -16(%LESS_VEC_REG, %rdx)
> +       VMOVU   %VMM_128(0), (%LESS_VEC_REG)
> +       VMOVU   %VMM_128(0), -16(%LESS_VEC_REG, %rdx)
>         ret
>  #endif
>
> @@ -394,8 +386,8 @@ L(between_16_31):
>  L(between_8_15):
>         /* From 8 to 15.  No branch when size == 8.  */
>  #ifdef USE_XMM_LESS_VEC
> -       MOVQ    %XMM0, (%rdi)
> -       MOVQ    %XMM0, -8(%rdi, %rdx)
> +       MOVQ    %VMM_128(0), (%rdi)
> +       MOVQ    %VMM_128(0), -8(%rdi, %rdx)
>  #else
>         movq    %SET_REG64, (%LESS_VEC_REG)
>         movq    %SET_REG64, -8(%LESS_VEC_REG, %rdx)
> @@ -408,8 +400,8 @@ L(between_8_15):
>  L(between_4_7):
>         /* From 4 to 7.  No branch when size == 4.  */
>  #ifdef USE_XMM_LESS_VEC
> -       MOVD    %XMM0, (%rdi)
> -       MOVD    %XMM0, -4(%rdi, %rdx)
> +       MOVD    %VMM_128(0), (%rdi)
> +       MOVD    %VMM_128(0), -4(%rdi, %rdx)
>  #else
>         movl    %SET_REG32, (%LESS_VEC_REG)
>         movl    %SET_REG32, -4(%LESS_VEC_REG, %rdx)
> --
> 2.34.1
>

LGTM

--Sunil

^ permalink raw reply	[flat|nested] 72+ messages in thread
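
One detail worth spelling out: VMM_128(n) and VMM_256(n) are the xmm and ymm views of
whatever register number VMM(n) selects, which is what makes the old per-file XMM0/YMM0
defines unnecessary.  A minimal sketch of the expected expansions, using lines from the
patch (the authoritative definitions live in the generated x86-*-vecs.h headers, not here):

```
	/* With x86-evex256-vecs.h these resolve to ymm16 / ymm16 / xmm16,
	   matching the old VEC0 / YMM0 / XMM0 defines; with the avx2
	   config, to ymm0 / ymm0 / xmm0.  */
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM_256(0), (%LESS_VEC_REG)
	VMOVU	%VMM_128(0), (%LESS_VEC_REG)
	MOVQ	%VMM_128(0), %SET_REG64
```

Keeping the register number fixed across widths is what allows the "does not change
libc.so" claim: only the source spelling changes.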

* Re: [PATCH v10 3/6] x86: Update memmove to use new VEC macros
  2022-10-15  3:00   ` [PATCH v10 3/6] x86: Update memmove " Noah Goldstein
@ 2022-10-15  3:43     ` Sunil Pandey
  0 siblings, 0 replies; 72+ messages in thread
From: Sunil Pandey @ 2022-10-15  3:43 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha

On Fri, Oct 14, 2022 at 8:02 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Replace %VEC(n) -> %VMM(n)
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  .../memmove-avx-unaligned-erms-rtm.S          |  11 +-
>  .../multiarch/memmove-avx-unaligned-erms.S    |   9 +-
>  .../multiarch/memmove-avx512-unaligned-erms.S |  30 +-
>  .../multiarch/memmove-evex-unaligned-erms.S   |  30 +-
>  .../multiarch/memmove-sse2-unaligned-erms.S   |  11 +-
>  .../multiarch/memmove-vec-unaligned-erms.S    | 262 +++++++++---------
>  6 files changed, 132 insertions(+), 221 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> index 67a55f0c85..20746e6713 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> @@ -1,16 +1,7 @@
>  #if IS_IN (libc)
> -# define VEC_SIZE      32
> -# define VEC(i)                ymm##i
> -# define VMOVNT                vmovntdq
> -# define VMOVU         vmovdqu
> -# define VMOVA         vmovdqa
> -# define MOV_SIZE      4
> -# define ZERO_UPPER_VEC_REGISTERS_RETURN \
> -  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
>
> -# define VZEROUPPER_RETURN jmp  L(return)
> +# include "x86-avx-rtm-vecs.h"
>
> -# define SECTION(p)            p##.avx.rtm
>  # define MEMMOVE_SYMBOL(p,s)   p##_avx_##s##_rtm
>
>  # include "memmove-vec-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> index a14b155667..4e4b4635f9 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> @@ -2,14 +2,7 @@
>
>  #if ISA_SHOULD_BUILD (3)
>
> -# define VEC_SIZE      32
> -# define VEC(i)                ymm##i
> -# define VMOVNT                vmovntdq
> -# define VMOVU         vmovdqu
> -# define VMOVA         vmovdqa
> -# define MOV_SIZE      4
> -
> -# define SECTION(p)            p##.avx
> +# include "x86-avx-vecs.h"
>
>  # ifndef MEMMOVE_SYMBOL
>  #  define MEMMOVE_SYMBOL(p,s)  p##_avx_##s
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> index 8d1568a7ba..cca97e38f8 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> @@ -2,35 +2,7 @@
>
>  #if ISA_SHOULD_BUILD (4)
>
> -# define VEC_SIZE      64
> -# define XMM0          xmm16
> -# define XMM1          xmm17
> -# define YMM0          ymm16
> -# define YMM1          ymm17
> -# define VEC0          zmm16
> -# define VEC1          zmm17
> -# define VEC2          zmm18
> -# define VEC3          zmm19
> -# define VEC4          zmm20
> -# define VEC5          zmm21
> -# define VEC6          zmm22
> -# define VEC7          zmm23
> -# define VEC8          zmm24
> -# define VEC9          zmm25
> -# define VEC10         zmm26
> -# define VEC11         zmm27
> -# define VEC12         zmm28
> -# define VEC13         zmm29
> -# define VEC14         zmm30
> -# define VEC15         zmm31
> -# define VEC(i)                VEC##i
> -# define VMOVNT                vmovntdq
> -# define VMOVU         vmovdqu64
> -# define VMOVA         vmovdqa64
> -# define VZEROUPPER
> -# define MOV_SIZE      6
> -
> -# define SECTION(p)            p##.evex512
> +# include "x86-evex512-vecs.h"
>
>  # ifndef MEMMOVE_SYMBOL
>  #  define MEMMOVE_SYMBOL(p,s)  p##_avx512_##s
> diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> index 2373017358..1f7b5715f7 100644
> --- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> @@ -2,35 +2,7 @@
>
>  #if ISA_SHOULD_BUILD (4)
>
> -# define VEC_SIZE      32
> -# define XMM0          xmm16
> -# define XMM1          xmm17
> -# define YMM0          ymm16
> -# define YMM1          ymm17
> -# define VEC0          ymm16
> -# define VEC1          ymm17
> -# define VEC2          ymm18
> -# define VEC3          ymm19
> -# define VEC4          ymm20
> -# define VEC5          ymm21
> -# define VEC6          ymm22
> -# define VEC7          ymm23
> -# define VEC8          ymm24
> -# define VEC9          ymm25
> -# define VEC10         ymm26
> -# define VEC11         ymm27
> -# define VEC12         ymm28
> -# define VEC13         ymm29
> -# define VEC14         ymm30
> -# define VEC15         ymm31
> -# define VEC(i)                VEC##i
> -# define VMOVNT                vmovntdq
> -# define VMOVU         vmovdqu64
> -# define VMOVA         vmovdqa64
> -# define VZEROUPPER
> -# define MOV_SIZE      6
> -
> -# define SECTION(p)            p##.evex
> +# include "x86-evex256-vecs.h"
>
>  # ifndef MEMMOVE_SYMBOL
>  #  define MEMMOVE_SYMBOL(p,s)  p##_evex_##s
> diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
> index 422a079902..8431bcd000 100644
> --- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
> @@ -22,18 +22,9 @@
>     so we need this to build for ISA V2 builds. */
>  #if ISA_SHOULD_BUILD (2)
>
> -# include <sysdep.h>
> +# include "x86-sse2-vecs.h"
>
> -# define VEC_SIZE      16
> -# define VEC(i)                xmm##i
>  # define PREFETCHNT    prefetchnta
> -# define VMOVNT                movntdq
> -/* Use movups and movaps for smaller code sizes.  */
> -# define VMOVU         movups
> -# define VMOVA         movaps
> -# define MOV_SIZE      3
> -
> -# define SECTION(p)            p
>
>  # ifndef MEMMOVE_SYMBOL
>  #  define MEMMOVE_SYMBOL(p,s)  p##_sse2_##s
> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index 04747133b7..5b758cae5e 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -60,14 +60,6 @@
>  # define MEMMOVE_CHK_SYMBOL(p,s)       MEMMOVE_SYMBOL(p, s)
>  #endif
>
> -#ifndef XMM0
> -# define XMM0                          xmm0
> -#endif
> -
> -#ifndef YMM0
> -# define YMM0                          ymm0
> -#endif
> -
>  #ifndef VZEROUPPER
>  # if VEC_SIZE > 16
>  #  define VZEROUPPER vzeroupper
> @@ -225,13 +217,13 @@ L(start):
>         cmp     $VEC_SIZE, %RDX_LP
>         jb      L(less_vec)
>         /* Load regardless.  */
> -       VMOVU   (%rsi), %VEC(0)
> +       VMOVU   (%rsi), %VMM(0)
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(more_2x_vec)
>         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> +       VMOVU   -VEC_SIZE(%rsi,%rdx), %VMM(1)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(1), -VEC_SIZE(%rdi,%rdx)
>  #if !(defined USE_MULTIARCH && IS_IN (libc))
>         ZERO_UPPER_VEC_REGISTERS_RETURN
>  #else
> @@ -270,15 +262,15 @@ L(start_erms):
>         cmp     $VEC_SIZE, %RDX_LP
>         jb      L(less_vec)
>         /* Load regardless.  */
> -       VMOVU   (%rsi), %VEC(0)
> +       VMOVU   (%rsi), %VMM(0)
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(movsb_more_2x_vec)
>         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
>          */
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(1)
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), -VEC_SIZE(%rdi, %rdx)
> -L(return):
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(1)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(1), -VEC_SIZE(%rdi, %rdx)
> +L(return_vzeroupper):
>  # if VEC_SIZE > 16
>         ZERO_UPPER_VEC_REGISTERS_RETURN
>  # else
> @@ -359,10 +351,10 @@ L(between_16_31):
>         .p2align 4,, 10
>  L(between_32_63):
>         /* From 32 to 63.  No branch when size == 32.  */
> -       VMOVU   (%rsi), %YMM0
> -       VMOVU   -32(%rsi, %rdx), %YMM1
> -       VMOVU   %YMM0, (%rdi)
> -       VMOVU   %YMM1, -32(%rdi, %rdx)
> +       VMOVU   (%rsi), %VMM_256(0)
> +       VMOVU   -32(%rsi, %rdx), %VMM_256(1)
> +       VMOVU   %VMM_256(0), (%rdi)
> +       VMOVU   %VMM_256(1), -32(%rdi, %rdx)
>         VZEROUPPER_RETURN
>  #endif
>
> @@ -380,12 +372,12 @@ L(last_4x_vec):
>         /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
>
>         /* VEC(0) and VEC(1) have already been loaded.  */
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(2)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVU   %VEC(2), -VEC_SIZE(%rdi, %rdx)
> -       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(2)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(1), VEC_SIZE(%rdi)
> +       VMOVU   %VMM(2), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   %VMM(3), -(VEC_SIZE * 2)(%rdi, %rdx)
>         VZEROUPPER_RETURN
>
>         .p2align 4
> @@ -400,24 +392,24 @@ L(more_2x_vec):
>         cmpq    $(VEC_SIZE * 8), %rdx
>         ja      L(more_8x_vec)
>         /* Load VEC(1) regardless. VEC(0) has already been loaded.  */
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
>         cmpq    $(VEC_SIZE * 4), %rdx
>         jbe     L(last_4x_vec)
>         /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(4)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
> -       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
> -       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
> -       VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> -       VMOVU   %VEC(4), -VEC_SIZE(%rdi, %rdx)
> -       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> -       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> -       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(4)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(5)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(6)
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(7)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(1), VEC_SIZE(%rdi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
> +       VMOVU   %VMM(4), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   %VMM(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VMM(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> +       VMOVU   %VMM(7), -(VEC_SIZE * 4)(%rdi, %rdx)
>         VZEROUPPER_RETURN
>
>         .p2align 4,, 4
> @@ -466,14 +458,14 @@ L(more_8x_vec_forward):
>          */
>
>         /* First vec was already loaded into VEC(0).  */
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(5)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(5)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
>         /* Save begining of dst.  */
>         movq    %rdi, %rcx
>         /* Align dst to VEC_SIZE - 1.  */
>         orq     $(VEC_SIZE - 1), %rdi
> -       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
> -       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(7)
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(8)
>
>         /* Subtract dst from src. Add back after dst aligned.  */
>         subq    %rcx, %rsi
> @@ -488,25 +480,25 @@ L(more_8x_vec_forward):
>         .p2align 4,, 11
>  L(loop_4x_vec_forward):
>         /* Copy 4 * VEC a time forward.  */
> -       VMOVU   (%rsi), %VEC(1)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(3)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(4)
> +       VMOVU   (%rsi), %VMM(1)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(2)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(3)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(4)
>         subq    $-(VEC_SIZE * 4), %rsi
> -       VMOVA   %VEC(1), (%rdi)
> -       VMOVA   %VEC(2), VEC_SIZE(%rdi)
> -       VMOVA   %VEC(3), (VEC_SIZE * 2)(%rdi)
> -       VMOVA   %VEC(4), (VEC_SIZE * 3)(%rdi)
> +       VMOVA   %VMM(1), (%rdi)
> +       VMOVA   %VMM(2), VEC_SIZE(%rdi)
> +       VMOVA   %VMM(3), (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VMM(4), (VEC_SIZE * 3)(%rdi)
>         subq    $-(VEC_SIZE * 4), %rdi
>         cmpq    %rdi, %rdx
>         ja      L(loop_4x_vec_forward)
>         /* Store the last 4 * VEC.  */
> -       VMOVU   %VEC(5), (VEC_SIZE * 3)(%rdx)
> -       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdx)
> -       VMOVU   %VEC(7), VEC_SIZE(%rdx)
> -       VMOVU   %VEC(8), (%rdx)
> +       VMOVU   %VMM(5), (VEC_SIZE * 3)(%rdx)
> +       VMOVU   %VMM(6), (VEC_SIZE * 2)(%rdx)
> +       VMOVU   %VMM(7), VEC_SIZE(%rdx)
> +       VMOVU   %VMM(8), (%rdx)
>         /* Store the first VEC.  */
> -       VMOVU   %VEC(0), (%rcx)
> +       VMOVU   %VMM(0), (%rcx)
>         /* Keep L(nop_backward) target close to jmp for 2-byte encoding.
>          */
>  L(nop_backward):
> @@ -523,12 +515,12 @@ L(more_8x_vec_backward):
>            addresses.  */
>
>         /* First vec was also loaded into VEC(0).  */
> -       VMOVU   VEC_SIZE(%rsi), %VEC(5)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(6)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(5)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(6)
>         /* Begining of region for 4x backward copy stored in rcx.  */
>         leaq    (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(7)
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(8)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(7)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(8)
>         /* Subtract dst from src. Add back after dst aligned.  */
>         subq    %rdi, %rsi
>         /* Align dst.  */
> @@ -540,25 +532,25 @@ L(more_8x_vec_backward):
>         .p2align 4,, 11
>  L(loop_4x_vec_backward):
>         /* Copy 4 * VEC a time backward.  */
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(1)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 1)(%rsi), %VEC(3)
> -       VMOVU   (VEC_SIZE * 0)(%rsi), %VEC(4)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(1)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   (VEC_SIZE * 1)(%rsi), %VMM(3)
> +       VMOVU   (VEC_SIZE * 0)(%rsi), %VMM(4)
>         addq    $(VEC_SIZE * -4), %rsi
> -       VMOVA   %VEC(1), (VEC_SIZE * 3)(%rcx)
> -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rcx)
> -       VMOVA   %VEC(3), (VEC_SIZE * 1)(%rcx)
> -       VMOVA   %VEC(4), (VEC_SIZE * 0)(%rcx)
> +       VMOVA   %VMM(1), (VEC_SIZE * 3)(%rcx)
> +       VMOVA   %VMM(2), (VEC_SIZE * 2)(%rcx)
> +       VMOVA   %VMM(3), (VEC_SIZE * 1)(%rcx)
> +       VMOVA   %VMM(4), (VEC_SIZE * 0)(%rcx)
>         addq    $(VEC_SIZE * -4), %rcx
>         cmpq    %rcx, %rdi
>         jb      L(loop_4x_vec_backward)
>         /* Store the first 4 * VEC.  */
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(5), VEC_SIZE(%rdi)
> -       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
> -       VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
> +       VMOVU   %VMM(0), (%rdi)
> +       VMOVU   %VMM(5), VEC_SIZE(%rdi)
> +       VMOVU   %VMM(6), (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VMM(7), (VEC_SIZE * 3)(%rdi)
>         /* Store the last VEC.  */
> -       VMOVU   %VEC(8), -VEC_SIZE(%rdx, %rdi)
> +       VMOVU   %VMM(8), -VEC_SIZE(%rdx, %rdi)
>         VZEROUPPER_RETURN
>
>  #if defined USE_MULTIARCH && IS_IN (libc)
> @@ -568,7 +560,7 @@ L(loop_4x_vec_backward):
>  # if ALIGN_MOVSB
>  L(skip_short_movsb_check):
>  #  if MOVSB_ALIGN_TO > VEC_SIZE
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
>  #  endif
>  #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
>  #   error Unsupported MOVSB_ALIGN_TO
> @@ -597,9 +589,9 @@ L(skip_short_movsb_check):
>
>         rep     movsb
>
> -       VMOVU   %VEC(0), (%r8)
> +       VMOVU   %VMM(0), (%r8)
>  #  if MOVSB_ALIGN_TO > VEC_SIZE
> -       VMOVU   %VEC(1), VEC_SIZE(%r8)
> +       VMOVU   %VMM(1), VEC_SIZE(%r8)
>  #  endif
>         VZEROUPPER_RETURN
>  # endif
> @@ -640,7 +632,7 @@ L(movsb):
>  # endif
>  # if ALIGN_MOVSB
>  #  if MOVSB_ALIGN_TO > VEC_SIZE
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
>  #  endif
>  #  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
>  #   error Unsupported MOVSB_ALIGN_TO
> @@ -664,9 +656,9 @@ L(movsb_align_dst):
>         rep     movsb
>
>         /* Store VECs loaded for aligning.  */
> -       VMOVU   %VEC(0), (%r8)
> +       VMOVU   %VMM(0), (%r8)
>  #  if MOVSB_ALIGN_TO > VEC_SIZE
> -       VMOVU   %VEC(1), VEC_SIZE(%r8)
> +       VMOVU   %VMM(1), VEC_SIZE(%r8)
>  #  endif
>         VZEROUPPER_RETURN
>  # else /* !ALIGN_MOVSB.  */
> @@ -701,18 +693,18 @@ L(large_memcpy_2x):
>
>         /* First vec was also loaded into VEC(0).  */
>  # if VEC_SIZE < 64
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
>  #  if VEC_SIZE < 32
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
>  #  endif
>  # endif
> -       VMOVU   %VEC(0), (%rdi)
> +       VMOVU   %VMM(0), (%rdi)
>  # if VEC_SIZE < 64
> -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> +       VMOVU   %VMM(1), VEC_SIZE(%rdi)
>  #  if VEC_SIZE < 32
> -       VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
> -       VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +       VMOVU   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VMM(3), (VEC_SIZE * 3)(%rdi)
>  #  endif
>  # endif
>
> @@ -761,12 +753,12 @@ L(loop_large_memcpy_2x_inner):
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
>         /* Load vectors from rsi.  */
> -       LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> -       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +       LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
> +       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
>         subq    $-LARGE_LOAD_SIZE, %rsi
>         /* Non-temporal store vectors to rdi.  */
> -       STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> -       STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +       STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
> +       STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
>         subq    $-LARGE_LOAD_SIZE, %rdi
>         decl    %ecx
>         jnz     L(loop_large_memcpy_2x_inner)
> @@ -785,31 +777,31 @@ L(loop_large_memcpy_2x_tail):
>         /* Copy 4 * VEC a time forward with non-temporal stores.  */
>         PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> +       VMOVU   (%rsi), %VMM(0)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
>         subq    $-(VEC_SIZE * 4), %rsi
>         addl    $-(VEC_SIZE * 4), %edx
> -       VMOVA   %VEC(0), (%rdi)
> -       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> -       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +       VMOVA   %VMM(0), (%rdi)
> +       VMOVA   %VMM(1), VEC_SIZE(%rdi)
> +       VMOVA   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VMM(3), (VEC_SIZE * 3)(%rdi)
>         subq    $-(VEC_SIZE * 4), %rdi
>         cmpl    $(VEC_SIZE * 4), %edx
>         ja      L(loop_large_memcpy_2x_tail)
>
>  L(large_memcpy_2x_end):
>         /* Store the last 4 * VEC.  */
> -       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> -       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> -
> -       VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> -       VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> -       VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> -       VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(3)
> +
> +       VMOVU   %VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> +       VMOVU   %VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> +       VMOVU   %VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VMM(3), -VEC_SIZE(%rdi, %rdx)
>         VZEROUPPER_RETURN
>
>         .p2align 4
> @@ -831,16 +823,16 @@ L(loop_large_memcpy_4x_inner):
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
>         /* Load vectors from rsi.  */
> -       LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> -       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> -       LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> -       LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> +       LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
> +       LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
> +       LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
> +       LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
>         subq    $-LARGE_LOAD_SIZE, %rsi
>         /* Non-temporal store vectors to rdi.  */
> -       STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> -       STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> -       STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> -       STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> +       STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
> +       STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
> +       STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
> +       STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
>         subq    $-LARGE_LOAD_SIZE, %rdi
>         decl    %ecx
>         jnz     L(loop_large_memcpy_4x_inner)
> @@ -858,31 +850,31 @@ L(loop_large_memcpy_4x_tail):
>         /* Copy 4 * VEC a time forward with non-temporal stores.  */
>         PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> +       VMOVU   (%rsi), %VMM(0)
> +       VMOVU   VEC_SIZE(%rsi), %VMM(1)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(2)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(3)
>         subq    $-(VEC_SIZE * 4), %rsi
>         addl    $-(VEC_SIZE * 4), %edx
> -       VMOVA   %VEC(0), (%rdi)
> -       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> -       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +       VMOVA   %VMM(0), (%rdi)
> +       VMOVA   %VMM(1), VEC_SIZE(%rdi)
> +       VMOVA   %VMM(2), (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VMM(3), (VEC_SIZE * 3)(%rdi)
>         subq    $-(VEC_SIZE * 4), %rdi
>         cmpl    $(VEC_SIZE * 4), %edx
>         ja      L(loop_large_memcpy_4x_tail)
>
>  L(large_memcpy_4x_end):
>         /* Store the last 4 * VEC.  */
> -       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> -       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> -       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> -
> -       VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> -       VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> -       VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> -       VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VMM(3)
> +
> +       VMOVU   %VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> +       VMOVU   %VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> +       VMOVU   %VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VMM(3), -VEC_SIZE(%rdi, %rdx)
>         VZEROUPPER_RETURN
>  #endif
>  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> --
> 2.34.1
>

LGTM

--Sunil

^ permalink raw reply	[flat|nested] 72+ messages in thread
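
Since the change is purely a respelling, the per-configuration expansion of the new macro
is pinned down by the defines deleted above together with the "does not change libc.so"
claim, roughly:

```
Included config header   VEC_SIZE   VMM(1) assembles as
x86-sse2-vecs.h          16         xmm1
x86-avx-vecs.h           32         ymm1
x86-evex256-vecs.h       32         ymm17
x86-evex512-vecs.h       64         zmm17
```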

* Re: [PATCH v10 2/6] x86: Update memrchr to use new VEC macros
  2022-10-15  3:00   ` [PATCH v10 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
@ 2022-10-15  3:44     ` Sunil Pandey
  0 siblings, 0 replies; 72+ messages in thread
From: Sunil Pandey @ 2022-10-15  3:44 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha

On Fri, Oct 14, 2022 at 8:01 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Replace %VEC(n) -> %VMM(n)
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/memrchr-evex.S | 42 ++++++++++++-------------
>  1 file changed, 21 insertions(+), 21 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
> index ea3a0a0a60..550b328c5a 100644
> --- a/sysdeps/x86_64/multiarch/memrchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
> @@ -21,7 +21,7 @@
>  #if ISA_SHOULD_BUILD (4)
>
>  # include <sysdep.h>
> -# include "evex256-vecs.h"
> +# include "x86-evex256-vecs.h"
>  # if VEC_SIZE != 32
>  #  error "VEC_SIZE != 32 unimplemented"
>  # endif
> @@ -31,7 +31,7 @@
>  # endif
>
>  # define PAGE_SIZE                     4096
> -# define VECMATCH                      VEC(0)
> +# define VMMMATCH                      VMM(0)
>
>         .section SECTION(.text), "ax", @progbits
>  ENTRY_P2ALIGN(MEMRCHR, 6)
> @@ -47,7 +47,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
>            correct page cross check and 2) it correctly sets up end ptr to be
>            subtract by lzcnt aligned.  */
>         leaq    -1(%rdi, %rdx), %rax
> -       vpbroadcastb %esi, %VECMATCH
> +       vpbroadcastb %esi, %VMMMATCH
>
>         /* Check if we can load 1x VEC without cross a page.  */
>         testl   $(PAGE_SIZE - VEC_SIZE), %eax
> @@ -55,7 +55,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
>
>         /* Don't use rax for pointer here because EVEX has better encoding with
>            offset % VEC_SIZE == 0.  */
> -       vpcmpb  $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
> @@ -96,7 +96,7 @@ L(more_1x_vec):
>         movq    %rax, %rdx
>
>         /* Need no matter what.  */
> -       vpcmpb  $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         subq    %rdi, %rdx
> @@ -115,7 +115,7 @@ L(last_2x_vec):
>
>         /* Don't use rax for pointer here because EVEX has better encoding with
>            offset % VEC_SIZE == 0.  */
> -       vpcmpb  $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>         /* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
>         lzcntq  %rcx, %rcx
> @@ -131,7 +131,7 @@ L(last_2x_vec):
>  L(page_cross):
>         movq    %rax, %rsi
>         andq    $-VEC_SIZE, %rsi
> -       vpcmpb  $0, (%rsi), %VECMATCH, %k0
> +       vpcmpb  $0, (%rsi), %VMMMATCH, %k0
>         kmovd   %k0, %r8d
>         /* Shift out negative alignment (because we are starting from endptr and
>            working backwards).  */
> @@ -165,13 +165,13 @@ L(more_2x_vec):
>         testl   %ecx, %ecx
>         jnz     L(ret_vec_x0_dec)
>
> -       vpcmpb  $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>         testl   %ecx, %ecx
>         jnz     L(ret_vec_x1)
>
>         /* Need no matter what.  */
> -       vpcmpb  $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         subq    $(VEC_SIZE * 4), %rdx
> @@ -185,7 +185,7 @@ L(last_vec):
>
>
>         /* Need no matter what.  */
> -       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>         lzcntl  %ecx, %ecx
>         subq    $(VEC_SIZE * 3 + 1), %rax
> @@ -220,7 +220,7 @@ L(more_4x_vec):
>         testl   %ecx, %ecx
>         jnz     L(ret_vec_x2)
>
> -       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         testl   %ecx, %ecx
> @@ -243,17 +243,17 @@ L(more_4x_vec):
>  L(loop_4x_vec):
>         /* Store 1 were not-equals and 0 where equals in k1 (used to mask later
>            on).  */
> -       vpcmpb  $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
> +       vpcmpb  $4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
>
>         /* VEC(2/3) will have zero-byte where we found a CHAR.  */
> -       vpxorq  (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
> -       vpxorq  (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
> -       vpcmpb  $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
> +       vpxorq  (VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
> +       vpxorq  (VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
> +       vpcmpb  $0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
>
>         /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
>            CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
> -       vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z}
> -       vptestnmb %VEC(3), %VEC(3), %k2
> +       vpminub %VMM(2), %VMM(3), %VMM(3){%k1}{z}
> +       vptestnmb %VMM(3), %VMM(3), %k2
>
>         /* Any 1s and we found CHAR.  */
>         kortestd %k2, %k4
> @@ -270,7 +270,7 @@ L(loop_4x_vec):
>  L(last_4x_vec):
>
>         /* Used no matter what.  */
> -       vpcmpb  $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         cmpl    $(VEC_SIZE * 2), %edx
> @@ -280,14 +280,14 @@ L(last_4x_vec):
>         jnz     L(ret_vec_x0_dec)
>
>
> -       vpcmpb  $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         testl   %ecx, %ecx
>         jnz     L(ret_vec_x1)
>
>         /* Used no matter what.  */
> -       vpcmpb  $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
> +       vpcmpb  $0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
>         kmovd   %k0, %ecx
>
>         cmpl    $(VEC_SIZE * 3), %edx
> @@ -309,7 +309,7 @@ L(loop_end):
>         testl   %ecx, %ecx
>         jnz     L(ret_vec_x0_end)
>
> -       vptestnmb %VEC(2), %VEC(2), %k0
> +       vptestnmb %VMM(2), %VMM(2), %k0
>         kmovd   %k0, %ecx
>         testl   %ecx, %ecx
>         jnz     L(ret_vec_x1_end)
> --
> 2.34.1
>

LGTM

--Sunil

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v10 6/6] x86: Update strlen-evex-base to use new reg/vec macros.
  2022-10-15  3:00   ` [PATCH v10 6/6] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
@ 2022-10-15  3:48     ` Sunil Pandey
  0 siblings, 0 replies; 72+ messages in thread
From: Sunil Pandey @ 2022-10-15  3:48 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha

On Fri, Oct 14, 2022 at 8:01 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> To avoid duplicate the VMM / GPR / mask insn macros in all incoming
> evex512 files use the macros defined in 'reg-macros.h' and
> '{vec}-macros.h'
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/strlen-evex-base.S | 116 +++++++-------------
>  sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
>  2 files changed, 44 insertions(+), 76 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> index 418e9f8411..c832b15a48 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> @@ -36,42 +36,10 @@
>  #  define CHAR_SIZE    1
>  # endif
>
> -# define XMM0          xmm16
>  # define PAGE_SIZE     4096
>  # define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
> -# if VEC_SIZE == 64
> -#  define KMOV         kmovq
> -#  define KORTEST      kortestq
> -#  define RAX          rax
> -#  define RCX          rcx
> -#  define RDX          rdx
> -#  define SHR          shrq
> -#  define TEXTSUFFIX   evex512
> -#  define VMM0         zmm16
> -#  define VMM1         zmm17
> -#  define VMM2         zmm18
> -#  define VMM3         zmm19
> -#  define VMM4         zmm20
> -#  define VMOVA                vmovdqa64
> -# elif VEC_SIZE == 32
> -/* Currently Unused.  */
> -#  define KMOV         kmovd
> -#  define KORTEST      kortestd
> -#  define RAX          eax
> -#  define RCX          ecx
> -#  define RDX          edx
> -#  define SHR          shrl
> -#  define TEXTSUFFIX   evex256
> -#  define VMM0         ymm16
> -#  define VMM1         ymm17
> -#  define VMM2         ymm18
> -#  define VMM3         ymm19
> -#  define VMM4         ymm20
> -#  define VMOVA                vmovdqa32
> -# endif
> -
> -       .section .text.TEXTSUFFIX, "ax", @progbits
> +       .section SECTION(.text),"ax",@progbits
>  /* Aligning entry point to 64 byte, provides better performance for
>     one vector length string.  */
>  ENTRY_P2ALIGN (STRLEN, 6)
> @@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
>  # endif
>
>         movl    %edi, %eax
> -       vpxorq  %XMM0, %XMM0, %XMM0
> +       vpxorq  %VMM_128(0), %VMM_128(0), %VMM_128(0)
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         ja      L(page_cross)
>
>         /* Compare [w]char for null, mask bit will be set for match.  */
> -       VPCMP   $0, (%rdi), %VMM0, %k0
> -       KMOV    %k0, %RAX
> -       test    %RAX, %RAX
> +       VPCMP   $0, (%rdi), %VMM(0), %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jz      L(align_more)
>
> -       bsf     %RAX, %RAX
> +       bsf     %VRAX, %VRAX
>  # ifdef USE_AS_STRNLEN
>         cmpq    %rsi, %rax
>         cmovnb  %rsi, %rax
> @@ -120,7 +88,7 @@ L(align_more):
>         movq    %rax, %rdx
>         subq    %rdi, %rdx
>  #  ifdef USE_AS_WCSLEN
> -       SHR     $2, %RDX
> +       shr     $2, %VRDX
>  #  endif
>         /* At this point rdx contains [w]chars already compared.  */
>         subq    %rsi, %rdx
> @@ -131,9 +99,9 @@ L(align_more):
>  # endif
>
>         /* Loop unroll 4 times for 4 vector loop.  */
> -       VPCMP   $0, (%rax), %VMM0, %k0
> -       KMOV    %k0, %RCX
> -       test    %RCX, %RCX
> +       VPCMP   $0, (%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x1)
>
>  # ifdef USE_AS_STRNLEN
> @@ -141,9 +109,9 @@ L(align_more):
>         jbe     L(ret_max)
>  # endif
>
> -       VPCMP   $0, VEC_SIZE(%rax), %VMM0, %k0
> -       KMOV    %k0, %RCX
> -       test    %RCX, %RCX
> +       VPCMP   $0, VEC_SIZE(%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x2)
>
>  # ifdef USE_AS_STRNLEN
> @@ -151,9 +119,9 @@ L(align_more):
>         jbe     L(ret_max)
>  # endif
>
> -       VPCMP   $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
> -       KMOV    %k0, %RCX
> -       test    %RCX, %RCX
> +       VPCMP   $0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x3)
>
>  # ifdef USE_AS_STRNLEN
> @@ -161,9 +129,9 @@ L(align_more):
>         jbe     L(ret_max)
>  # endif
>
> -       VPCMP   $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
> -       KMOV    %k0, %RCX
> -       test    %RCX, %RCX
> +       VPCMP   $0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x4)
>
>  # ifdef USE_AS_STRNLEN
> @@ -179,7 +147,7 @@ L(align_more):
>  # ifdef USE_AS_STRNLEN
>         subq    %rax, %rcx
>  #  ifdef USE_AS_WCSLEN
> -       SHR     $2, %RCX
> +       shr     $2, %VRCX
>  #  endif
>         /* rcx contains number of [w]char will be recompared due to
>            alignment fixes.  rdx must be incremented by rcx to offset
> @@ -199,42 +167,42 @@ L(loop_entry):
>  # endif
>         /* VPMINU and VPCMP combination provide better performance as
>            compared to alternative combinations.  */
> -       VMOVA   (VEC_SIZE * 4)(%rax), %VMM1
> -       VPMINU  (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
> -       VMOVA   (VEC_SIZE * 6)(%rax), %VMM3
> -       VPMINU  (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
> +       VMOVA   (VEC_SIZE * 4)(%rax), %VMM(1)
> +       VPMINU  (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rax), %VMM(3)
> +       VPMINU  (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
>
> -       VPTESTN %VMM2, %VMM2, %k0
> -       VPTESTN %VMM4, %VMM4, %k1
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       VPTESTN %VMM(4), %VMM(4), %k1
>
>         subq    $-(VEC_SIZE * 4), %rax
>         KORTEST %k0, %k1
>         jz      L(loop)
>
> -       VPTESTN %VMM1, %VMM1, %k2
> -       KMOV    %k2, %RCX
> -       test    %RCX, %RCX
> +       VPTESTN %VMM(1), %VMM(1), %k2
> +       KMOV    %k2, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x1)
>
> -       KMOV    %k0, %RCX
> +       KMOV    %k0, %VRCX
>         /* At this point, if k0 is non zero, null char must be in the
>            second vector.  */
> -       test    %RCX, %RCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x2)
>
> -       VPTESTN %VMM3, %VMM3, %k3
> -       KMOV    %k3, %RCX
> -       test    %RCX, %RCX
> +       VPTESTN %VMM(3), %VMM(3), %k3
> +       KMOV    %k3, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(ret_vec_x3)
>         /* At this point null [w]char must be in the fourth vector so no
>            need to check.  */
> -       KMOV    %k1, %RCX
> +       KMOV    %k1, %VRCX
>
>         /* Fourth, third, second vector terminating are pretty much
>            same, implemented this way to avoid branching and reuse code
>            from pre loop exit condition.  */
>  L(ret_vec_x4):
> -       bsf     %RCX, %RCX
> +       bsf     %VRCX, %VRCX
>         subq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
>         subq    $-(VEC_SIZE * 3), %rax
> @@ -250,7 +218,7 @@ L(ret_vec_x4):
>         ret
>
>  L(ret_vec_x3):
> -       bsf     %RCX, %RCX
> +       bsf     %VRCX, %VRCX
>         subq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
>         subq    $-(VEC_SIZE * 2), %rax
> @@ -268,7 +236,7 @@ L(ret_vec_x3):
>  L(ret_vec_x2):
>         subq    $-VEC_SIZE, %rax
>  L(ret_vec_x1):
> -       bsf     %RCX, %RCX
> +       bsf     %VRCX, %VRCX
>         subq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> @@ -289,13 +257,13 @@ L(page_cross):
>         /* ecx contains number of w[char] to be skipped as a result
>            of address alignment.  */
>         xorq    %rdi, %rax
> -       VPCMP   $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
> -       KMOV    %k0, %RAX
> +       VPCMP   $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRAX
>         /* Ignore number of character for alignment adjustment.  */
> -       SHR     %cl, %RAX
> +       shr     %cl, %VRAX
>         jz      L(align_more)
>
> -       bsf     %RAX, %RAX
> +       bsf     %VRAX, %VRAX
>  # ifdef USE_AS_STRNLEN
>         cmpq    %rsi, %rax
>         cmovnb  %rsi, %rax
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> index 116f8981c8..10c3415c8a 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex512.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> @@ -2,6 +2,6 @@
>  # define STRLEN                __strlen_evex512
>  #endif
>
> -#define VEC_SIZE       64
> -
> +#include "x86-evex512-vecs.h"
> +#include "reg-macros.h"
>  #include "strlen-evex-base.S"
> --
> 2.34.1
>

LGTM

--Sunil

^ permalink raw reply	[flat|nested] 72+ messages in thread

end of thread

Thread overview: 72+ messages
2022-10-14 16:40 [PATCH v1 1/3] x86: Update evex256/512 vec macros Noah Goldstein
2022-10-14 16:40 ` [PATCH v1 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
2022-10-14 18:02   ` H.J. Lu
2022-10-14 18:26     ` Noah Goldstein
2022-10-14 18:35       ` H.J. Lu
2022-10-14 18:38         ` Noah Goldstein
2022-10-14 18:53           ` H.J. Lu
2022-10-14 19:00             ` Noah Goldstein
2022-10-14 19:13               ` H.J. Lu
2022-10-14 19:15                 ` Noah Goldstein
2022-10-14 16:40 ` [PATCH v1 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
2022-10-14 17:31 ` [PATCH v1 1/3] x86: Update evex256/512 vec macros H.J. Lu
2022-10-14 18:01 ` [PATCH v2 " Noah Goldstein
2022-10-14 18:01   ` [PATCH v2 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
2022-10-14 18:01   ` [PATCH v2 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
2022-10-14 18:22 ` [PATCH v3 1/3] x86: Update evex256/512 vec macros Noah Goldstein
2022-10-14 18:22   ` [PATCH v3 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
2022-10-14 18:22   ` [PATCH v3 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
2022-10-14 18:41 ` [PATCH v4 1/3] x86: Update evex256/512 vec macros Noah Goldstein
2022-10-14 18:41   ` [PATCH v4 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
2022-10-14 18:41   ` [PATCH v4 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
2022-10-14 21:14 ` [PATCH v5 1/3] x86: Update evex256/512 vec macros Noah Goldstein
2022-10-14 21:15   ` [PATCH v5 2/3] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
2022-10-14 21:28     ` H.J. Lu
2022-10-14 22:01       ` Noah Goldstein
2022-10-14 22:05         ` H.J. Lu
2022-10-14 22:27           ` Noah Goldstein
2022-10-14 22:41             ` H.J. Lu
2022-10-14 23:15               ` Noah Goldstein
2022-10-14 23:22                 ` H.J. Lu
2022-10-14 23:25                   ` Noah Goldstein
2022-10-14 21:15   ` [PATCH v5 3/3] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
2022-10-14 22:39 ` [PATCH v6 1/7] x86: Update and move evex256/512 vec macros Noah Goldstein
2022-10-14 22:39   ` [PATCH v6 2/7] x86: Add macros for GPRs / mask insn based on VEC_SIZE Noah Goldstein
2022-10-14 22:39   ` [PATCH v6 3/7] x86: Update memrchr to use new VEC macros Noah Goldstein
2022-10-14 22:39   ` [PATCH v6 4/7] x86: Remove now unused vec header macros Noah Goldstein
2022-10-14 22:39   ` [PATCH v6 5/7] x86: Update memmove to use new VEC macros Noah Goldstein
2022-10-14 22:39   ` [PATCH v6 6/7] x86: Update memset " Noah Goldstein
2022-10-14 22:39   ` [PATCH v6 7/7] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
2022-10-15  0:06 ` [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls Noah Goldstein
2022-10-15  0:06   ` [PATCH v8 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
2022-10-15  0:06   ` [PATCH v8 3/6] x86: Update memmove " Noah Goldstein
2022-10-15  0:06   ` [PATCH v8 4/6] x86: Update memset " Noah Goldstein
2022-10-15  0:06   ` [PATCH v8 5/6] x86: Remove now unused vec header macros Noah Goldstein
2022-10-15  0:06   ` [PATCH v8 6/6] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
2022-10-15  0:12   ` [PATCH v8 1/6] x86: Update VEC macros to complete API for evex/evex512 impls H.J. Lu
2022-10-15  0:20     ` Noah Goldstein
2022-10-15  0:20 ` [PATCH v9 " Noah Goldstein
2022-10-15  0:20   ` [PATCH v9 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
2022-10-15  2:48     ` H.J. Lu
2022-10-15  0:20   ` [PATCH v9 3/6] x86: Update memmove " Noah Goldstein
2022-10-15  2:52     ` H.J. Lu
2022-10-15  2:57       ` Noah Goldstein
2022-10-15  0:20   ` [PATCH v9 4/6] x86: Update memset " Noah Goldstein
2022-10-15  2:53     ` H.J. Lu
2022-10-15  0:20   ` [PATCH v9 5/6] x86: Remove now unused vec header macros Noah Goldstein
2022-10-15  2:56     ` H.J. Lu
2022-10-15  0:21   ` [PATCH v9 6/6] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
2022-10-15  2:58     ` H.J. Lu
2022-10-15  2:45   ` [PATCH v9 1/6] x86: Update VEC macros to complete API for evex/evex512 impls H.J. Lu
2022-10-15  3:00 ` [PATCH v10 " Noah Goldstein
2022-10-15  3:00   ` [PATCH v10 2/6] x86: Update memrchr to use new VEC macros Noah Goldstein
2022-10-15  3:44     ` Sunil Pandey
2022-10-15  3:00   ` [PATCH v10 3/6] x86: Update memmove " Noah Goldstein
2022-10-15  3:43     ` Sunil Pandey
2022-10-15  3:00   ` [PATCH v10 4/6] x86: Update memset " Noah Goldstein
2022-10-15  3:42     ` Sunil Pandey
2022-10-15  3:00   ` [PATCH v10 5/6] x86: Remove now unused vec header macros Noah Goldstein
2022-10-15  3:39     ` Sunil Pandey
2022-10-15  3:00   ` [PATCH v10 6/6] x86: Update strlen-evex-base to use new reg/vec macros Noah Goldstein
2022-10-15  3:48     ` Sunil Pandey
2022-10-15  3:37   ` [PATCH v10 1/6] x86: Update VEC macros to complete API for evex/evex512 impls Sunil Pandey
