public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] x86-64: Only define used SSE/AVX/AVX512 run-time resolvers
@ 2022-06-27 19:06 H.J. Lu
       [not found] ` <CAFUsyfKk285bCRRuNZ4qtXE93XuUjFC4Z64pnwUDpZHnfYPnkw@mail.gmail.com>
  0 siblings, 1 reply; 10+ messages in thread
From: H.J. Lu @ 2022-06-27 19:06 UTC (permalink / raw)
  To: libc-alpha

When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
are unused.

1. Move X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P to
isa-level.h.
2. Check the minimum x86-64 ISA level to exclude the unused run-time
resolvers.
---
 sysdeps/x86/isa-ifunc-macros.h | 27 ----------------
 sysdeps/x86/isa-level.h        | 26 +++++++++++++++
 sysdeps/x86_64/dl-machine.h    | 12 ++++---
 sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
 4 files changed, 66 insertions(+), 58 deletions(-)

diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
index d69905689b..f967a1bec6 100644
--- a/sysdeps/x86/isa-ifunc-macros.h
+++ b/sysdeps/x86/isa-ifunc-macros.h
@@ -56,31 +56,4 @@
 # define X86_IFUNC_IMPL_ADD_V1(...)
 #endif
 
-/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
-   macros are wrappers for the the respective
-   CPU_FEATURE{S}_{USABLE|ARCH}_P runtime checks.  They differ in two
-   ways.
-
-    1.  The USABLE_P version is evaluated to true when the feature
-        is enabled.
-
-    2.  The ARCH_P version has a third argument `not`.  The `not`
-        argument can either be '!' or empty.  If the feature is
-        enabled above an ISA level, the third argument should be empty
-        and the expression is evaluated to true when the feature is
-        enabled.  If the feature is disabled above an ISA level, the
-        third argument should be `!` and the expression is evaluated
-        to true when the feature is disabled.
- */
-
-#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
-  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
-   || CPU_FEATURE_USABLE_P (ptr, name))
-
-
-#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
-  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
-   || not CPU_FEATURES_ARCH_P (ptr, name))
-
-
 #endif
diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index 075e7c6ee1..f293aea906 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -68,10 +68,12 @@
    compile-time constant.. */
 
 /* ISA level >= 4 guaranteed includes.  */
+#define AVX512F_X86_ISA_LEVEL 4
 #define AVX512VL_X86_ISA_LEVEL 4
 #define AVX512BW_X86_ISA_LEVEL 4
 
 /* ISA level >= 3 guaranteed includes.  */
+#define AVX_X86_ISA_LEVEL 3
 #define AVX2_X86_ISA_LEVEL 3
 #define BMI2_X86_ISA_LEVEL 3
 
@@ -87,6 +89,30 @@
    when ISA level < 3.  */
 #define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
 
+/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
+   macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
+   runtime checks.  They differ in two ways.
+
+    1.  The USABLE_P version is evaluated to true when the feature
+        is enabled.
+
+    2.  The ARCH_P version has a third argument `not`.  The `not`
+        argument can either be `!` or empty.  If the feature is
+        enabled above an ISA level, the third argument should be empty
+        and the expression is evaluated to true when the feature is
+        enabled.  If the feature is disabled above an ISA level, the
+        third argument should be `!` and the expression is evaluated
+        to true when the feature is disabled.
+ */
+
+#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
+  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
+   || CPU_FEATURE_USABLE_P (ptr, name))
+
+#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
+  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
+   || not CPU_FEATURES_ARCH_P (ptr, name))
+
 #define ISA_SHOULD_BUILD(isa_build_level)                              \
   (MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc))         \
    || defined ISA_DEFAULT_IMPL
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 34766325ae..005d089501 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -28,6 +28,7 @@
 #include <dl-tlsdesc.h>
 #include <dl-static-tls.h>
 #include <dl-machine-rel.h>
+#include <isa-level.h>
 
 /* Return nonzero iff ELF header is compatible with the running host.  */
 static inline int __attribute__ ((unused))
@@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
       /* Identify this shared object.  */
       *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
 
+      const struct cpu_features* cpu_features = __get_cpu_features ();
+
       /* The got[2] entry contains the address of a function which gets
 	 called to get the address of a so far unresolved function and
 	 jump to it.  The profiling extension of the dynamic linker allows
@@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	 end in this function.  */
       if (__glibc_unlikely (profile))
 	{
-	  if (CPU_FEATURE_USABLE (AVX512F))
+	  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
-	  else if (CPU_FEATURE_USABLE (AVX))
+	  else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
 	  else
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
@@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	  /* This function will get called to fix up the GOT entry
 	     indicated by the offset on the stack, and then jump to
 	     the resolved address.  */
-	  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
+	  if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
+	      || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
 	    *(ElfW(Addr) *) (got + 2)
-	      = (CPU_FEATURE_USABLE (XSAVEC)
+	      = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
 		 ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
 		 : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
 	  else
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 831a654713..f669805ac5 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -20,6 +20,7 @@
 #include <sysdep.h>
 #include <cpu-features-offsets.h>
 #include <link-defines.h>
+#include <isa-level.h>
 
 #ifndef DL_STACK_ALIGNMENT
 /* Due to GCC bug:
@@ -62,35 +63,39 @@
 #undef VMOVA
 #undef VEC_SIZE
 
-#define VEC_SIZE		32
-#define VMOVA			vmovdqa
-#define VEC(i)			ymm##i
-#define _dl_runtime_profile	_dl_runtime_profile_avx
-#include "dl-trampoline.h"
-#undef _dl_runtime_profile
-#undef VEC
-#undef VMOVA
-#undef VEC_SIZE
+#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
+# define VEC_SIZE		32
+# define VMOVA			vmovdqa
+# define VEC(i)			ymm##i
+# define _dl_runtime_profile	_dl_runtime_profile_avx
+# include "dl-trampoline.h"
+# undef _dl_runtime_profile
+# undef VEC
+# undef VMOVA
+# undef VEC_SIZE
+#endif
 
+#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
 /* movaps/movups is 1-byte shorter.  */
-#define VEC_SIZE		16
-#define VMOVA			movaps
-#define VEC(i)			xmm##i
-#define _dl_runtime_profile	_dl_runtime_profile_sse
-#undef RESTORE_AVX
-#include "dl-trampoline.h"
-#undef _dl_runtime_profile
-#undef VEC
-#undef VMOVA
-#undef VEC_SIZE
-
-#define USE_FXSAVE
-#define STATE_SAVE_ALIGNMENT	16
-#define _dl_runtime_resolve	_dl_runtime_resolve_fxsave
-#include "dl-trampoline.h"
-#undef _dl_runtime_resolve
-#undef USE_FXSAVE
-#undef STATE_SAVE_ALIGNMENT
+# define VEC_SIZE		16
+# define VMOVA			movaps
+# define VEC(i)			xmm##i
+# define _dl_runtime_profile	_dl_runtime_profile_sse
+# undef RESTORE_AVX
+# include "dl-trampoline.h"
+# undef _dl_runtime_profile
+# undef VEC
+# undef VMOVA
+# undef VEC_SIZE
+
+# define USE_FXSAVE
+# define STATE_SAVE_ALIGNMENT	16
+# define _dl_runtime_resolve	_dl_runtime_resolve_fxsave
+# include "dl-trampoline.h"
+# undef _dl_runtime_resolve
+# undef USE_FXSAVE
+# undef STATE_SAVE_ALIGNMENT
+#endif
 
 #define USE_XSAVE
 #define STATE_SAVE_ALIGNMENT	64
-- 
2.36.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2022-06-27 20:46 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-27 19:06 [PATCH] x86-64: Only define used SSE/AVX/AVX512 run-time resolvers H.J. Lu
     [not found] ` <CAFUsyfKk285bCRRuNZ4qtXE93XuUjFC4Z64pnwUDpZHnfYPnkw@mail.gmail.com>
2022-06-27 19:43   ` H.J. Lu
2022-06-27 19:51     ` Noah Goldstein
2022-06-27 20:00       ` H.J. Lu
2022-06-27 20:09         ` Noah Goldstein
2022-06-27 20:19           ` H.J. Lu
2022-06-27 20:25             ` Noah Goldstein
2022-06-27 20:45               ` H.J. Lu
     [not found]   ` <CAFUsyfJKQgYQtSov51En4ASWqPCa9NLvo2jHymFC3BnFaap91g@mail.gmail.com>
2022-06-27 19:29     ` Noah Goldstein
2022-06-27 19:44     ` H.J. Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).