public inbox for libc-alpha@sourceware.org
* [PATCH v1] x86: Fix type of `Slow_SSE4_2` def in isa-level.h
@ 2022-07-18 10:38 Noah Goldstein
  2022-07-18 10:38 ` [PATCH v1] x86: Continue building memmove-ssse3.S as ISA level V3 Noah Goldstein
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Noah Goldstein @ 2022-07-18 10:38 UTC (permalink / raw)
  To: libc-alpha

Change from `Slow_SSE42_X86_ISA_LEVEL` to
`Slow_SSE4_2_X86_ISA_LEVEL`. The define is currently unused, so
nothing else needs to change.
---
 sysdeps/x86/isa-level.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index 3c4480aba7..fe56af7e2b 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -104,7 +104,7 @@
 
 /* NB: This feature is disable when ISA level >= 3.  All CPUs with
    this feature don't run on glibc built with ISA level >= 3.  */
-#define Slow_SSE42_X86_ISA_LEVEL 3
+#define Slow_SSE4_2_X86_ISA_LEVEL 3
 
 /* Feature(s) enabled when ISA level >= 2.  */
 #define Fast_Unaligned_Load_X86_ISA_LEVEL 2
-- 
2.34.1



* [PATCH v1] x86: Continue building memmove-ssse3.S as ISA level V3
  2022-07-18 10:38 [PATCH v1] x86: Fix type of `Slow_SSE4_2` def in isa-level.h Noah Goldstein
@ 2022-07-18 10:38 ` Noah Goldstein
  2022-07-18 11:54 ` [PATCH v2] x86: Fix typo of `Slow_SSE4_2` def in isa-level.h Noah Goldstein
  2022-07-18 13:25 ` [PATCH v3 1/2] " Noah Goldstein
  2 siblings, 0 replies; 5+ messages in thread
From: Noah Goldstein @ 2022-07-18 10:38 UTC (permalink / raw)
  To: libc-alpha

Some V3 processors still strongly prefer memmove-ssse3.S because it is
heavily optimized to avoid unaligned memory accesses.

Tested builds for x86-64 v1, v2, v3, and v4 with and without
multiarch.
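
For context (not part of the patch itself): the `*_X86_ISA_LEVEL`
defines added in isa-level.h below work by letting a feature check fold
to a compile-time constant once the build-time ISA level reaches the
named level, while staying a runtime cpu_features lookup below it.  The
following is a minimal, self-contained C sketch of that idea, assuming
a v3 build; MINIMUM_X86_ISA_LEVEL mirrors the real isa-level.h macro,
but struct cpu_features and the helper functions here are simplified
placeholders, not the actual glibc definitions.

/* Sketch only: shows how a Feature_X86_ISA_LEVEL value gates whether a
   check is resolved at build time or at runtime.  */
#include <stdbool.h>
#include <stdio.h>

#ifndef MINIMUM_X86_ISA_LEVEL
# define MINIMUM_X86_ISA_LEVEL 3	/* e.g. glibc built for x86-64-v3.  */
#endif

/* Mirrors V4_AVX_Fast_Unaligned_Load_X86_ISA_LEVEL == 4 from the patch:
   since 4 > 3, a v3 build keeps the runtime check alive.  */
#define V4_AVX_FAST_UNALIGNED_LOAD_ISA_LEVEL 4

struct cpu_features { bool avx_fast_unaligned_load; };

/* Placeholder for the runtime CPU_FEATURES_ARCH_P lookup.  */
static bool
has_avx_fast_unaligned_load (const struct cpu_features *cf)
{
  return cf->avx_fast_unaligned_load;
}

static bool
use_avx_unaligned_path (const struct cpu_features *cf)
{
  /* Folds to a constant true only for builds at or above level 4;
     below that the branch compiles to a real runtime check.  */
  if (MINIMUM_X86_ISA_LEVEL >= V4_AVX_FAST_UNALIGNED_LOAD_ISA_LEVEL)
    return true;
  return has_avx_fast_unaligned_load (cf);
}

int
main (void)
{
  struct cpu_features cf = { .avx_fast_unaligned_load = false };
  printf ("use AVX unaligned path: %d\n", use_avx_unaligned_path (&cf));
  return 0;
}

With this pattern, aliasing a feature under a higher level (as the
patch does for V4_AVX_Fast_Unaligned_Load and V3_Fast_Unaligned_Copy)
is enough to keep the runtime dispatch available to builds done at the
lower level.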
---
 sysdeps/x86/isa-level.h                    | 15 +++++++++++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 30 +++++++++++++---------
 sysdeps/x86_64/multiarch/ifunc-memmove.h   | 14 +++++-----
 sysdeps/x86_64/multiarch/memmove-ssse3.S   |  4 ++-
 4 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index fe56af7e2b..f49336acf3 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -90,6 +90,14 @@
 
 /* For X86_ISA_CPU_FEATURES_ARCH_P.  */
 
+
+/* NB: This is just an alias to `AVX_Fast_Unaligned_Load` that will
+   continue doing runtime check up to ISA level >= 4.  This is for
+   some Zhaoxin CPUs which build at ISA level V3 but still have a
+   strong preference for avoiding unaligned `ymm` loads.  */
+#define V4_AVX_Fast_Unaligned_Load_X86_ISA_LEVEL 4
+#define V4_AVX_Fast_Unaligned_Load AVX_Fast_Unaligned_Load
+
 /* NB: This feature is enabled when ISA level >= 3, which was disabled
    for the following CPUs:
         - AMD Excavator
@@ -106,6 +114,13 @@
    this feature don't run on glibc built with ISA level >= 3.  */
 #define Slow_SSE4_2_X86_ISA_LEVEL 3
 
+/* NB: This is just an alias to `Fast_Unaligned_Copy` that will
+   continue doing runtime check up to ISA level >= 3.  This is for
+   some Zhaoxin CPUs which build at ISA level V3 but still have a
+   strong preference for avoiding unaligned `ymm` loads.  */
+#define V3_Fast_Unaligned_Copy_X86_ISA_LEVEL 3
+#define V3_Fast_Unaligned_Copy Fast_Unaligned_Copy
+
 /* Feature(s) enabled when ISA level >= 2.  */
 #define Fast_Unaligned_Load_X86_ISA_LEVEL 2
 
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a71444eccb..427f127427 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -143,8 +143,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memmove_chk_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, __memmove_chk,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads so keep SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __memmove_chk_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -190,8 +191,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memmove_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, memmove,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads so keep SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __memmove_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1004,8 +1006,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memcpy_chk_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, __memcpy_chk,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads so keep SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __memcpy_chk_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1051,8 +1054,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memcpy_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, memcpy,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads so keep SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __memcpy_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1098,8 +1102,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __mempcpy_chk_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, __mempcpy_chk,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads so keep SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __mempcpy_chk_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1145,8 +1150,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __mempcpy_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, mempcpy,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads so keep SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __mempcpy_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 1643d32887..be0c758783 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -72,7 +72,7 @@ IFUNC_SELECTOR (void)
     }
 
   if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
-				   AVX_Fast_Unaligned_Load, ))
+				   V4_AVX_Fast_Unaligned_Load, ))
     {
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
 	{
@@ -101,11 +101,13 @@ IFUNC_SELECTOR (void)
     }
 
   if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
-      /* Leave this as runtime check.  The SSSE3 is optimized almost
-         exclusively for avoiding unaligned memory access during the
-         copy and by and large is not better than the sse2
-         implementation as a general purpose memmove.  */
-      && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
+      /* Leave this as runtime check for V2.  By V3 assume it must be
+	     set.  The SSSE3 is optimized almost exclusively for avoiding
+	     unaligned memory access during the copy and by and large is
+	     not better than the sse2 implementation as a general purpose
+	     memmove. */
+      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				      V3_Fast_Unaligned_Copy, !))
     {
       return OPTIMIZE (ssse3);
     }
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 57599752c7..15cafee766 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -20,7 +20,9 @@
 
 #include <isa-level.h>
 
-#if ISA_SHOULD_BUILD (2)
+/* Continue building up to ISA level V3 as some V3 CPUs strongly
+   prefer this implementation.  */
+#if ISA_SHOULD_BUILD (3)
 
 # include <sysdep.h>
 # ifndef MEMMOVE
-- 
2.34.1



* [PATCH v2] x86: Fix typo of `Slow_SSE4_2` def in isa-level.h
  2022-07-18 10:38 [PATCH v1] x86: Fix type of `Slow_SSE4_2` def in isa-level.h Noah Goldstein
  2022-07-18 10:38 ` [PATCH v1] x86: Continue building memmove-ssse3.S as ISA level V3 Noah Goldstein
@ 2022-07-18 11:54 ` Noah Goldstein
  2022-07-18 13:25 ` [PATCH v3 1/2] " Noah Goldstein
  2 siblings, 0 replies; 5+ messages in thread
From: Noah Goldstein @ 2022-07-18 11:54 UTC (permalink / raw)
  To: libc-alpha

Change from `Slow_SSE42_X86_ISA_LEVEL` to
`Slow_SSE4_2_X86_ISA_LEVEL`. The define is currently unused, so
nothing else needs to change.
---
 sysdeps/x86/isa-level.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index 3c4480aba7..fe56af7e2b 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -104,7 +104,7 @@
 
 /* NB: This feature is disable when ISA level >= 3.  All CPUs with
    this feature don't run on glibc built with ISA level >= 3.  */
-#define Slow_SSE42_X86_ISA_LEVEL 3
+#define Slow_SSE4_2_X86_ISA_LEVEL 3
 
 /* Feature(s) enabled when ISA level >= 2.  */
 #define Fast_Unaligned_Load_X86_ISA_LEVEL 2
-- 
2.34.1



* [PATCH v3 1/2] x86: Fix typo of `Slow_SSE4_2` def in isa-level.h
  2022-07-18 10:38 [PATCH v1] x86: Fix type of `Slow_SSE4_2` def in isa-level.h Noah Goldstein
  2022-07-18 10:38 ` [PATCH v1] x86: Continue building memmove-ssse3.S as ISA level V3 Noah Goldstein
  2022-07-18 11:54 ` [PATCH v2] x86: Fix typo of `Slow_SSE4_2` def in isa-level.h Noah Goldstein
@ 2022-07-18 13:25 ` Noah Goldstein
  2022-07-18 13:25   ` [PATCH v3 2/2] x86: Continue building memmove-ssse3.S as ISA level V3 Noah Goldstein
  2 siblings, 1 reply; 5+ messages in thread
From: Noah Goldstein @ 2022-07-18 13:25 UTC (permalink / raw)
  To: libc-alpha

Change from `Slow_SSE42_X86_ISA_LEVEL` to
`Slow_SSE4_2_X86_ISA_LEVEL`. The define is currently unused, so
nothing else needs to change.
---
 sysdeps/x86/isa-level.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index 3c4480aba7..fe56af7e2b 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -104,7 +104,7 @@
 
 /* NB: This feature is disable when ISA level >= 3.  All CPUs with
    this feature don't run on glibc built with ISA level >= 3.  */
-#define Slow_SSE42_X86_ISA_LEVEL 3
+#define Slow_SSE4_2_X86_ISA_LEVEL 3
 
 /* Feature(s) enabled when ISA level >= 2.  */
 #define Fast_Unaligned_Load_X86_ISA_LEVEL 2
-- 
2.34.1



* [PATCH v3 2/2] x86: Continue building memmove-ssse3.S as ISA level V3
  2022-07-18 13:25 ` [PATCH v3 1/2] " Noah Goldstein
@ 2022-07-18 13:25   ` Noah Goldstein
  0 siblings, 0 replies; 5+ messages in thread
From: Noah Goldstein @ 2022-07-18 13:25 UTC (permalink / raw)
  To: libc-alpha

Some V3 processors still strongly prefer memmove-ssse3.S because it is
heavily optimized to avoid unaligned memory accesses.

Tested builds for x86-64 v1, v2, v3, and v4 with and without
multiarch.
---
 sysdeps/x86/isa-level.h                    | 15 +++++++++++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 30 +++++++++++++---------
 sysdeps/x86_64/multiarch/ifunc-memmove.h   | 14 +++++-----
 sysdeps/x86_64/multiarch/memmove-ssse3.S   |  4 ++-
 4 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index fe56af7e2b..f49336acf3 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -90,6 +90,14 @@
 
 /* For X86_ISA_CPU_FEATURES_ARCH_P.  */
 
+
+/* NB: This is just an alias to `AVX_Fast_Unaligned_Load` that will
+   continue doing runtime check up to ISA level >= 4.  This is for
+   some Zhaoxin CPUs which build at ISA level V3 but still have a
+   strong preference for avoiding unaligned `ymm` loads.  */
+#define V4_AVX_Fast_Unaligned_Load_X86_ISA_LEVEL 4
+#define V4_AVX_Fast_Unaligned_Load AVX_Fast_Unaligned_Load
+
 /* NB: This feature is enabled when ISA level >= 3, which was disabled
    for the following CPUs:
         - AMD Excavator
@@ -106,6 +114,13 @@
    this feature don't run on glibc built with ISA level >= 3.  */
 #define Slow_SSE4_2_X86_ISA_LEVEL 3
 
+/* NB: This is just an alias to `Fast_Unaligned_Copy` that will
+   continue doing runtime check up to ISA level >= 3.  This is for
+   some Zhaoxin CPUs which build at ISA level V3 but still have a
+   strong preference for avoiding unaligned `ymm` loads.  */
+#define V3_Fast_Unaligned_Copy_X86_ISA_LEVEL 3
+#define V3_Fast_Unaligned_Copy Fast_Unaligned_Copy
+
 /* Feature(s) enabled when ISA level >= 2.  */
 #define Fast_Unaligned_Load_X86_ISA_LEVEL 2
 
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a71444eccb..427f127427 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -143,8 +143,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memmove_chk_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, __memmove_chk,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads so keep SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __memmove_chk_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -190,8 +191,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memmove_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, memmove,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads so keep SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __memmove_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1004,8 +1006,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memcpy_chk_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, __memcpy_chk,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads so keep SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __memcpy_chk_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1051,8 +1054,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memcpy_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, memcpy,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads so keep SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __memcpy_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1098,8 +1102,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __mempcpy_chk_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, __mempcpy_chk,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads so keep SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __mempcpy_chk_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1145,8 +1150,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __mempcpy_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, mempcpy,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads so keep SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __mempcpy_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 1643d32887..be0c758783 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -72,7 +72,7 @@ IFUNC_SELECTOR (void)
     }
 
   if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
-				   AVX_Fast_Unaligned_Load, ))
+				   V4_AVX_Fast_Unaligned_Load, ))
     {
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
 	{
@@ -101,11 +101,13 @@ IFUNC_SELECTOR (void)
     }
 
   if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
-      /* Leave this as runtime check.  The SSSE3 is optimized almost
-         exclusively for avoiding unaligned memory access during the
-         copy and by and large is not better than the sse2
-         implementation as a general purpose memmove.  */
-      && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
+      /* Leave this as runtime check for V2.  By V3 assume it must be
+	     set.  The SSSE3 is optimized almost exclusively for avoiding
+	     unaligned memory access during the copy and by and large is
+	     not better than the sse2 implementation as a general purpose
+	     memmove. */
+      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				      V3_Fast_Unaligned_Copy, !))
     {
       return OPTIMIZE (ssse3);
     }
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 57599752c7..15cafee766 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -20,7 +20,9 @@
 
 #include <isa-level.h>
 
-#if ISA_SHOULD_BUILD (2)
+/* Continue building up to ISA level V3 as some V3 CPUs strongly
+   prefer this implementation.  */
+#if ISA_SHOULD_BUILD (3)
 
 # include <sysdep.h>
 # ifndef MEMMOVE
-- 
2.34.1



Thread overview: 5+ messages
2022-07-18 10:38 [PATCH v1] x86: Fix type of `Slow_SSE4_2` def in isa-level.h Noah Goldstein
2022-07-18 10:38 ` [PATCH v1] x86: Continue building memmove-ssse3.S as ISA level V3 Noah Goldstein
2022-07-18 11:54 ` [PATCH v2] x86: Fix typo of `Slow_SSE4_2` def in isa-level.h Noah Goldstein
2022-07-18 13:25 ` [PATCH v3 1/2] " Noah Goldstein
2022-07-18 13:25   ` [PATCH v3 2/2] x86: Continue building memmove-ssse3.S as ISA level V3 Noah Goldstein
