From: Noah Goldstein <goldstein.w.n@gmail.com>
To: libc-alpha@sourceware.org
Cc: goldstein.w.n@gmail.com, hjl.tools@gmail.com,
andrey.kolesov@intel.com, carlos@systemhalted.org
Subject: [PATCH v1 11/27] x86/fpu: Optimize svml_s_atanf16_core_avx512.S
Date: Wed, 7 Dec 2022 00:52:20 -0800 [thread overview]
Message-ID: <20221207085236.1424424-11-goldstein.w.n@gmail.com> (raw)
In-Reply-To: <20221207085236.1424424-1-goldstein.w.n@gmail.com>
1. Change the algorithm to match the one used by the avx2
implementation, which seems to be faster.
2. Clean up some missed optimizations in instruction selection /
unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.
Changing the algorithm (1) causes a slight ULP error increase (exact
same as the avx2 version).
Before:
ulp:
0: 4127324924 (0.9610)
1: 167635550 (0.0390)
2: 6822 (0.0000)
3: 0 (0.0000)
4: 0 (0.0000)
After:
ulp:
0: 4088299128 (0.9519)
1: 206531674 (0.0481)
2: 136494 (0.0000)
3: 0 (0.0000)
4: 0 (0.0000)
Since the max ULP is the same and the distribution matches the avx2
implementation, this seems like an acceptable "regression", as it
doesn't seem plausible that any application could have been relying
on the precision distribution.
Code Size Change: -79 Bytes (193 - 272)
Perf Changes:
Input New Time / Old Time
0F (0x00000000) -> 0.7612
0F (0x0000ffff, Denorm) -> 1.3234
.1F (0x3dcccccd) -> 0.7690
5F (0x40a00000) -> 0.7752
2315255808F (0x4f0a0000) -> 0.7712
-NaN (0xffffffff) -> 0.7824
Note that the ~32% regression in the denorm case is caused by
additional microcode assists (a result of the algorithm change).
This generally seems worth it for the ~23-24% perf improvement
in other cases, as denormal inputs are almost certainly cold cases.
---
.../multiarch/svml_s_atanf16_core_avx512.S | 199 ++++++------------
1 file changed, 67 insertions(+), 132 deletions(-)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
index 88b44a989c..abb3c76209 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
@@ -28,146 +28,81 @@
*
*/
-/* Offsets for data table __svml_satan_data_internal_avx512
- */
-#define AbsMask 0
-#define Shifter 64
-#define MaxThreshold 128
-#define MOne 192
-#define One 256
-#define LargeX 320
-#define Zero 384
-#define Tbl_H 448
-#define Pi2 576
-#define coeff_1 640
-#define coeff_2 704
-#define coeff_3 768
+#define LOCAL_DATA_NAME __svml_satan_data_internal
+#include "svml_s_common_evex512_rodata_offsets.h"
+/* Offsets for data table __svml_satan_data_internal. */
+#define _sPC8 0
+#define _sPC7 64
+#define _sPC6 128
+#define _sPC5 192
+#define _sPC4 256
+#define _sPC3 320
+#define _sPC2 384
+#define _sPC1 448
+#define _sPIO2 512
#include <sysdep.h>
.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_atanf_skx)
- vandps __svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
- vmovups MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
- vmovups One+__svml_satan_data_internal_avx512(%rip), %zmm8
-
- /* round to 2 bits after binary point */
- vreduceps $40, {sae}, %zmm7, %zmm5
-
- /* saturate X range */
- vmovups LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
- vmovups Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
- vcmpps $29, {sae}, %zmm3, %zmm7, %k1
-
- /* table lookup sequence */
- vmovups Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
- vsubps {rn-sae}, %zmm5, %zmm7, %zmm4
- vaddps {rn-sae}, %zmm2, %zmm7, %zmm1
- vxorps %zmm0, %zmm7, %zmm0
- vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
- vmovups coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
-
- /* if|X|>=MaxThreshold, set DiffX=-1 */
- vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
- vmovups coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
-
- /* if|X|>=MaxThreshold, set Y=X */
- vminps {sae}, %zmm7, %zmm6, %zmm8{%k1}
-
- /* R+Rl = DiffX/Y */
- vgetmantps $0, {sae}, %zmm9, %zmm12
- vgetexpps {sae}, %zmm9, %zmm10
- vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
- vgetmantps $0, {sae}, %zmm8, %zmm15
- vgetexpps {sae}, %zmm8, %zmm11
- vmovups coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
-
- /* set table value to Pi/2 for large X */
- vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
- vrcp14ps %zmm15, %zmm13
- vsubps {rn-sae}, %zmm11, %zmm10, %zmm2
- vmulps {rn-sae}, %zmm13, %zmm12, %zmm14
- vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
- vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
- vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7
-
- /* polynomial evaluation */
- vmulps {rn-sae}, %zmm7, %zmm7, %zmm8
- vmulps {rn-sae}, %zmm7, %zmm8, %zmm6
- vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
- vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
- vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
- vaddps {rn-sae}, %zmm9, %zmm8, %zmm10
- vxorps %zmm0, %zmm10, %zmm0
+ /* 1) If x>1, then r=-1/x, PIO2=Pi/2
+ 2) If -1<=x<=1, then r=x, PIO2=0
+ 3) If x<-1, then r=-1/x, PIO2=-Pi/2. */
+ vmovups COMMON_DATA(_OneF)(%rip), %zmm2
+ vmovups COMMON_DATA(_SignMask)(%rip), %zmm7
+
+
+ /* Use minud\maxud operations for argument reduction. */
+ vandnps %zmm0, %zmm7, %zmm3
+ vpcmpgtd %zmm2, %zmm3, %k1
+
+ vpmaxud %zmm3, %zmm2, %zmm4
+ vpminud %zmm3, %zmm2, %zmm5
+
+ vdivps %zmm4, %zmm5, %zmm4
+
+ vandps %zmm7, %zmm0, %zmm3
+ vmovdqa32 %zmm7, %zmm7{%k1}{z}
+
+ vmulps %zmm4, %zmm4, %zmm1
+ vpternlogq $0x96, %zmm3, %zmm4, %zmm7
+
+ /* Polynomial. */
+
+ vmovups LOCAL_DATA(_sPC8)(%rip), %zmm0
+ vmovups LOCAL_DATA(_sPC7)(%rip), %zmm4
+
+ vmulps %zmm1, %zmm1, %zmm5
+
+ vfmadd213ps LOCAL_DATA(_sPC6)(%rip), %zmm5, %zmm0
+ vfmadd213ps LOCAL_DATA(_sPC5)(%rip), %zmm5, %zmm4
+ vfmadd213ps LOCAL_DATA(_sPC4)(%rip), %zmm5, %zmm0
+ vfmadd213ps LOCAL_DATA(_sPC3)(%rip), %zmm5, %zmm4
+ vfmadd213ps LOCAL_DATA(_sPC2)(%rip), %zmm5, %zmm0
+ vfmadd213ps LOCAL_DATA(_sPC1)(%rip), %zmm5, %zmm4
+ vfmadd213ps %zmm4, %zmm1, %zmm0
+ vfmadd213ps %zmm2, %zmm1, %zmm0
+ vorps LOCAL_DATA(_sPIO2)(%rip), %zmm3, %zmm3{%k1}
+
+ /* Reconstruction. */
+ vfmadd213ps %zmm3, %zmm7, %zmm0
ret
END(_ZGVeN16v_atanf_skx)
- .section .rodata, "a"
+ .section .rodata.evex512, "a"
.align 64
-#ifdef __svml_satan_data_internal_avx512_typedef
-typedef unsigned int VUINT32;
-typedef struct {
- __declspec(align(64)) VUINT32 AbsMask[16][1];
- __declspec(align(64)) VUINT32 Shifter[16][1];
- __declspec(align(64)) VUINT32 MaxThreshold[16][1];
- __declspec(align(64)) VUINT32 MOne[16][1];
- __declspec(align(64)) VUINT32 One[16][1];
- __declspec(align(64)) VUINT32 LargeX[16][1];
- __declspec(align(64)) VUINT32 Zero[16][1];
- __declspec(align(64)) VUINT32 Tbl_H[32][1];
- __declspec(align(64)) VUINT32 Pi2[16][1];
- __declspec(align(64)) VUINT32 coeff[3][16][1];
-} __svml_satan_data_internal_avx512;
-#endif
-__svml_satan_data_internal_avx512:
- /* AbsMask */
- .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
- /* Shifter */
- .align 64
- .long 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
- /* MaxThreshold */
- .align 64
- .long 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
- /* MOne */
- .align 64
- .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
- /* One */
- .align 64
- .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
- /* LargeX */
- .align 64
- .long 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
- /* Zero */
- .align 64
- .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
- /* Tbl_H */
- .align 64
- .long 0x00000000, 0x3e7adbb0
- .long 0x3eed6338, 0x3f24bc7d
- .long 0x3f490fdb, 0x3f6563e3
- .long 0x3f7b985f, 0x3f869c79
- .long 0x3f8db70d, 0x3f93877b
- .long 0x3f985b6c, 0x3f9c6b53
- .long 0x3f9fe0bb, 0x3fa2daa4
- .long 0x3fa57088, 0x3fa7b46f
- .long 0x3fa9b465, 0x3fab7b7a
- .long 0x3fad1283, 0x3fae809e
- .long 0x3fafcb99, 0x3fb0f836
- .long 0x3fb20a6a, 0x3fb30581
- .long 0x3fb3ec43, 0x3fb4c10a
- .long 0x3fb585d7, 0x3fb63c64
- .long 0x3fb6e62c, 0x3fb78478
- .long 0x3fb81868, 0x3fb8a2f5
- /* Pi2 */
- .align 64
- .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
- /* coeff3 */
- .align 64
- .long 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
- .long 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
- .long 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
- .align 64
- .type __svml_satan_data_internal_avx512, @object
- .size __svml_satan_data_internal_avx512, .-__svml_satan_data_internal_avx512
+LOCAL_DATA_NAME:
+ DATA_VEC (LOCAL_DATA_NAME, _sPC8, 0x3B322CC0)
+ DATA_VEC (LOCAL_DATA_NAME, _sPC7, 0xBC7F2631)
+ DATA_VEC (LOCAL_DATA_NAME, _sPC6, 0x3D2BC384)
+ DATA_VEC (LOCAL_DATA_NAME, _sPC5, 0xBD987629)
+ DATA_VEC (LOCAL_DATA_NAME, _sPC4, 0x3DD96474)
+ DATA_VEC (LOCAL_DATA_NAME, _sPC3, 0xBE1161F8)
+ DATA_VEC (LOCAL_DATA_NAME, _sPC2, 0x3E4CB79F)
+ DATA_VEC (LOCAL_DATA_NAME, _sPC1, 0xBEAAAA49)
+ DATA_VEC (LOCAL_DATA_NAME, _sPIO2, 0x3FC90FDB)
+
+ .type LOCAL_DATA_NAME, @object
+ .size LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
--
2.34.1
next prev parent reply other threads:[~2022-12-07 8:53 UTC|newest]
Thread overview: 38+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-12-07 8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 02/27] x86/fpu: Add file for common data used across svml_s_*_avx2.S files Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 03/27] x86/fpu: Add file for common data used across svml_s_*_avx512.S files Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 04/27] x86/fpu: Add file for common data used across svml_s_*_sse4.S files Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 05/27] x86/fpu: Build common data files for svml_s_*_{avx512,avx2,sse4}.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 06/27] x86/fpu: Update rodata usage in svml_s_tanhf_*_{avx2,sse4} Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S Noah Goldstein
2022-12-16 17:05 ` H.J. Lu
2022-12-16 18:17 ` Noah Goldstein
2022-12-16 21:37 ` H.J. Lu
2022-12-16 21:51 ` Noah Goldstein
2022-12-16 22:01 ` H.J. Lu
2022-12-16 22:54 ` Sunil Pandey
2023-06-27 18:23 ` Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 08/27] x86/fpu: Update rodata usage in svml_s_atanhf16_core_avx512.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 09/27] x86/fpu: Update rodata usage in svml_s_atanhf4_core_sse4.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 10/27] x86/fpu: Update rodata usage in svml_s_atanhf8_core_avx2.S Noah Goldstein
2022-12-07 8:52 ` Noah Goldstein [this message]
2022-12-07 8:52 ` [PATCH v1 12/27] x86/fpu: Optimize svml_s_atanf4_core_sse4.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 13/27] x86/fpu: Optimize svml_s_atanf8_core_avx2.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 14/27] x86/fpu: Add common rodata file for svml_s_tanf_*_{avx512,avx2,sse4}.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 15/27] x86/fpu: Optimize svml_s_tanf16_core_avx512.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 16/27] x86/fpu: Optimize svml_s_tanf4_core_sse4.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 17/27] x86/fpu: Optimize svml_s_tanf8_core_avx2.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 18/27] x86/fpu: Optimize svml_s_log10f16_core_avx512.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 19/27] x86/fpu: Optimize svml_s_log10f4_core_sse4.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 20/27] x86/fpu: Optimize svml_s_log10f8_core_avx2.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 21/27] x86/fpu: Optimize svml_s_log2f16_core_avx512.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 22/27] x86/fpu: Optimize svml_s_log2f4_core_sse4.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 23/27] x86/fpu: Optimize svml_s_log2f8_core_avx2.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 24/27] x86/fpu: Optimize svml_s_logf16_core_avx512.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 25/27] x86/fpu: Optimize svml_s_logf4_core_sse4.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 26/27] x86/fpu: Optimize svml_s_logf8_core_avx2.S Noah Goldstein
2022-12-07 8:52 ` [PATCH v1 27/27] x86/fpu: Remove unused svml_s_logf_data.S file Noah Goldstein
2022-12-07 23:53 ` [PATCH v1 01/27] x86/fpu: Create helper file for common data macros H.J. Lu
2022-12-08 0:13 ` Noah Goldstein
2022-12-08 0:22 ` H.J. Lu
2022-12-08 0:46 ` Noah Goldstein
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20221207085236.1424424-11-goldstein.w.n@gmail.com \
--to=goldstein.w.n@gmail.com \
--cc=andrey.kolesov@intel.com \
--cc=carlos@systemhalted.org \
--cc=hjl.tools@gmail.com \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).